In [1]:
from final.cleaning import import_dataset, clean_whole_df, filter_df, clean_test_df, mongo_connect
from final.model_processing import partition, run_model
from final.dashboard import rate

In [2]:
# Connection parameters shared by mongo_connect and import_dataset below.
host, port, db_name = 'localhost', 27017, 'Movielens'

# Called as a bare statement: whatever it returns is not kept.
mongo_connect(host, port, db_name)

# `df` holds what import_dataset returns for the same connection parameters;
# `cleaned_df` is the result of clean_whole_df on it and is what the
# filtering cell consumes.
df = import_dataset(host, port, db_name)
cleaned_df = clean_whole_df(df)

In [3]:
%%time

# Options for filter_df, unpacked as keyword arguments in the call below.
# Their exact semantics are defined in final.cleaning.filter_df; judging by
# the names they look like bounds on a mean rating plus per-movie and
# per-user activity thresholds and on/off switches — TODO confirm.
filtering_opts = {
    "min_mean_rating": 1.5,
    "max_mean_rating": 4.5,
    "movies_threshold": 35,
    "movies_few_notes": True,
    "users_threshold": 45,
    "users_few_notes": True,
    "users_no_discriminating": True,
    "users_constant_dt": True,
}

# Each key of filtering_opts becomes a keyword argument of filter_df.
# Running this cell also prints a summary of the number of ratings per user
# (see the captured output under the cell).
filtered_df = filter_df(cleaned_df, **filtering_opts)

Nombre de ratings par utilisateur :
count    6040.000000
mean      163.412417
std       188.350206
min        16.000000
25%        44.000000
50%        95.000000
75%       204.000000
max      1999.000000
Name: count, dtype: float64


CPU times: user 811 ms, sys: 81.1 ms, total: 892 ms
Wall time: 879 ms


In [4]:
# Split settings handed to partition() as a single dict.
# NOTE(review): with "test_size" at 0.8 the shapes displayed under this cell
# put roughly 80% of the rows in test_df and 20% in train_df — confirm that
# this ratio is intended.
partition_opts = {
    "test_size": 0.8,
    "mini_size": 0.03,
}

# partition() is unpacked into four objects: the two main splits and two
# "mini" counterparts (presumably sized by "mini_size" — TODO confirm).
train_df, test_df, train_mini, test_mini = partition(filtered_df, partition_opts)

# Cell result: the shapes of the two main splits.
(train_df.shape, test_df.shape)

((42251, 4), (169007, 4))

In [5]:
# Settings forwarded to run_model as one dict. The model repr displayed under
# this cell reports n_components=10, matching the "n_components" entry.
# How "max_iter" and "normalize" are used is defined in
# final.model_processing.run_model; the nested min/max presumably describe
# the target rating range — TODO confirm.
opts = {
    "n_components": 10,
    "max_iter": 200,
    "normalize": {
        "should": True,
        "min": 1,
        "max": 5,
    },
}

# run_model returns a pair: the model object and the prediction table that
# the evaluation cell passes to rate().
model, predict_matrix = run_model(train_df, opts)

# Cell result: the model repr and the shape of the prediction table.
(model, predict_matrix.shape)



(NMF(n_components=10), (10714678, 3))

In [6]:
# Build the evaluation frame from the test split. train_df is passed alongside
# test_df — presumably so test rows can be aligned with what the training
# split contains; TODO confirm in final.cleaning.clean_test_df.
cleaned_test_df = clean_test_df(train_df, test_df)
# Bare last expression so the notebook renders the resulting frame.
cleaned_test_df

Unnamed: 0,user_id,movie_id,rating,timestamp
454968,3363,356,4,978225919
398483,3658,943,5,971900358
698363,1793,3477,3,974760408
762117,1447,293,4,981018755
105125,5406,3109,3,960250827
...,...,...,...,...
779887,1335,3529,2,974818265
855859,930,1213,4,975190023
858012,918,491,4,978241669
176691,4950,3155,2,983223331


In [7]:
%%time

options = {
    "mse": True,
    "top_10": True,
    "bottom_10": True,
    "ndcg" : True
}


rating_train = rate(predict_matrix, train_df, options)
rating_test = rate(predict_matrix, cleaned_test_df, options)
rating_train, rating_test

CPU times: user 1.87 s, sys: 300 ms, total: 2.17 s
Wall time: 1.58 s


([(2.715673643865076, 7.374883339983421),
  1.3320849892999083,
  1.6630999694283095,
  0.9697157980818341],
 [(2.701810534338826, 7.299780163464252),
  1.2966311731228197,
  1.6982283825959246,
  0.9736387246349023])