In [1]:
import os
import pickle
from copy import deepcopy

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

In [2]:
# Read the three pre-split CSV files; the first column of each file is used
# as the DataFrame index.
train, test, validate = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/csv/train', 'data/csv/test', 'data/csv/validate')
)

# The grid search runs on the validation rows followed by the training rows.
grid_set = pd.concat(objs=[validate, train])

In [3]:
# Ratings handed to surprise are declared to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))

# surprise expects exactly three columns in (user, item, rating) order; here
# the search id is passed as the user and the property id as the item.
data = Dataset.load_from_df(
    df=grid_set.loc[:, ['srch_id', 'prop_id', 'target']],
    reader=reader,
)

In [4]:
%time
param_grid = {
    "n_epochs": [5, 10, 20],
    "n_factors": [50, 100, 150],
    "lr_all": [0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3,)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


In [5]:
%time
gs.fit(data)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


In [22]:
# Pull out the estimator that scored best on each of the two measures.
best_rmse, best_mae = (
    gs.best_estimator[measure] for measure in ('rmse', 'mae')
)

In [28]:
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# Only the two id columns are needed to request predictions.
with open('data/submission_df_preprocessed.pickle', 'rb') as file_contents:
    test_sub = pickle.load(file_contents).loc[:, ['srch_id', 'prop_id']]

In [70]:
# Work on a copy so the estimator stored inside the grid-search object is
# left untouched.
deep_rmse = deepcopy(gs.best_estimator['rmse'])

# Train the copy on the full trainset built from `data`.
full_trainset = data.build_full_trainset()
deep_rmse.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17d51e310>

In [140]:
predictions_best_rmse = []
%time
for i in test_sub.index:
    row = test_sub.iloc[i]
    uid = row['srch_id']
    iid = row['prop_id']   
    est = deep_rmse.predict(uid, iid).est
    predictions_best_rmse.append([uid, iid, est])
    if i % 1_000_000 == 0:
        print(i)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs
0
1000000
2000000
3000000
4000000


In [142]:
# Dump the collected predictions to disk as CSV.
# The rows come from `predictions_best_rmse`; the previous name `ppp` is not
# defined anywhere in this notebook and fails on a fresh kernel.
with open('SVD_predictions.csv', 'w') as out_file:
    out_file.write("SearchId,PropertyId,Estimated\n")
    for search_id, property_id, estimate in predictions_best_rmse:
        out_file.write(f"{search_id}, {property_id}, {estimate}\n")

In [143]:
# Load the predictions back from the CSV file into a DataFrame.
prediction_df = pd.read_csv(filepath_or_buffer='SVD_predictions.csv')

In [147]:
# Order rows by ascending SearchId and, within each search, by descending
# Estimated value.
prediction_df = prediction_df.sort_values(
    by=['SearchId', 'Estimated'],
    ascending=[True, False],
)

In [166]:
# Keep only the two id columns for the submission file.
# `drop=True` discards the old (pre-sort) index; without it reset_index
# inserts that index as an extra "index" column, which then ends up in the
# CSV. Chaining also avoids mutating a column slice in place.
prediction_df = (
    prediction_df[['SearchId', 'PropertyId']]
    .reset_index(drop=True)
)
prediction_df.to_csv('submission_13.csv', index=False)

In [170]:
# Persist the grid-search object so the search does not have to be re-run.
# Make sure the target directory exists first: the cell that creates it
# appears later in the notebook, so a top-to-bottom run would fail here.
os.makedirs('pickled_models', exist_ok=True)
with open('pickled_models/SVD_gridsearch_cv.pickle', 'wb') as output_file:
    # pickle.dump needs the destination file object as its second argument;
    # it was missing before, which raises TypeError.
    pickle.dump(gs, output_file)

In [176]:
# Bare expression: shows the grid-search object's repr as the cell output.
gs

<surprise.model_selection.search.GridSearchCV at 0x11b044640>

In [175]:
# Create the output directory for pickled models. `exist_ok=True` keeps the
# cell re-runnable (a plain `mkdir` errors once the directory exists) and
# avoids depending on a shell command.
os.makedirs('pickled_models', exist_ok=True)