In [1]:
pip install numpy pandas scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import KFold,cross_validate, train_test_split

In [3]:
# Load the training ratings from a comma-separated file that has no header
# row; the three columns are named user, item and rating.
train_data = pd.read_csv(
    'training.txt',
    sep=',',
    header=None,
    names=['user', 'item', 'rating'],
)

In [4]:
# Store the rating column as float16.
# NOTE(review): presumably a memory optimisation for the large frame —
# confirm half precision represents every rating value exactly.
train_data = train_data.astype({'rating': 'float16'})

In [5]:
# Preview the first rows to sanity-check the parsed columns.
train_data.head()

Unnamed: 0,user,item,rating
0,1,1,4.0
1,2,2,2.5
2,4,4,3.0
3,5,5,5.0
4,6,6,3.5


In [6]:
# (rows, columns) of the training frame.
train_data.shape

(19996916, 3)

## Train the model

In [7]:
# Wrap the ratings frame in a Surprise Dataset. The Reader declares the rating
# scale as 0 to 5.
# NOTE(review): confirm the true minimum rating in the data is 0 and not 0.5.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df=train_data, reader=reader)

In [8]:
# Using SVD (Surprise's matrix-factorisation recommender) with its default
# hyperparameters. A fixed random_state seeds the random initialisation of the
# latent factors so repeated fits start from the same point.
model = SVD(random_state=42)

In [9]:
# Perform 10-fold cross-validation and report the RMSE of each fold.
# The folds come from a seeded KFold so the shuffle/split of the ratings is the
# same on every run; a bare `cv=10` would reshuffle differently each time.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=42)
cross_val_results = cross_validate(model, data, measures=['RMSE'], cv=cv_folds, verbose=True)
# 'test_rmse' holds one score per fold; average them into a single figure.
print("Mean RMSE: ", sum(cross_val_results['test_rmse']) / len(cross_val_results['test_rmse']))

Evaluating RMSE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7828  0.7835  0.7845  0.7841  0.7836  0.7840  0.7832  0.7841  0.7839  0.7835  0.7837  0.0005  
Fit time          162.57  165.27  166.79  166.44  187.66  188.58  184.89  171.47  179.41  189.41  176.25  10.28   
Test time         23.81   17.97   22.04   20.56   26.18   23.58   24.02   19.97   25.29   21.80   22.52   2.41    
Mean RMSE:  0.7837187074824679


In [10]:
# Train on the full dataset: build a trainset from every rating in `data`
# (nothing held out) and refit the model on it before scoring the test file.
trainset = data.build_full_trainset()
# As the cell's last expression, the value returned by fit() is displayed.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1303818c710>

In [11]:
# Read the (user, item) pairs to be scored from a comma-separated file, using
# the same column names as the training data, then preview the first rows.
test_data = pd.read_csv(
    'testing.txt',
    sep=',',
    names=['user', 'item', 'rating'],
)
test_data.head()

Unnamed: 0,user,item,rating
0,3,3,?
1,9,9,?
2,10,10,?
3,11,11,?
4,19,19,?


In [12]:
# (rows, columns) of the test frame.
test_data.shape

(5003179, 3)

In [13]:
# Predict a rating for every (user, item) pair in the test set.
# The two id columns are iterated directly instead of via DataFrame.iterrows(),
# which builds a Series per row (very slow at millions of rows) and converts
# each row to a single common dtype rather than keeping the columns' own dtypes.
predictions = [
    model.predict(user_id, item_id).est  # .est is the estimated rating
    for user_id, item_id in zip(test_data['user'], test_data['item'])
]

In [20]:
# Overwrite the rating column with the model's estimates, rounded to four
# decimal places.
test_data['rating'] = pd.Series(predictions, index=test_data.index).round(4)

In [21]:
# Display the test frame, which now carries the predicted ratings.
test_data

Unnamed: 0,user,item,rating
0,3,3,3.7034
1,9,9,4.3248
2,10,10,2.6940
3,11,11,3.3719
4,19,19,4.0275
...,...,...,...
5003174,8474,2950,3.1654
5003175,7921,5199,2.3255
5003176,10010,808,2.5379
5003177,5986,1887,2.8306


In [22]:
# Write the predictions to disk as comma-separated user,item,rating rows,
# omitting both the header line and the DataFrame index.
test_data.to_csv(
    path_or_buf='prediction.txt',
    sep=',',
    header=False,
    index=False,
)

In [None]:
# Note: the SVD training part of this notebook takes around 45 minutes to run.