In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

#### Loading Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
train.shape

(10000038, 4)

In [4]:
# Data cleaning
train.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)  
train['userId'] = train['userId'].astype(int)  
train['movieId'] = train['movieId'].astype(int)  
train['rating'] = train['rating'].astype(float) 

In [5]:
# Check for duplicate entries
train.drop_duplicates(subset=['userId', 'movieId'], inplace=True)

In [14]:
train = train[(train['rating'] >= 0.5) & (train['rating'] <= 5)]
train.shape


(10000038, 4)

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train['rating_normalized'] = scaler.fit_transform(train[['rating']])

In [16]:
min_user_ratings = 5
min_movie_ratings = 5

filtered_users = train['userId'].value_counts()[train['userId'].value_counts() >= min_user_ratings].index
filtered_movies = train['movieId'].value_counts()[train['movieId'].value_counts() >= min_movie_ratings].index

train = train[train['userId'].isin(filtered_users) & train['movieId'].isin(filtered_movies)]


In [18]:
# Prepare the training data for the Surprise library
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

In [19]:
trainset, validationset = train_test_split(data, test_size=0.15)

#### Side Note

In [22]:
from surprise.model_selection import GridSearchCV

In [23]:
param_grid = {
    'n_factors': [20, 50, 100, 200],  
    'n_epochs': [20, 30, 40, 50],     
    'lr_all': [0.002, 0.005, 0.01, 0.02],  
    'reg_all': [0.02, 0.05, 0.1, 0.2]   
}

In [24]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

In [26]:
gs.fit(data)

In [None]:
# Best RMSE score
print("Best RMSE score achieved: ", gs.best_score['rmse'])

# Best parameters 
print("Best parameters: ", gs.best_params['rmse'])

# best SVD model trained on the whole dataset
best_algo = gs.best_estimator['rmse']

Best RMSE score achieved:  1.0731908336346934
Best parameters:  {'n_factors': 200, 'n_epochs': 50, 'lr_all': 0.002, 'reg_all': 0.02}


#### Model Building

In [20]:
# Train the SVD algorithm on the trainset
algo = SVD()
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24787be48d0>

In [21]:
# Predict ratings for the validation set
predictions = algo.test(validationset)

# Calculate and print RMSE
rmse = accuracy.rmse(predictions)
print(f'Validation RMSE: {rmse}')


RMSE: 0.8296
Validation RMSE: 0.8296136971539708


In [80]:
# Train the SVD algorithm on the full trainset
final_trainset = data.build_full_trainset()
algo.fit(final_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x268f25f02d0>

In [81]:
# Predict ratings for the test dataset
test['predicted_rating'] = test.apply(lambda x: algo.predict(x['userId'], x['movieId']).est, axis=1)

In [87]:
def unnormalize(pred, min_val= 0.5, max_val=5):
    return pred * (max_val - min_val) + min_val

In [90]:
test['predicted_rating'] = test['predicted_rating'].apply(unnormalize)

In [91]:
test

Unnamed: 0,userId,movieId,predicted_rating
0,1,2011,2.937115
1,1,4144,4.071524
2,1,5767,3.222649
3,1,6711,3.322674
4,1,7318,2.750000
...,...,...,...
5000014,162541,4079,3.124203
5000015,162541,4467,3.515611
5000016,162541,4980,2.907049
5000017,162541,5689,2.779589


#### Creating Submission files

In [None]:
# Prepare the final submission
test['userid_movieid'] = test['userId'].astype(str) + '_' + test['movieId'].astype(str)
final_submission = test[['userid_movieid', 'predicted_rating']]

In [None]:
final_submission.head()

In [None]:
final_submission.to_csv('submission.csv', index=False)

In [None]:
e.head()

#### Submission 2

In [92]:
test.head(3)

Unnamed: 0,userId,movieId,predicted_rating
0,1,2011,2.937115
1,1,4144,4.071524
2,1,5767,3.222649


In [93]:
submission = []

for index, row in test.iterrows():
    user = row['userId']
    movie_id = row['movieId']
    rating = row['predicted_rating']
    submission.append({'Id': f"{user}_{movie_id}", 'rating': rating})

In [94]:
# Convert to DataFrame
submission_df = pd.DataFrame(submission)

# Save to CSV
submission_df.to_csv('Ssubmission2.csv', index=False)

# Example output:
print(submission_df.head(11))

            Id    rating
0   1.0_2011.0  2.937115
1   1.0_4144.0  4.071524
2   1.0_5767.0  3.222649
3   1.0_6711.0  3.322674
4   1.0_7318.0  2.750000
5   1.0_8405.0  3.787938
6   1.0_8786.0  3.623284
7    2.0_150.0  3.461389
8    2.0_356.0  3.798988
9    2.0_497.0  3.729651
10   2.0_588.0  3.136394


In [100]:
d = pd.read_csv('Ssubmission2.csv')
d

Unnamed: 0,Id,rating
0,1_2011,2.937115
1,1_4144,4.071524
2,1_5767,3.222649
3,1_6711,3.322674
4,1_7318,2.750000
...,...,...
5000014,162541_4079,3.124203
5000015,162541_4467,3.515611
5000016,162541_4980,2.907049
5000017,162541_5689,2.779589


In [None]:
f = d = pd.read_csv('sample_submission.csv')
d.info()

In [96]:
def convert_string(value):
    return value.replace('.0', '')

In [97]:
d['Id'] = d['Id'].apply(convert_string)

In [99]:
d.to_csv('Ssubmission2.csv', index=False)