In [1]:
print (">> Environment Setup starting...")
# `datetime` must name the *class* so that `datetime.now()` works (the timing
# code further down relies on it). The module itself is exposed as `dt`.
# Previously `import datetime as datetime` ran after the class import and
# rebound the name to the module, making `datetime.now()` an AttributeError.
import datetime as dt
from datetime import datetime
import numpy as np                 # linear algebra
import pandas as pd                # Data processing, CSV file I/O (e.g. pd.read_csv)

# Plotly builds the charts in the exploration cells below (go.Bar and
# go.Histogram traces rendered through fig.show()).
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot 
# NOTE(review): presumably connected=True makes the notebook load plotly.js from
# a CDN instead of embedding it - confirm against the plotly.offline docs.
init_notebook_mode (connected=True)

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

# Suppress every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Print the full path of each file found under the Kaggle input directory.
import os
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

>> Environment Setup starting...


/kaggle/input/edsa-movie-recommender-challenge-2022/sample_submission.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/movies.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/imdb_data.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/genome_tags.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/genome_scores.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/train.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/test.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/tags.csv
/kaggle/input/edsa-movie-recommender-challenge-2022/links.csv


In [2]:
# Load the ratings used for training. The raw file carries the columns
# userId, movieId, rating and timestamp.
movie_ratings = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/train.csv')
# The other competition files are not read by this notebook; their loaders are
# kept here, commented out, for reference.
# movies_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/movies.csv')
# imdb_data_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/imdb_data.csv')
# genome_tags_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/genome_tags.csv')
# genome_scores_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/genome_scores.csv')
# NOTE(review): test_df does not appear to be used again in the cells shown; the
# test-prediction cell re-reads test.csv into its own `testing_df`.
test_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/test.csv')
# tags_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/tags.csv')
# links_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/links.csv')

# Preview the first rows of the ratings frame.
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [3]:
# Report whether any cell of the ratings frame is missing
print (f'Presence of any Nulls: {movie_ratings.isna().to_numpy().any()}')

# Count rows that repeat an earlier (userId, movieId) pair
print (f'There are  {movie_ratings.duplicated(subset=["userId", "movieId"]).sum()} user-movie duplicates in the dataset')

Presence of any Nulls: False
There are  0 user-movie duplicates in the dataset


In [4]:
# Keep only the columns the recommender needs and normalise their names.
# Written so the cell can be re-run safely: the previous in-place drop raised
# KeyError on a second run because 'timestamp' was already gone, and the
# positional column assignment silently depended on the column order.
movie_ratings = (
    movie_ratings
    .drop(columns=['timestamp'], errors='ignore')
    .rename(columns={'userId': 'userID', 'movieId': 'movieID'})
)

In [5]:
# Report the frame dimensions, then print every 2,000,000th row as a spot check
print(f'Dataset shape: {movie_ratings.shape}')
print('-Dataset examples-')
print(movie_ratings.iloc[::2_000_000])

Dataset shape: (10000038, 3)
-Dataset examples-
          userID  movieID  rating
0           5163    57669     4.0
2000000    76453      597     3.5
4000000    79328     3052     4.0
6000000    44432   111364     5.0
8000000    57569      920     5.0
10000000   89307    96588     1.0


In [6]:
# How often each rating value occurs, ordered from the highest rating down
data = movie_ratings['rating'].value_counts().sort_index(ascending=False)

# One bar per rating value, labelled with its share of all ratings
trace = go.Bar(
    x=data.index,
    y=data.values,
    text=[f'{share:.1f} %' for share in data.values / movie_ratings.shape[0] * 100],
    textposition='auto',
    textfont={'color': '#000000'},
)

# Chart title and axis labels
layout = {
    'title': f'Distribution of the {movie_ratings.shape[0]} ratings',
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Render the figure, then summarise the rating range and mean in text
fig = go.Figure(data=[trace], layout=layout)
fig.show()
print (f'Movie ratings range from {movie_ratings["rating"].min()} to {movie_ratings["rating"].max()} with an average rating of {np.round (np.mean(movie_ratings["rating"]),1)}')

Movie ratings range from 0.5 to 5.0 with an average rating of 3.5


In [7]:
# Number of ratings per user. Counts above 50 are clipped to 50 so the
# histogram's x-axis stays within the 0-50 range configured below.
data = movie_ratings.groupby('userID')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
# The chart covers every user (not a "top 50" subset), so the title states what
# is actually plotted: how many ratings each user gave, capped at 50.
layout = go.Layout(title = 'Distribution of Number of Ratings Per User (Clipped at 50)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [8]:
# Number of ratings per movie. Counts above 50 are clipped to 50 so the
# histogram's x-axis stays within the 0-50 range configured below.
data = movie_ratings.groupby('movieID')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
# The chart covers every movie (not a "top 50" subset), so the title states what
# is actually plotted: how many ratings each movie received, capped at 50.
layout = go.Layout(title = 'Distribution of Number of Ratings Per Movie (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per Movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [9]:
# PARAMETERS
training_ratio     = 0.95   # ratio applied at each of the two steps below
training_size      = np.int64(training_ratio * movie_ratings.shape[0]) # 95% of the available rows
seed             = 4100   # sampling seed (NOTE(review): not used in this cell; the split below is positional, not random)

print (">> Data allocation started...")
# Split the dataset into training and validation datasets.
# The ratio is applied a second time here, so the cut falls at ~90.25% of the
# rows: train_df is the first ~90.25% and valid_df everything after it (~9.75%).
split_value = np.int64(training_ratio * training_size)
train_df    = movie_ratings[:split_value]
# Copy the validation slice: a later cell adds a "prediction" column to it, and
# writing into a slice of movie_ratings is a chained assignment that pandas
# warns about and does not guarantee to behave consistently.
valid_df    = movie_ratings[split_value:].copy()

print (">> Data allocation completed!")
# Same summary line for each frame: row count plus distinct users and movies.
for label, frame in (("Original", movie_ratings), ("Training", train_df), ("Validation", valid_df)):
    print (f'The {label} dataset has {frame.shape[0]} ratings with {frame["userID"].nunique()} unique users and {frame["movieID"].nunique()} unique movies')

>> Data allocation started...
>> Data allocation completed!
The Original dataset has 10000038 ratings with 162541 unique users and 48213 unique movies
The Training dataset has 9025034 ratings with 162541 unique users and 46915 unique movies
The Validation dataset has 975004 ratings with 141473 unique users and 23183 unique movies


In [10]:
# ADJUST THE PARAMETERS TO ACHIEVE THE 180 000 - 220 000 target size
# Both values are read as notebook-level globals by reduced_dataframe(), which
# keeps a movie / user only when it has strictly MORE ratings than the threshold.
min_movie_ratings = 840   # a movie needs more than this many ratings to be kept
min_user_ratings  = 700   # a user needs more than this many ratings to be kept

In [11]:
def reduced_dataframe (df, movie_threshold=None, user_threshold=None):
    """Keep only ratings of frequently rated movies made by frequent raters.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings frame with at least the columns 'movieID' and 'userID'.
    movie_threshold : int, optional
        A movie is kept when it has strictly more ratings in ``df`` than this
        value. Defaults to the notebook-level ``min_movie_ratings``.
    user_threshold : int, optional
        A user is kept when they have strictly more ratings in ``df`` than this
        value. Defaults to the notebook-level ``min_user_ratings``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose movie AND user both pass their threshold
        (original index labels are preserved).
    """
    # Fall back to the notebook-level settings so existing calls behave as before.
    if movie_threshold is None:
        movie_threshold = min_movie_ratings
    if user_threshold is None:
        user_threshold = min_user_ratings

    print (">> Dimensionality reduction starting...")
    movie_counts = df['movieID'].value_counts()
    filter_items = movie_counts[movie_counts > movie_threshold].index

    user_counts = df['userID'].value_counts()
    filter_users = user_counts[user_counts > user_threshold].index

    # Both masks are built from `df` itself. The movie mask used to come from the
    # global `movie_ratings`, which only worked while `df` happened to be a slice
    # of that frame (pandas re-aligned the mismatched mask by index).
    keep_rows = df['movieID'].isin(filter_items) & df['userID'].isin(filter_users)
    df_new = df[keep_rows]
    print (">> Dimensionality reduction completed...")
    print(f'The original data frame size:{df.shape[0]} records')
    print(f'The new data frame size     :{df_new.shape[0]} records')
    return df_new


In [12]:
# Shrink the training frame to frequently rated movies and frequent raters,
# using the min_movie_ratings / min_user_ratings thresholds defined earlier.
df_new = reduced_dataframe (train_df)

>> Dimensionality reduction starting...
>> Dimensionality reduction completed...
The original data frame size:9025034 records
The new data frame size     :204239 records


In [13]:
# Rating scale handed to Surprise: 0.5 up to 5, matching the min/max printed earlier.
reader = Reader(rating_scale=(0.5, 5))
# Wrap the reduced frame as a Surprise Dataset; the columns are passed in the
# order user, movie, rating.
data = Dataset.load_from_df(df_new[['userID', 'movieID', 'rating']], reader)
# Display the reduced frame.
df_new

Unnamed: 0,userID,movieID,rating
21,95075,41571,3.0
62,75830,2713,4.0
116,24692,588,4.5
140,148291,8810,0.5
242,98873,3755,3.0
...,...,...,...
9024764,124482,2628,3.5
9024909,80587,1025,4.0
9024985,54232,45517,3.5
9025025,81216,72226,3.5


In [14]:
# One summary row (a pandas Series) per algorithm is collected here.
benchmark = []

# Iterate over the chosen algorithms
algorithms = [KNNBaseline(), 
              KNNBasic(), 
              KNNWithMeans(), 
              KNNWithZScore(),
              SVD()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    
    # Perform 3-fold cross validation, collecting RMSE and MAE
    results = cross_validate(algorithm, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
    # Average every reported column over the folds, then tag the row with the
    # algorithm's class name. pd.concat replaces Series.append, which was
    # removed in pandas 2.0; type(...).__name__ replaces parsing the object's repr.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f73b5723d50>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x7f73b57239d0>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f73b5723510>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7f73c80d0150>, <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f73c80d06d0>] 



Starting:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f73b5723d50>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f73b5723d50> 


Starting:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x7f73b57239d0>
Computing the msd similarity

In [15]:
# Collect the per-algorithm rows into one table, best (lowest) test RMSE first
Traing_results = (
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)
Traing_results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,0.816078,0.622372,1.176214,15.297447
SVD,0.818151,0.623719,15.60522,0.847092
KNNWithMeans,0.820127,0.626428,0.365285,13.09177
KNNWithZScore,0.822943,0.621021,0.527297,13.868653
KNNBasic,0.887991,0.6775,0.268437,12.119195


In [16]:
# KNNBaseline had the lowest test_rmse in the benchmark table above; it is
# instantiated here with its default settings.
chosen_algorithm = KNNBaseline()

In [17]:
# PROVIDE MODEL ALTERNATIVE PARAMETER VALUES FOR COMPARISON
# ... carry out a grid search

def get_best_parms(model, param_grid):
    """Grid-search ``param_grid`` for ``model`` and return the best-RMSE parameters.

    Parameters
    ----------
    model : class or instance
        The Surprise algorithm to tune. Either the algorithm class or an
        instance of it may be passed; an instance is reduced to its class.
    param_grid : dict
        Maps parameter names to the list of values to try.

    Returns
    -------
    dict
        The parameter combination with the best RMSE.

    Notes
    -----
    The search runs 5-fold cross validation on the notebook-level ``data``
    object and prints the best RMSE, the best MAE and the winning parameters.
    """
    # GridSearchCV builds a fresh algorithm for every parameter combination, so
    # it must receive the class. The previous code ignored `model` and passed
    # the global instance `chosen_algorithm` instead.
    algo_class = model if isinstance(model, type) else type(model)

    print (">> Parameter search starting...")
    gs = GridSearchCV(algo_class, param_grid, measures=["rmse", "mae"], refit=True, cv=5)
    gs.fit(data)
    best_parameters = gs.best_params["rmse"]

    print (">> Parameter search completed!")
    print("BEST RMSE  : \t", gs.best_score["rmse"])
    print("BEST MAE   : \t", gs.best_score["mae"])
    print("BEST params: \t", best_parameters)
    return best_parameters

In [18]:
# Timing of this cell is currently disabled (see the commented-out lines).
# start = datetime.now()
print (">> Setting up model Parameters...")
# Same rating scale as used for the benchmark dataset.
reader = Reader(rating_scale=(0.5, 5))

print(">> Loading data...")
# Re-create the Surprise dataset from the reduced frame, then turn it into the
# Trainset object that is passed to fit() below.
data = Dataset.load_from_df(df_new[['userID', 'movieID', 'rating']], reader)
trainset = data.build_full_trainset()

print(">> Training the model...")
# Fits the notebook-level `chosen_algorithm` in place.
chosen_algorithm.fit(trainset)
# end = datetime.now()
print(">> Model Training completed...")

# print(">>The Entire Training Process Took:", (end-start).seconds, "seconds" )

>> Setting up model Parameters...
>> Loading data...
>> Training the model...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
>> Model Training completed...


In [19]:
# Display the trainset object (the output is only its repr).
trainset

<surprise.trainset.Trainset at 0x7f73b5b01fd0>

In [20]:
## SAVE TRAINED MODEL USING THE CHOSEN PARAMETERS
from surprise import dump
import os
# Relative path: the file is written to the current working directory.
model_filename = "./KNNBaselineModel.pickle"
print (">> Starting dump")
# Dump the fitted algorithm to disk. Nothing is reloaded in this cell; loading
# is done separately by load_model().
# expanduser() only expands a leading "~", so it leaves this path unchanged.
file_name = os.path.expanduser(model_filename)
dump.dump(file_name, algo=chosen_algorithm)
print (">> Dump done - Model saved!")
print (">> The model name is: ", model_filename)

>> Starting dump
>> Dump done - Model saved!
>> The model name is:  ./KNNBaselineModel.pickle


In [21]:
## LOAD SAVED MODEL
def load_model(model_filename):
    """Read a model file written by ``surprise.dump`` and return the algorithm.

    ``model_filename`` is the path to the dump file; a leading ``~`` is
    expanded. Progress messages are printed before and after loading.
    """
    print (">> Loading the model...")
    from surprise import dump
    import os
    # dump.load yields a pair; only its second element (the algorithm) is kept.
    loaded_pair = dump.load(os.path.expanduser(model_filename))
    print (">> Model has been loaded!")
    return loaded_pair[1]

In [22]:
# MAKE PREDICTIONS FOR THE REFERENCE USER and MOVIE
# NOTE(review): `pp` is not used in any of the cells shown.
from pprint import pprint as pp
# Same path the model was dumped to earlier.
model_filename = "./KNNBaselineModel.pickle"
# movie_rating() below reads this notebook-level `loaded_model`.
loaded_model = load_model(model_filename)

def movie_rating(user, item, model=None):
    """Predict the rating ``user`` would give ``item``.

    Parameters
    ----------
    user, item :
        Raw user id and raw movie id. They are passed to ``predict`` unchanged
        (the previous ``str(...)`` conversions were dead code: their results
        were overwritten before being used).
    model : optional
        Object exposing ``predict(uid, iid, verbose=...)``. Defaults to the
        notebook-level ``loaded_model``.

    Returns
    -------
    dict
        Keys ``user`` and ``item`` (the inputs), ``rating`` (the estimate),
        ``details``, ``uid``, ``iid`` and ``true`` (the prediction's ``r_ui``).
    """
    if model is None:
        model = loaded_model

    prediction = model.predict(user, item, verbose=False)
    return {
        'user': user,
        'item': item,
        'rating': prediction.est,
        'details': prediction.details,
        'uid': prediction.uid,
        'iid': prediction.iid,
        'true': prediction.r_ui,
    }

>> Loading the model...
>> Model has been loaded!


In [23]:
# NOTE(review): movie_rating() predicts with the notebook-level `loaded_model`,
# not with `model`; the load is kept so that `model` stays bound as before.
model = load_model(model_filename)

# Work on an independent frame so the new column is not written into a slice
# of movie_ratings.
valid_df = valid_df.copy()

# Predict every (user, movie) pair and round to the nearest half star.
# The values are collected first and assigned as one column, replacing the
# per-row `.at[index, [...]]` writes inside iterrows().
valid_df["prediction"] = [
    round(movie_rating(ref_user, ref_movie)['rating'] * 2) / 2
    for ref_user, ref_movie in zip(valid_df["userID"], valid_df["movieID"])
]

# Print output sample
valid_df.loc[::20000,:]

>> Loading the model...
>> Model has been loaded!


Unnamed: 0,userID,movieID,rating,prediction
9025034,25714,5349,4.0,3.5
9045034,116017,5025,4.0,3.0
9065034,47717,3882,5.0,3.0
9085034,98904,2701,3.0,2.0
9105034,45100,175661,4.0,3.5
9125034,54350,2747,1.0,3.0
9145034,135544,3274,4.0,3.0
9165034,61554,610,3.0,3.0
9185034,72315,4309,2.0,3.5
9205034,95514,8949,2.5,3.5


In [24]:
model = load_model(model_filename)
#start = datetime.now()
print(">> Test Predictions started...")

# Open the TEST dataset
testing_df = pd.read_csv('../input/edsa-movie-recommender-challenge-2022/test.csv')
testing_df.columns = ['userID', 'movieID']
print(">> Test file loaded...")

# Predict a rating for every (user, movie) pair of the test file.
# Iterating the two id columns directly keeps the ids as integers; iterrows()
# handed them over as floats once the float "rating" column existed. The
# estimate is read by name (.est) instead of by position (x[3]).
predictions = [
    model.predict(user_id, movie_id).est
    for user_id, movie_id in zip(testing_df['userID'], testing_df['movieID'])
]

# Store the estimates in the frame too. This column used to be created as
# zeros and was never filled in.
testing_df["rating"] = predictions

#end = datetime.now()
print(">> Test Predictions completed...")
# print(">> The Entire Prediction Process Took:", (end-start).seconds, "seconds" )

>> Loading the model...
>> Model has been loaded!
>> Test Predictions started...
>> Test file loaded...
>> Test Predictions completed...


In [25]:
# Submission ids have the form "<userID>_<movieID>"
testing_df['Id'] = testing_df['userID'].astype(str) + '_' + testing_df['movieID'].astype(str)

# Pair each id with its predicted rating and write the submission file
results = pd.DataFrame({"Id": testing_df['Id'], "rating": predictions})
results.to_csv("submission.csv", index=False)

In [26]:
# Display the submission frame (Id, rating) that was written to submission.csv.
results

Unnamed: 0,Id,rating
0,5_788,2.537538
1,68_7438,3.696888
2,336_40412,3.286329
3,803_3822,3.286329
4,547_903,3.918748
...,...,...
14305,294_30707,3.805837
14306,803_780,3.135271
14307,519_912,4.120178
14308,628_6764,3.286329
