## Unsupervised Movie Recommendation Predict ##

###    Import libraries

In [13]:
# Import our regular old heroes 
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficiency. 
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import issparse
from scipy.sparse import csr_matrix
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Libraries used during sorting procedures.
import operator # <-- Convenient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

### import data

In [4]:
# Folder that holds the competition CSV files. Every read below is built from
# this single constant, so pointing the notebook at another location is a
# one-line change instead of eight.
DATA_DIR = 'C:/Users/hp/Documents/Unsupervised learning/predict/ea-movie-recommendation-predict-2023-2024'

genome_scores = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
genome_tags = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
imdb_data = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')
links = pd.read_csv(f'{DATA_DIR}/links.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

### checking the dataframes

In [12]:
# Preview the first five rows of genome_scores
genome_scores.head(n=5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [14]:
# Preview the first five rows of genome_tags
genome_tags.head(n=5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [15]:
# Preview the first five rows of imdb_data
imdb_data.head(n=5)

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


In [16]:
# Preview the first five rows of links
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [17]:
# Preview the first five rows of movies
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# Preview the first five rows of tags
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [110]:
# Preview the first five rows of the training ratings
train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [123]:
# Preview the first five rows of the test pairs
test.head(n=5)

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [5]:
# Report the (rows, columns) shape of every DataFrame loaded above.
loaded_frames = {
    'genome_scores': genome_scores,
    'genome_tags': genome_tags,
    'imdb_data': imdb_data,
    'links': links,
    'movies': movies,
    'tags': tags,
    'train': train,
    'test': test,
}
for frame_name, frame in loaded_frames.items():
    print(f'Shape {frame_name}: ', frame.shape)

Shape genome_scores:  (15584448, 3)
Shape genome_tags:  (1128, 2)
Shape imdb_data:  (27278, 6)
Shape links:  (62423, 3)
Shape movies:  (62423, 3)
Shape tags:  (1093360, 4)
Shape train:  (10000038, 4)
Shape test:  (5000019, 2)


In [7]:
# One-time setup: uncomment the line below to install the Surprise library used by this notebook.
#!pip install scikit-surprise[alldeps]

### Create and run SVD model

In [9]:
# Create a Surprise Reader whose scale comes from the ratings actually present
# in the training data. Surprise clips each estimate to this scale, so a
# hard-coded lower bound of 0 permits predictions below the smallest rating
# that occurs in train whenever that minimum is above 0.
rating_bounds = (float(train['rating'].min()), float(train['rating'].max()))
reader = Reader(rating_scale=rating_bounds)

# Load training data into a Surprise Dataset (column order: user, item, rating)
train_data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

# Load test data into a Surprise Dataset. The test file has no ratings, so a
# placeholder rating of 0 is added to give the frame the three columns passed above.
test_data = Dataset.load_from_df(test[['userId', 'movieId']].assign(rating=0), reader)

In [10]:
# Hold out 20% of the ratings for validation; the fixed random_state keeps the split repeatable
trainset, validationset = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Create SVD model. The seed is fixed (same value as the train/validation
# split) so that re-running the notebook reproduces the same fitted factors
# and therefore the same RMSE and predictions.
model = SVD(random_state=42)

# Train the model on the training set
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16cae3e3280>

### Test predictions

In [14]:
# Get predictions on the validation set
predictions = model.test(validationset)

# Evaluate the model. verbose=False stops accuracy.rmse from printing the
# score itself, so the value is reported once by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f'RMSE on validation set: {rmse}')

RMSE: 0.8331
RMSE on validation set: 0.8331194293123103


### Predict Rating on actual data

In [39]:
# Create a DataFrame for predictions. The explicit copy makes this frame
# independent of `test`, so adding a column below does not write to a slice.
test_predictions_df = test[['userId', 'movieId']].copy()

# Use the model to predict ratings. Iterating the two id columns in lockstep
# avoids building a Series object per row, which DataFrame.apply(axis=1) does.
test_predictions_df['predicted_rating'] = [
    model.predict(user_id, movie_id).est
    for user_id, movie_id in zip(test_predictions_df['userId'], test_predictions_df['movieId'])
]

In [40]:
# Preview the first five predicted ratings
test_predictions_df.head(n=5)

Unnamed: 0,userId,movieId,predicted_rating
0,1,2011,3.212043
1,1,4144,4.218858
2,1,5767,3.406486
3,1,6711,4.224111
4,1,7318,2.929928


### Alter predictions layout for submission

In [44]:
# Create a new, independent dataframe to modify the layout. Plain assignment
# would only bind a second name to the same object, so the column edits in the
# following cells would also change test_predictions_df.
test_predictions = test_predictions_df.copy()

In [45]:
# Build the Id key as "<userId>_<movieId>"
user_part = test_predictions['userId'].astype(str)
movie_part = test_predictions['movieId'].astype(str)
test_predictions['Id'] = user_part + '_' + movie_part

In [46]:
# Add the rating column: the model estimate rounded to one decimal place
test_predictions['rating'] = test_predictions['predicted_rating'].round(1)

In [47]:
# Preview the first five rows with the new Id and rating columns
test_predictions.head(n=5)

Unnamed: 0,userId,movieId,predicted_rating,Id,rating
0,1,2011,3.212043,1_2011,3.2
1,1,4144,4.218858,1_4144,4.2
2,1,5767,3.406486,1_5767,3.4
3,1,6711,4.224111,1_6711,4.2
4,1,7318,2.929928,1_7318,2.9


In [48]:
# Keep only the columns needed downstream. Selecting them (rather than
# dropping the others in place) lets this cell be re-run without a KeyError
# and leaves any other reference to the wider frame untouched.
submission_columns = ['Id', 'rating']
test_predictions = test_predictions[submission_columns].reset_index(drop=True)

In [49]:
# Preview the first five rows of the trimmed frame
test_predictions.head(n=5)

Unnamed: 0,Id,rating
0,1_2011,3.2
1,1_4144,4.2
2,1_5767,3.4
3,1_6711,4.2
4,1_7318,2.9


### Submission CSV

In [50]:
# Write test_predictions to a CSV file, leaving out the DataFrame index
submission_file = 'Sub_rating.csv'
test_predictions.to_csv(submission_file, index=False)