In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import os

In [4]:
conda install -c conda-forge scikit-surprise

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\KIIT\anaconda3

  added / updated specs:
    - scikit-surprise


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.7.4   |       h56e8100_0         151 KB  conda-forge
    certifi-2024.7.4           |     pyhd8ed1ab_0         156 KB  conda-forge
    openssl-3.3.1              |       h2466b09_2         8.0 MB  conda-forge
    python_abi-3.11            |          2_cp311           5 KB  conda-forge
    scikit-surprise-1.1.4      |  py311h814a670_1         543 KB  conda-forge
    ucrt-10.0.22621.0          |       h57928b3_0         1.2 MB  conda-forge
    vc14_runtime-14.40.33810   |      ha82c5b3_20         734 KB  conda-forge
    vs2015_runtime-14.40.33810 |      h3



  current version: 23.7.4
  latest version: 24.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.5.0




In [8]:
# Loading Movie Title Dataset
data = pd.read_csv('movie_titles.csv',encoding='ISO-8859-1', header=None, usecols=[0,1,2], names=['Movie_Id','Year','Name' ])
df_title = pd.DataFrame(data)
df_title.set_index('Movie_Id', inplace = True)
df_title.head(5)

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [11]:
df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Movie_Id  17770 non-null  int64  
 1   Year      17763 non-null  float64
 2   Name      17770 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 416.6+ KB


In [12]:
df.isna().sum()

Movie_Id    0
Year        7
Name        0
dtype: int64

In [17]:
# Loading Rating Dataset
data = pd.read_csv('combined_data_1.txt',header = None, names = ['Cust_Id', 'Rating'], usecols = [0,1]) 
df = pd.DataFrame(data)

In [19]:
df.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [21]:
# Creating Dataframe with all movie id rows
#cleaning data
df_nan=pd.DataFrame(pd.isnull(df.Rating))
df_nan = df_nan[df_nan['Rating']==True].reset_index()
df_nan

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True
...,...,...
1897,9644878,True
1898,9645060,True
1899,9645137,True
1900,9645262,True


In [23]:
# Assigning the moveid to relevant index
df['Movie_Id'] = 0
for i in range(1,len(df_nan)):    
    prev_index = df_nan['index'].iloc[i-1]
    curr_index = df_nan['index'].iloc[i]
    df['Movie_Id'].iloc[prev_index:curr_index] = i   
    #print(i   ,prev_index,curr_index)
df.info

<bound method DataFrame.info of          Cust_Id  Rating  Movie_Id
0             1:     NaN         1
1        1488844     3.0         1
2         822109     5.0         1
3         885013     4.0         1
4          30878     4.0         1
...          ...     ...       ...
9682426  1911337     1.0         0
9682427  1186037     1.0         0
9682428   239928     3.0         0
9682429   428558     4.0         0
9682430  1517327     5.0         0

[9682431 rows x 3 columns]>

In [25]:
df.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1


In [27]:
# Removing the inactive records by elminating records < 80 percentile
f = ['count','mean']
for col in ['Cust_Id','Movie_Id']:
    df_cnt = df.groupby(col)['Rating'].agg(f)
    df_cnt.index = df_cnt.index.map(int)
    threshold_val = round(df_cnt['count'].quantile(.8))
    if col == 'Cust_Id':
        df_cust = df_cnt
        cust_drop_id = df_cust [ df_cust['count'] < threshold_val ].index         
    else:
        df_movie = df_cnt
        movie_drop_id = df_movie [ df_movie['count'] < threshold_val ].index      
    print(f'The threshold Value for {col} :',threshold_val)

ValueError: invalid literal for int() with base 10: '1000:'

In [30]:
f = ['count', 'mean']

# Initialize cust_drop_id and movie_drop_id as empty lists
cust_drop_id = []
movie_drop_id = []

for col in ['Cust_Id', 'Movie_Id']:
    df_cnt = df.groupby(col)['Rating'].agg(f)
    
    # Check if the index can be converted to integers
    try:
        df_cnt.index = df_cnt.index.map(int)
    except ValueError as e:
        print(f"Error converting index to integers for column {col}: {e}")
        continue
    
    threshold_val = round(df_cnt['count'].quantile(.8))
    
    if col == 'Cust_Id':
        df_cust = df_cnt
        cust_drop_id = df_cust[df_cust['count'] < threshold_val].index
    else:
        df_movie = df_cnt
        movie_drop_id = df_movie[df_movie['count'] < threshold_val].index
    
    print(f'The threshold Value for {col}:', threshold_val)

print('Original Shape: {}'.format(df.shape))
df = df[~df['Cust_Id'].isin(cust_drop_id)]    
df = df[~df['Movie_Id'].isin(movie_drop_id)]
print('After Trim Shape: {}'.format(df.shape))


Error converting index to integers for column Cust_Id: invalid literal for int() with base 10: '1000:'
The threshold Value for Movie_Id: 3898
Original Shape: (9682431, 3)
After Trim Shape: (8551314, 3)


In [31]:
df_title

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
...,...,...
17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17767,2004.0,Fidel Castro: American Experience
17768,2000.0,Epoch
17769,2003.0,The Company


In [33]:
# Reader to read the data
reader = Reader()
# Taking only 100000 for modeling 
data=Dataset.load_from_df(df[['Cust_Id','Movie_Id','Rating']][:100000], reader)
# Initiate the Model
model_svd = SVD()
# Check Cross Validation
cross_validate(model_svd,data, measures=['RMSE','MAE'],verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    nan     nan     nan     nan     nan     nan     nan     
MAE (testset)     nan     nan     nan     nan     nan     nan     nan     
Fit time          0.90    1.04    1.07    1.02    1.09    1.02    0.07    
Test time         0.11    0.10    0.10    0.10    0.11    0.10    0.00    


{'test_rmse': array([nan, nan, nan, nan, nan]),
 'test_mae': array([nan, nan, nan, nan, nan]),
 'fit_time': (0.9048774242401123,
  1.038956642150879,
  1.0654404163360596,
  1.020876407623291,
  1.094811201095581),
 'test_time': (0.11168122291564941,
  0.09932804107666016,
  0.10228919982910156,
  0.09769034385681152,
  0.10569047927856445)}

In [34]:
df_user_785314_liked = df[(df['Cust_Id'] == 785314) & (df['Rating'] == 5)]
df_user_785314_liked = df_user_785314_liked.set_index('Movie_Id')
df_user_785314_liked = df_user_785314_liked.join(df_title)['Name']
print(df_user_785314_liked)

Series([], Name: Name, dtype: object)


In [37]:
#Create copy of the title data
#Reset index to its original
#Drop the movie id for which less rating was given
df_user_785314_recomended = df_title.copy()
df_user_785314_recomended = df_user_785314_recomended.reset_index()
df_user_785314_recomended = df_user_785314_recomended[~df_user_785314_recomended['Movie_Id'].isin(movie_drop_id)]
full_data = Dataset.load_from_df(df[['Cust_Id','Movie_Id','Rating']],reader)
train_data = full_data.build_full_trainset()
model_svd.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16064daae90>

In [38]:

df_user_785314_recomended['Estimate_score'] = df_user_785314_recomended['Movie_Id'].apply(lambda x: model_svd.predict(785314,x).est)
df_user_785314_recomended.sort_values('Estimate_score', ascending=False)

# Creating Sparse matrix with user and rating
df_sparse_mat = pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')
print(df_sparse_mat.shape)
df_sparse_mat



(440828, 381)


Movie_Id,0,8,17,18,26,28,30,33,44,46,...,1860,1861,1862,1865,1866,1867,1877,1884,1890,1901
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,,,,,,,,,,,...,,,,,,,,,,
1000027,,,,,,,,,,,...,,,,,,,,,,
1000033,,,,,,,4.0,,,,...,,,,3.0,,,,,,
1000035,,,,,,,4.0,,,,...,,,,,,,,,,
1000038,,,,,,,5.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999964,,,,,,,,,,,...,,,,,,,4.0,,,
999972,,,,,,,,,,,...,,,,,,,4.0,,,
999977,,,,,,,,,,,...,,,,,,,,,,
999984,,,,,,,,,,,...,,,,,,,,,,


In [39]:
def recommend_movie(movie_title):
    i = int(df_title.index[df_title['Name']==movie_title][0])
    y = df_sparse_mat[i]
    corr_y = df_sparse_mat.corrwith(y) #applying correlation
    df_recommend_movie = pd.DataFrame(corr_y,columns=['Pearson_R'])
    df_recommend_movie.dropna(inplace=True)
    df_recommend_movie = df_recommend_movie.sort_values('Pearson_R',ascending=False)
    df_recommend_movie.index = df_recommend_movie.index.map(int)
    df_recommend_movie = df_recommend_movie.join(df_title).join(df_movie)[['Pearson_R', 'Name', 'count', 'mean']] 
    print(df_recommend_movie[df_recommend_movie['count']>0][:10].to_string(index=False))



# Similar movies for The 10th Kingdom
recommend_movie("The Twilight Samurai")

 Pearson_R                                         Name  count     mean
  1.000000                         The Twilight Samurai   5098 4.029619
  0.520282                             The 10th Kingdom   4532 3.669903
  0.490830                 Murder on the Orient Express   4785 3.743783
  0.485804                                    The Women   4083 3.953466
  0.479132 Saturday Night Live: The Best of Dana Carvey   4495 3.713459
  0.474916                   The Shop Around the Corner   4543 3.801673
  0.468203                        The Battle of Algiers   5945 4.123802
  0.451743                                   The Mighty   4766 3.744440
  0.446952                Absolutely Fabulous: Series 5   4580 4.154367
  0.431463                         A Night at the Opera   5050 4.067921
