## Recommendation System

### Important Libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

### Data Loading

In [5]:
import os
import pandas as pd

# Print the current working directory
print("Current working directory:", os.getcwd())

# List the files in the Desktop directory
desktop_path = os.path.expanduser("~/Desktop")
print("Files in Desktop directory:", os.listdir(desktop_path))

# Attempt to read the CSV file with the absolute path
csv_file_path = os.path.join(desktop_path, 'rating.csv')

try:
    df = pd.read_csv("Desktop/rating.csv")
    print("CSV file loaded successfully.")
except FileNotFoundError as e:
    print(e)
df

Current working directory: C:\Users\1\Desktop
Files in Desktop directory: ['.ipynb_checkpoints', 'bestvalue.keras', 'category_tree.csv.csv', 'Churn_Modelling.csv', 'CodeBlocks.lnk', 'Desktop files', 'desktop.ini', 'events.csv', 'Image_Recognition_for_Tech_Products.ipynb', 'item_properties_part1.csv', 'item_properties_part2.csv', 'LoanDataset - LoansDatasest.csv', 'movie.csv', 'Personalized_Marketing_in_E-Commerce.ipynb', 'rating.csv', 'Recommendation_Systems.ipynb', 'seg_pred', 'seg_test', 'seg_train', 'Sentiment_Analysis_for_Brand_Monitoring.ipynb', 'stock_list.csv', 'stock_prices.csv', 'training.1600000.processed.noemoticon.csv', 'Transactions Data.csv']
[Errno 2] No such file or directory: 'Desktop/rating.csv'


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


### EDA

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


In [7]:
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [9]:
df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [10]:
df1 = df[['userId', 'movieId', 'rating']]

### Rating Count

In [11]:
df1 = df[['userId', 'movieId', 'rating']]
df1_rating_tm = pd.DataFrame(df1.groupby('rating').size(), columns = ['count'])
df1_rating_tm

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
0.5,239125
1.0,680732
1.5,279252
2.0,1430997
2.5,883398
3.0,4291193
3.5,2200156
4.0,5561926
4.5,1534824
5.0,2898660


In [12]:
df1.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [13]:
num_of_user = len(df1['userId'].unique())
num_of_movie = len(df1['movieId'].unique())

print(f'There are {num_of_user} users and {num_of_movie} movies')
print(f"Min value of 'userId': {df1['userId'].min()} and Max value of 'userId': {df1['userId'].max()}")
print(f"Min value of 'movieId': {df1['movieId'].min()} and Max value of 'movieId': {df1['movieId'].max()}")
print(f"Min value of 'rating': {df1['rating'].min()} and Max value of 'rating': {df1['rating'].max()}")

There are 138493 users and 26744 movies
Min value of 'userId': 1 and Max value of 'userId': 138493
Min value of 'movieId': 1 and Max value of 'movieId': 131262
Min value of 'rating': 0.5 and Max value of 'rating': 5.0


In [14]:
df.movieId.values

array([    2,    29,    32, ..., 69644, 70286, 71619], dtype=int64)

### Shrinking Dataset

In [15]:
def shrinking_data(no_of_top_users):
    user_likes = df1.groupby('userId')['movieId'].count()
    top_users = user_likes.nlargest(no_of_top_users).index
    print(f"Top Users Index Values: {top_users}")
    top_users_df = df1[df1['userId'].isin(top_users)].reset_index(drop=True)
    print(f'Top {no_of_top_users} Dataframe')
    return top_users_df

In [16]:
no_of_top_users = 10000
final_top_users_df = shrinking_data(no_of_top_users)
final_top_users_df 

Top Users Index Values: Index([118205,   8405,  82418, 121535, 125794,  74142,  34576, 131904,  83090,
        59477,
       ...
        92433,  95536, 102921, 103578, 106776, 108493, 110559, 115351, 116360,
       117537],
      dtype='int64', name='userId', length=10000)
Top 10000 Dataframe


Unnamed: 0,userId,movieId,rating
0,11,1,4.5
1,11,10,2.5
2,11,19,3.5
3,11,32,5.0
4,11,39,4.5
...,...,...,...
7718626,138474,5401,1.0
7718627,138474,5449,4.0
7718628,138474,5459,4.0
7718629,138474,5460,5.0


In [17]:
final_top_users_df.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [18]:
final_top_users_df[['userId', 'movieId']]

Unnamed: 0,userId,movieId
0,11,1
1,11,10
2,11,19
3,11,32
4,11,39
...,...,...
7718626,138474,5401
7718627,138474,5449
7718628,138474,5459
7718629,138474,5460


### Data Splitting

In [19]:
# Split only the features
features_train, features_test, target_train, target_test = train_test_split(final_top_users_df[['userId', 'movieId']],final_top_users_df['rating'], test_size=0.2, random_state=42, stratify = final_top_users_df['rating'])

df_shrink_trainset = pd.concat([features_train, target_train], axis = 1).reset_index(drop=True)
df_shrink_testset = pd.concat([features_test, target_test], axis = 1).reset_index(drop=True)

In [20]:
df_shrink_trainset.shape

(6174904, 3)

In [21]:
df_shrink_testset.shape

(1543727, 3)

In [22]:
df_shrink_trainset[df_shrink_trainset['userId'] == 11]
df_shrink_testset[df_shrink_testset['userId'] == 11]

Unnamed: 0,userId,movieId,rating
15147,11,6377,5.0
55790,11,3114,5.0
97041,11,6264,2.5
103552,11,8578,2.0
115350,11,410,4.0
...,...,...,...
1473170,11,8865,4.0
1490085,11,47124,3.5
1504692,11,2012,5.0
1527450,11,384,3.5


### Data Mapping

In [23]:
def create_mappings(dataframe_name):
    # user to movie mapping
    user2movie = dataframe_name.groupby('userId')['movieId'].unique().to_dict()

    # movie to user mapping
    movie2user = dataframe_name.groupby('movieId')['userId'].unique().to_dict()

    return user2movie, movie2user

user2movie, movie2user = create_mappings(df_shrink_trainset)

In [24]:
len(user2movie)

10000

In [25]:
def usermovierating_mappings(dataframe_name):
    # start time
    start_time = time.time() 
    # User-Movie to Rating Mapping
    usermovie2rating = dataframe_name.pivot_table(index='userId', columns='movieId', values='rating')
    
    # Fill NaN values with 0
    usermovie2rating.fillna(0, inplace=True)
    
    # end time
    end_time = time.time() 
    
    # Calculate the processing time
    processing_time = end_time - start_time  
    print(f"Processing time: {processing_time} seconds")

    return usermovie2rating

### Training Data Matrix

In [26]:
usermovie_to_rating_train = usermovierating_mappings(df_shrink_trainset)
usermovie_to_rating_train

Processing time: 66.90573573112488 seconds


movieId,1,2,3,4,5,6,7,8,9,10,...,131172,131176,131180,131231,131239,131243,131248,131258,131260,131262
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,4.0,0.0,0.0,0.0,2.0,4.0,3.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,4.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.0,3.5,3.0,0.0,0.0,0.0,2.5,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138404,5.0,2.5,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138406,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138411,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138437,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Data Matrix

In [27]:
usermovie_to_rating_test = usermovierating_mappings(df_shrink_testset)
usermovie_to_rating_test

Processing time: 18.349979877471924 seconds


movieId,1,2,3,4,5,6,7,8,9,10,...,131158,131164,131166,131168,131174,131241,131250,131252,131254,131256
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138406,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138411,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138437,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Build Model

In [28]:
# Define a KNN model on cosine similarity
cf_knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)


# Fitting the model on our matrix
cf_knn_model.fit(usermovie_to_rating_train)

### Load Movie Metadata

In [38]:
import pandas as pd

# File name on the Desktop
csv_file_name = 'movie.csv'

# Try to read the CSV file
try:
    movie_metadata = pd.read_csv(csv_file_name)
    print("CSV file loaded successfully.")
    print(movie_metadata.head())
except FileNotFoundError as e:
    print(e)


CSV file loaded successfully.
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


### Traindataset After Shrinking

In [39]:
df_shrink_trainset

Unnamed: 0,userId,movieId,rating
0,11924,34405,2.0
1,89799,61236,4.0
2,1864,2448,3.0
3,113129,8961,5.0
4,8966,2423,3.0
...,...,...,...
6174899,347,6377,3.5
6174900,70945,3317,5.0
6174901,106894,1527,4.0
6174902,43679,3948,2.0


### Datasets Merging

In [40]:
def datamerge(df1, df2):
 
    merging = df1.merge(df2, on='movieId')
    
    return merging

In [41]:
movie_data = datamerge(df_shrink_trainset, movie_metadata)
movie_data.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
0,11924,34405,2.0,Serenity (2005),Action|Adventure|Sci-Fi
1,118372,34405,4.5,Serenity (2005),Action|Adventure|Sci-Fi
2,38669,34405,4.0,Serenity (2005),Action|Adventure|Sci-Fi
3,20158,34405,4.5,Serenity (2005),Action|Adventure|Sci-Fi
4,100804,34405,3.5,Serenity (2005),Action|Adventure|Sci-Fi


In [42]:
movie_data.shape

(6174904, 5)

### Load Dataset for Movie Title and Genres

In [50]:
import pandas as pd
import os

# Verify the current working directory
print("Current working directory:", os.getcwd())

# Verify that the file exists
file_exists = os.path.isfile('movie.csv')
print("Does 'movie.csv' exist in the current directory?", file_exists)

# Try to read the CSV file
try:
    movie_metadata = pd.read_csv('movie.csv')
    print("CSV file loaded successfully.")
    print(movie_metadata.head())
except FileNotFoundError as e:
    print(e)


Current working directory: C:\Users\1\Desktop
Does 'movie.csv' exist in the current directory? True
CSV file loaded successfully.
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


### movie recommendations system Based On Nearest Neighbors

In [51]:
usermovie_to_rating_train.T

userId,11,24,54,58,91,96,104,116,131,134,...,138317,138325,138335,138382,138397,138404,138406,138411,138437,138474
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.5,4.0,4.0,5.0,0.0,3.5,0.0,3.0,2.0,4.0,...,3.0,5.0,0.0,3.0,0.0,5.0,4.0,0.0,4.0,5.0
2,0.0,0.0,3.0,0.0,3.5,0.0,0.0,2.0,1.0,0.0,...,3.0,3.0,0.0,4.0,0.0,2.5,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,3.0,4.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Movie Recommendation System

In [56]:
import pandas as pd
import os

# Verify the current working directory
print("Current working directory:", os.getcwd())

# Verify that the file exists
file_exists = os.path.isfile('movie.csv')
print("Does 'movie.csv' exist in the current directory?", file_exists)

# Try to read the CSV file
try:
    movie_metadata = pd.read_csv('movie.csv')
    print("CSV file loaded successfully.")
    print(movie_metadata.head())
except FileNotFoundError as e:
    print(e)

# Ensure the movie DataFrame is defined
movie = movie_metadata

# Define the movie_recommendation function
def movie_recommendation(movie_id, no_of_nearest_neighbors):
    # Placeholder logic for getting similar movies
    similar_movies_ids = movie['movieId'].sample(no_of_nearest_neighbors).values
    
    cf_recs = []
    for i in similar_movies_ids:
        # Using .iloc to access the data by positional index instead of label
        cf_recs.append({'Movie Id': i, 'Title': movie[movie['movieId'] == i]['title'].values[0]})

    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index=range(1, no_of_nearest_neighbors + 1))
    return df

# Example usage:
chosen_movie_id = 1
recommended_movies = movie_recommendation(chosen_movie_id, 10)  # Replace 1 with the movie ID for which you want recommendations
print('Chosen movie based on movie id: ', movie[movie['movieId'] == chosen_movie_id]['title'].values[0])
print("Recommended movies:")
print(recommended_movies)


Current working directory: C:\Users\1\Desktop
Does 'movie.csv' exist in the current directory? True
CSV file loaded successfully.
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Chosen movie based on movie id:  Toy Story (1995)
Recommended movies:
    Movie Id                                         Title
1     118932         The Uncommon Making of Petulia (2006)
2     128542                               Wyrmwood (2015)
3      60044                

### Movie Recommendation System with Distance

In [58]:
def movie_recommender_engine(movie_name, matrix, model_name, no_of_nearest_neighbors):
    
    # Extract input movie ID
    movie_id = process.extractOne(movie_name, movie['title'])[2]
    
    # Calculate neighbour distances
    distances, indices =  distances, indices = model_name.kneighbors(matrix.iloc[:, movie_id].values.reshape(1, -1))
    
    similar_movies_ids = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
    # List to store recommendations
    cf_recs = []
    for i in similar_movies_ids:
        cf_recs.append({'Title':movie['title'][i[0]],'Distance':i[1]})
    
    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index = range(1,no_of_nearest_neighbors))
     
    return df

In [59]:
movie_recommender_engine('Batman', usermovie_to_rating_train.T, cf_knn_model, 10)

Unnamed: 0,Title,Distance
1,Mr. Nanny (1993),0.500793
2,My Bodyguard (1980),0.500718
3,Melvin and Howard (1980),0.496316
4,"Road Warrior, The (Mad Max 2) (1981)",0.493298
5,My House in Umbria (2003),0.492907
6,Red Beard (Akahige) (1965),0.481609
7,"White Sound, The (Das weiße Rauschen) (2001)",0.481285
8,Go Figure (Va savoir) (2001),0.47816
9,American History X (1998),0.468807
