Predicting future customer preferences to enhance customer satisfaction and increase sales. To unearth the hidden niche movie products.

In [None]:
#Data processing
import pandas as pd
import numpy as np
import scipy.stats

#visualization
import seaborn as sns
import matplotlib.pyplot as plt

#similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
ratings =pd.read_csv('ratings.csv')
print(ratings.shape)
ratings.tail()

In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
ratings.rating.unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [11]:

movies=pd.read_csv('movies.csv')
print(movies.shape)
movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.movieId.nunique()

9742

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [8]:
tags=pd.read_csv('tags.csv')
print(tags.shape)
tags.tail(5)

(3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978
3682,610,168248,Heroic Bloodshed,1493844270


In [9]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [10]:
link=pd.read_csv('links.csv')
print(link.shape)
link.head()

FileNotFoundError: [Errno 2] No such file or directory: 'links.csv'

In [12]:
#merging datasets
df= pd.merge(ratings,movies, on = 'movieId', how ='inner')

In [None]:
df.head()


In [13]:
#filter the movies and keep movies and those with over 50 ratings for our analysis
#1. group movies by title
#2.count the number of ratings
#3. keep only movies >50 rating
#Aggregate by movie
aggregated_ratings = df.groupby('title').agg(mean_rating = ('rating','mean'),number_of_ratings =('rating', 'count')).reset_index()
#keeping movies with over 50 ratings
aggregated_ratings_50 = aggregated_ratings[aggregated_ratings['number_of_ratings']>50]
aggregated_ratings_50.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437 entries, 18 to 9703
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              437 non-null    object 
 1   mean_rating        437 non-null    float64
 2   number_of_ratings  437 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 13.7+ KB


In [14]:
#check the most popular movies and their ratings
aggregated_ratings_50.sort_values(by='number_of_ratings', ascending =False).head()

Unnamed: 0,title,mean_rating,number_of_ratings
3158,Forrest Gump (1994),4.164134,329
7593,"Shawshank Redemption, The (1994)",4.429022,317
6865,Pulp Fiction (1994),4.197068,307
7680,"Silence of the Lambs, The (1991)",4.16129,279
5512,"Matrix, The (1999)",4.192446,278


In [None]:
#visualising the correlation between average ratings and number of ratings
import seaborn as sns
import matplotlib.pyplot as plt
sns.scatterplot(x='mean_rating',y='number_of_ratings', data=aggregated_ratings_50)

In [None]:
#most movies have less than 150 number of ratings
#most movies in the dataset has an average ratings of between 3-4
#

In [15]:
#merging our dataframe with the filtered data
df_50= pd.merge(df,aggregated_ratings_50[['title']], on='title', how ='inner')
df_50.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40712 entries, 0 to 40711
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     40712 non-null  int64  
 1   movieId    40712 non-null  int64  
 2   rating     40712 non-null  float64
 3   timestamp  40712 non-null  int64  
 4   title      40712 non-null  object 
 5   genres     40712 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 2.2+ MB


In [16]:
#Number of users
df_50.userId.nunique()

606

In [17]:
#Number of movies rated
df_50.movieId.nunique()

438

In [18]:
#Number of ratings
df_50.rating.nunique()

10

In [20]:
#list of unique ratings
sorted(df_50.rating.unique())

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [19]:
#create user-movie matrix
matrix = df_50.pivot_table(index='title', columns ='userId', values='rating')
matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),,,,,,,,,,,...,,,3.0,,5.0,,,,,
12 Angry Men (1957),,,,5.0,,,,,,,...,5.0,,,,,,,,,
2001: A Space Odyssey (1968),,,,,,,4.0,,,,...,,,5.0,,,5.0,,3.0,,4.5
28 Days Later (2002),,,,,,,,,,,...,,,,,,,,3.5,,5.0
300 (2007),,,,,,,,,,3.0,...,,,,,3.0,,,5.0,,4.0


In [21]:
#data normalization
#we'll subtract average ratings of each movie from our matrix to get a mean centered cosine similarity.
#after nomalization we expect to get a negative value for rating below the average mean and positive values for above the mean average rating
normalized_matrix = matrix.subtract(matrix.mean(axis =1), axis =0)
normalized_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),,,,,,,,,,,...,,,-0.527778,,1.472222,,,,,
12 Angry Men (1957),,,,0.850877,,,,,,,...,0.850877,,,,,,,,,
2001: A Space Odyssey (1968),,,,,,,0.105505,,,,...,,,1.105505,,,1.105505,,-0.894495,,0.605505
28 Days Later (2002),,,,,,,,,,,...,,,,,,,,-0.474138,,1.025862
300 (2007),,,,,,,,,,-0.68125,...,,,,,-0.68125,,,1.31875,,0.31875


In [22]:
#calculating the similarity score
#1. using pearson correlation
item_similarity = normalized_matrix.T.corr()
item_similarity.head()

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Wild Wild West (1999),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),1.0,0.0,-0.253516,0.107948,0.339416,0.1642,-0.594988,-0.3814107,-0.255057,-0.289758,...,0.191407,0.04301,-0.216664,0.725502,0.115256,0.109621,0.001423,0.443656,0.27634,-0.171146
12 Angry Men (1957),0.0,1.0,0.040225,-0.009787,-0.286069,0.07152,0.311501,-0.2030856,-0.02376,0.288207,...,0.130767,0.177233,0.406119,0.245144,-0.322066,-0.012245,-0.388102,0.21806,0.065653,0.336515
2001: A Space Odyssey (1968),-0.253516,0.040225,1.0,0.048894,0.114073,0.313433,0.020369,-0.09457229,-0.036319,0.017644,...,-0.183708,0.087803,-0.016502,0.355364,-0.123862,0.11615,-0.0989,-0.014795,-0.463892,-0.372286
28 Days Later (2002),0.107948,-0.009787,0.048894,1.0,0.195523,0.648477,0.079585,-2.3596410000000003e-17,-0.176399,-0.212558,...,-0.150651,0.171987,0.230005,0.462428,0.248323,0.098859,0.427635,0.106995,0.631143,0.216676
300 (2007),0.339416,-0.286069,0.114073,0.195523,1.0,0.341233,0.177706,-0.3771549,0.267142,0.232495,...,0.091625,-0.208296,-0.389535,0.688245,0.160559,0.192747,0.347141,0.179883,0.313584,-0.128771


In [None]:
# The Pearson correlation coefficient ranges from -1 to 1, with 1 indicating a perfect positive correlation, -1 indicating a perfect negative correlation, and 0 indicating no correlation. In the context of movie recommendations, a positive correlation suggests that users who liked one movie are likely to like the other, while a negative correlation suggests the opposite.

# Let's take an example from the table:

# For "10 Things I Hate About You (1999)" and "12 Angry Men (1957)", the Pearson similarity coefficient is 0.0. This implies that there is no linear correlation between the user ratings for these two movies. It doesn't necessarily mean that users dislike one when they like the other; it simply suggests that there's no linear relationship between the ratings.

# For "10 Things I Hate About You (1999)" and "2001: A Space Odyssey (1968)", the Pearson similarity coefficient is -0.253516. This negative value suggests a weak negative correlation. In the context of movie recommendations, it might indicate that users who liked "10 Things I Hate About You" are, on average, less likely to have liked "2001: A Space Odyssey," and vice versa.

In [23]:
#2. using cosine similarity
#NB cosine similarity does not take missing values so we have to drop the missing values before running the code
cosine_item_similarity = cosine_similarity(normalized_matrix.fillna(0))

cosine_item_similarity

array([[ 1.        , -0.01617572, -0.05679086, ...,  0.04744522,
         0.03846235, -0.0345159 ],
       [-0.01617572,  1.        ,  0.0090292 , ...,  0.02095641,
         0.02196174,  0.02128938],
       [-0.05679086,  0.0090292 ,  1.        , ..., -0.00617574,
        -0.12685796, -0.10601816],
       ...,
       [ 0.04744522,  0.02095641, -0.00617574, ...,  1.        ,
         0.0384687 ,  0.05630225],
       [ 0.03846235,  0.02196174, -0.12685796, ...,  0.0384687 ,
         1.        , -0.03608389],
       [-0.0345159 ,  0.02128938, -0.10601816, ...,  0.05630225,
        -0.03608389,  1.        ]])

In [24]:
# A value close to 1 suggests high similarity between the corresponding items or entities.
# A value close to -1 suggests high dissimilarity.
# A value close to 0 suggests little or no similarity

In [None]:
#predicting user's ratings for one movie
#let's use user 601 and say the  movie 10 Things I Hate About You (1999)
# step 1: create a list of movies user 601 has watched and rated
# step 2: rank the similarities between the movies user 601  rated and the movie movie 10 Things I Hate About You
# step 3: select top n movies and highest similarity scores
# step 4: calculate the predicted rating using weighted average of the similarity score and the ratings from user 601

In [25]:
#sort the movie by ratings and remove all movies with missing ratings for user 601
user = 601
movie = '10 Things I Hate About You (1999)'
movies_watched = pd.DataFrame(normalized_matrix[user].dropna(axis =0, how = 'all').sort_values(ascending = False)).reset_index().rename(columns = {601:'rating'})
movies_watched.head()

Unnamed: 0,title,rating
0,Iron Man (2008),1.175532
1,"Incredibles, The (2004)",1.164
2,Ratatouille (2007),1.131944
3,"Monsters, Inc. (2001)",1.128788
4,How to Train Your Dragon (2010),1.056604


In [26]:
#next we will get the similarity score of the movies watched by user 601 with the movie 10 Things I Hate About You (1999)

movie_similarity_score = item_similarity[[movie]].reset_index().rename(columns={'10 Things I Hate About You (1999)':'similarity_score'})

# Rank the similarities between the movies user 601 rated and 10 Things I Hate About You (1999).
n = 5
movies_watched_similarity = pd.merge(left=movies_watched, 
                                            right=movie_similarity_score, 
                                            on='title', 
                                            how='inner')\
                                     .sort_values('similarity_score', ascending=False)[:5]

# Take a look at the User 601 watched movies with highest similarity
movies_watched_similarity

Unnamed: 0,title,rating,similarity_score
39,"Dark Knight Rises, The (2012)",0.506579,0.819413
35,"Wolf of Wall Street, The (2013)",0.583333,0.725502
31,"Avengers, The (2012)",0.630435,0.706423
6,Interstellar (2014),1.006849,0.63303
4,How to Train Your Dragon (2010),1.056604,0.523112


In [28]:
# Assuming movies_watched_similarity is your DataFrame
movies_watched_similarity.rename(columns={'601': 'rating'}, inplace=True)


In [29]:
movies_watched_similarity.head()

Unnamed: 0,title,rating,similarity_score
39,"Dark Knight Rises, The (2012)",0.506579,0.819413
35,"Wolf of Wall Street, The (2013)",0.583333,0.725502
31,"Avengers, The (2012)",0.630435,0.706423
6,Interstellar (2014),1.006849,0.63303
4,How to Train Your Dragon (2010),1.056604,0.523112


In [30]:
#next we will calculate the weighted average ratings and similarity which now becomes the predicted ratings for the movie 10 Things I Hate About You (1999)
# Calculate the predicted rating using weighted average of similarity scores and the ratings from user 601
predicted_rating = round(np.average(movies_watched_similarity['rating'],weights=movies_watched_similarity.similarity_score),3)
predicted_rating

0.726

In [31]:
# Item-based recommendation function
def item_based_rec(user=60, number_of_similar_items=5, number_of_recommendations =3):
  import operator
  # Movies that the target user has not watched
  user_unwatched_movies = pd.DataFrame(normalized_matrix[user].isna()).reset_index()
  user_unwatched_movies = user_unwatched_movies[user_unwatched_movies[60]==True]['title'].values.tolist()

  # Movies that the target user has watched
  user_watched = pd.DataFrame(normalized_matrix[user].dropna(axis=0, how='all').sort_values(ascending=False)).reset_index().rename(columns={60:'rating'})
  
  # Dictionary to save the unwatched movie and predicted rating pair
  rating_prediction ={}  

  # Loop through unwatched movies          
  for picked_movie in user_unwatched_movies: 
    # Calculate the similarity score of the picked movie with other movies
      movie_similarity_score = item_similarity[[picked_movie]].reset_index().rename(columns={picked_movie:'similarity_score'})
    # Rank the similarities between the picked user watched movie and the picked unwatched movie.
      picked_userid_watched_similarity = pd.merge(left=user_watched, 
                                                right=movie_similarity_score, 
                                                on='title', 
                                                how='inner')\
                                        .sort_values('similarity_score', ascending=False)[:number_of_similar_items]
    # Calculate the predicted rating using weighted average of similarity scores and the ratings from user 601
      predicted_rating = round(np.average(picked_userid_watched_similarity['rating'], 
                                        weights=picked_userid_watched_similarity['similarity_score']), 3)
    # Save the predicted rating in the dictionary
      rating_prediction[picked_movie] = predicted_rating
    # Return the top recommended movies
  return sorted(rating_prediction.items(), key=operator.itemgetter(1), reverse=True)[:number_of_recommendations]

# Get recommendations
#recommended_movie = item_based_rec(user=601, number_of_similar_items=5, number_of_recommendations =10)
#recommended_movie
reco = item_based_rec(user=60, number_of_similar_items=5, number_of_recommendations =10)
reco

[('Stargate (1994)', 4.212),
 ('Johnny Mnemonic (1995)', 1.244),
 ('Harry Potter and the Goblet of Fire (2005)', 0.942),
 ('Disclosure (1994)', 0.941),
 ('Avengers, The (2012)', 0.602),
 ('Ace Ventura: Pet Detective (1994)', 0.598),
 ('Star Trek: First Contact (1996)', 0.564),
 ('Broken Arrow (1996)', 0.557),
 ('Gone in 60 Seconds (2000)', 0.51),
 ('Tombstone (1993)', 0.453)]