# 📚 Import Required Libraries

In [27]:
import pandas as pd                             # Data manipulation
from math import sqrt                           # Only import sqrt from math for correlation
import numpy as np                              # Numerical operations
import matplotlib.pyplot as plt                 # Plotting
#%matplotlib inline                              # Show plots inside Jupyter Notebook


# 📁 Load Movie and Rating Datasets

In [28]:
movies_df = pd.read_csv('movies.csv')           # Load movie metadata
ratings_df = pd.read_csv('ratings.csv')         # Load user ratings
movies_df.head()                                # Show first few rows


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# 🧹 Extract and Clean Year from Movie Titles

In [29]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))', expand=False)      # Extract year with parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)', expand=False)           # Remove parentheses
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex=True)   # Remove year from title
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())                   # Remove trailing whitespace
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


# 🎭 Split Genres into Lists

In [30]:
movies_df['genres'] = movies_df.genres.str.split('|')    # Split genres into lists
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


# 🧬 One-Hot Encode Genres

In [31]:
moviesWithGenres_df = movies_df.copy()
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 🧼 Drop Irrelevant Columns

In [7]:
ratings_df = ratings_df.drop('timestamp', axis=1)
ratings_df.head()


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# ✍️ User Input for Content-Based Filtering

In [8]:
userInput = [
    {'title': 'Breakfast Club, The', 'rating': 5},
    {'title': 'Toy Story', 'rating': 3.5},
    {'title': 'Jumanji', 'rating': 2},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 4.5}
]
inputMovies = pd.DataFrame(userInput)
inputMovies


Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


# 🔍 Match Input with Movie IDs

In [33]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#print(inputId)
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('genres', axis=1).drop('year', axis=1)
#Final input dataframe
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


# 🧪 Filter Genre Info for Input Movies

In [34]:
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'].tolist())]
userMovies = userMovies.reset_index(drop=True)
userGenreTable = userMovies.drop(['movieId', 'title', 'genres', 'year'], axis=1)   # Clean for matrix math
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 🧮 Build User Profile (Weighted Genres)

In [35]:
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])               # Multiply genres by ratings
userProfile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

# 🧱 Calculate Genre Weights for All Movies

In [12]:
genreTable = moviesWithGenres_df.set_index('movieId')
genreTable = genreTable.drop(['title', 'genres', 'year'], axis=1)
genreTable.head()


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
genreTable.shape

(9742, 20)

# 🧾 Compute Recommendation Scores

In [37]:
recommendationTable_df = (genreTable * userProfile).sum(axis=1) / userProfile.sum()
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
recommendationTable_df.head()


movieId
134853    0.734266
148775    0.685315
117646    0.678322
6902      0.678322
81132     0.671329
dtype: float64

# 🏆 Display Top Recommendations

In [14]:
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(20).keys())]


Unnamed: 0,movieId,title,genres,year
559,673,Space Jam,"[Adventure, Animation, Children, Comedy, Fanta...",1996
1390,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
2250,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
3460,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
4631,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002
5490,26340,"Twelve Tasks of Asterix, The (Les douze travau...","[Action, Adventure, Animation, Children, Comed...",1976
5819,32031,Robots,"[Adventure, Animation, Children, Comedy, Fanta...",2005
6047,40339,Chicken Little,"[Action, Adventure, Animation, Children, Comed...",2005
6448,51939,TMNT (Teenage Mutant Ninja Turtles),"[Action, Adventure, Animation, Children, Comed...",2007
6455,52287,Meet the Robinsons,"[Action, Adventure, Animation, Children, Comed...",2007


# 🤝 Collaborative Filtering – User Matching

In [39]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
16,1,296,3.0,964982967
320,4,296,1.0,945173350
422,4,1968,4.0,986934786
516,5,1,4.0,847434962


In [42]:
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)[:10000]

# 🔗 Compute Pearson Correlation

In [43]:
pearsonCorrelationDict = {}
for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    nRatings = len(group)
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - \
          sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy) if Sxx != 0 and Syy != 0 else 0
pearsonCorrelationDict.items()

dict_items([((91,), 0.43852900965351443), ((177,), 0.0), ((219,), 0.45124262819713973), ((274,), 0.716114874039432), ((298,), 0.9592712306918567), ((414,), 0.9376144618769914), ((474,), 0.11720180773462392), ((477,), 0.4385290096535153), ((480,), 0.7844645405527362), ((483,), 0.08006407690254357), ((599,), 0.7666866491579839), ((608,), 0.920736884379251), ((50,), 0.15713484026367722), ((57,), -0.7385489458759964), ((68,), 0.0), ((103,), 0.5222329678670935), ((135,), 0.8703882797784892), ((182,), 0.9428090415820635), ((202,), 0.5222329678670935), ((217,), 0.30151134457776363), ((226,), 0.9438798074485389), ((288,), 0.6005325641789633), ((307,), 0.9655810287305759), ((318,), 0.44486512077567225), ((322,), 0.5057805388588731), ((330,), 0.9035942578600878), ((357,), 0.5606119105813882), ((434,), 0.9864036607532465), ((448,), 0.30151134457776363), ((469,), 0.8164965809277261), ((561,), 0.5222329678670935), ((600,), 0.18442777839082938), ((606,), 0.9146591207600472), ((610,), -0.471404520791

# 📈 Build Similarity DataFrame

In [45]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.438529,"(91,)"
1,0.0,"(177,)"
2,0.451243,"(219,)"
3,0.716115,"(274,)"
4,0.959271,"(298,)"


# 🎯 Top 50 Most Similar Users

In [46]:
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
43,1.0,"(132,)"
181,1.0,"(382,)"
219,1.0,"(602,)"
130,1.0,"(130,)"
129,1.0,"(125,)"


# 🔄 Merge with Ratings

In [47]:
#topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsers['userId'].dtype)
print(ratings_df['userId'].dtype)
print(topUsers['userId'].unique())  # Check unique values in topUsers
print(ratings_df['userId'].unique())  # Check unique values in ratings_df


#topUsers['userId'] = pd.to_numeric(topUsers['userId'], errors='coerce').astype('Int64')
# Extract the first element from the tuples
topUsers['userId'] = topUsers['userId'].apply(lambda x: x[0])

# Convert to integers if necessary
topUsers['userId'] = pd.to_numeric(topUsers['userId'], errors='coerce').astype('Int64')

ratings_df['userId'] = pd.to_numeric(ratings_df['userId'], errors='coerce').astype('Int64')

print(topUsers['userId'].unique())  # Check unique values in topUsers
print(ratings_df['userId'].unique())  # Check unique values in ratings_df


# Now perform the merge
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')


topUsersRating.head()

object
int64
[(132,) (382,) (602,) (130,) (125,) (124,) (119,) (117,) (18,) (112,)
 (105,) (580,) (597,) (76,) (305,) (62,) (609,) (33,) (32,) (144,) (17,)
 (15,) (5,) (525,) (134,) (563,) (528,) (233,) (347,) (399,) (336,) (334,)
 (290,) (279,) (446,) (260,) (254,) (239,) (222,) (221,) (490,) (193,)
 (191,) (187,) (512,) (517,) (178,) (522,) (169,) (527,)]
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,1.0,132,1,2.0,1157921785
1,1.0,132,17,3.0,1157922698
2,1.0,132,29,2.0,1157924165
3,1.0,132,32,3.0,1329983726
4,1.0,132,34,1.5,1157921395


# 🧮 Weighted Recommendation Score

In [48]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['rating']
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.0,124.0
2,18.0,58.0
3,3.0,11.0
5,3.0,8.5
6,13.0,49.5


# 🧾 Final Score and Sorting

In [49]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = \
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3035,5.0,3035
1956,5.0,1956
2522,5.0,2522
2495,5.0,2495
2477,5.0,2477


# 🎬 Show Final Collaborative Recommendations

In [50]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres,year
1433,1956,Ordinary People,[Drama],1980
1826,2427,"Thin Red Line, The","[Action, Drama, War]",1998
1842,2450,Howard the Duck,"[Adventure, Comedy, Sci-Fi]",1986
1846,2455,"Fly, The","[Drama, Horror, Sci-Fi, Thriller]",1986
1865,2477,Firewalker,[Adventure],1986
1877,2495,"Fantastic Planet, The (Planète sauvage, La)","[Animation, Sci-Fi]",1973
1898,2522,Airport '77,[Drama],1977
2288,3035,Mister Roberts,"[Comedy, Drama, War]",1955
9380,163112,Winnie the Pooh Goes Visiting,[Animation],1971
9382,163386,Winnie the Pooh and the Day of Concern,[Animation],1972
