In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
movies = pd.read_csv('./CSV/movies.csv')
ratings = pd.read_csv('./CSV/ratings.csv')
tags = pd.read_csv('./CSV/tags.csv')

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
#duplikates = movies[movies.duplicated('title', keep=False)].sort_values("title")
new_movies = movies.drop_duplicates("title").copy()

In [None]:
title_sep = new_movies["title"].str.rsplit(" ", n=1, expand = True)
new_movies["movie title"] = title_sep[0]
new_movies["year"] = title_sep[1]
new_movies.drop(columns=["title"], inplace=True)
new_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86330 entries, 0 to 86536
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      86330 non-null  int64 
 1   genres       86330 non-null  object
 2   movie title  86330 non-null  object
 3   year         86217 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.3+ MB


In [7]:
movies_with_ratings = ratings.merge(new_movies, on="movieId")
movies_with_ratings.drop(["timestamp"], axis=1, inplace=True)
movies_with_ratings.head()

Unnamed: 0,userId,movieId,rating,genres,movie title,year
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,(1995)
1,1,110,4.0,Action|Drama|War,Braveheart,(1995)
2,1,158,4.0,Adventure|Children,Casper,(1995)
3,1,260,4.5,Action|Adventure|Sci-Fi,Star Wars: Episode IV - A New Hope,(1977)
4,1,356,5.0,Comedy|Drama|Romance|War,Forrest Gump,(1994)


In [8]:
movies_with_ratings.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33828467 entries, 0 to 33828466
Data columns (total 6 columns):
 #   Column       Non-Null Count     Dtype  
---  ------       --------------     -----  
 0   userId       33828467 non-null  int64  
 1   movieId      33828467 non-null  int64  
 2   rating       33828467 non-null  float64
 3   genres       33828467 non-null  object 
 4   movie title  33828467 non-null  object 
 5   year         33820549 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.5+ GB


In [9]:
movies_with_ratings.isna().sum()

userId            0
movieId           0
rating            0
genres            0
movie title       0
year           7918
dtype: int64

In [10]:
movies_with_ratings.fillna('(Release year unknown)', inplace=True)

In [11]:
movies_with_ratings.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33828467 entries, 0 to 33828466
Data columns (total 6 columns):
 #   Column       Non-Null Count     Dtype  
---  ------       --------------     -----  
 0   userId       33828467 non-null  int64  
 1   movieId      33828467 non-null  int64  
 2   rating       33828467 non-null  float64
 3   genres       33828467 non-null  object 
 4   movie title  33828467 non-null  object 
 5   year         33828467 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.5+ GB


In [12]:
x = movies_with_ratings.groupby("userId").count()["rating"] > 100
expert_users = x[x].index

filtered_user_ratings = movies_with_ratings[movies_with_ratings["userId"].isin(expert_users)]

y = filtered_user_ratings.groupby("movie title").count()["rating"] >= 50
famous_movies = y[y].index

user_ratings = filtered_user_ratings[filtered_user_ratings["movie title"].isin(famous_movies)]

In [13]:
design_matrix = user_ratings.pivot_table(index="movie title", columns="userId", values="rating")

design_matrix.fillna(0, inplace=True)

In [14]:
scaler = StandardScaler(with_mean=True, with_std=True)
design_matrix_centered = scaler.fit_transform(design_matrix)

In [15]:
similiarity_score = cosine_similarity(design_matrix_centered)
similiarity_score

array([[1.        , 0.20979135, 0.36035013, ..., 0.33537196, 0.21248281,
        0.32273158],
       [0.20979135, 1.        , 0.29112775, ..., 0.30047266, 0.19151363,
        0.29153297],
       [0.36035013, 0.29112775, 1.        , ..., 0.48185077, 0.30609494,
        0.44522109],
       ...,
       [0.33537196, 0.30047266, 0.48185077, ..., 1.        , 0.32272393,
        0.45642736],
       [0.21248281, 0.19151363, 0.30609494, ..., 0.32272393, 1.        ,
        0.29247883],
       [0.32273158, 0.29153297, 0.44522109, ..., 0.45642736, 0.29247883,
        1.        ]], shape=(15267, 15267))

In [16]:
def recommend(movie):
    index = np.where(design_matrix.index==movie)[0][0]
    similar_movies = sorted(list(enumerate(similiarity_score[index])), key=lambda x: x[1], reverse=True)[1:6]
    data = []

    for index, similarity in similar_movies:
        item = []
        temp_df = new_movies[new_movies["movie title"]==design_matrix.index[index]]
        item.extend(temp_df["movie title"].values)
        item.extend(temp_df["year"].values)
        data.append(item)
    return data

In [18]:
recommend("Titanic")

[['Forrest Gump', '(1994)'],
 ['Men in Black (a.k.a. MIB)', '(1997)'],
 ['Matrix, The', '(1999)'],
 ['Sixth Sense, The', '(1999)'],
 ['Saving Private Ryan', '(1998)']]