In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the movies table (columns: movieId, title, genres).
# The folder defaults to the original author's machine-specific location; set the
# MOVIE_DATA_DIR environment variable to the CSV folder to run this elsewhere.
movies = pd.read_csv(
    os.path.join(
        os.environ.get("MOVIE_DATA_DIR", r"C:\Users\NITHIN\OneDrive\Desktop\movie recommendation"),
        "movies.csv",
    )
)

In [3]:
# Load the ratings table (columns: userId, movieId, rating, timestamp).
# The folder defaults to the original author's machine-specific location; set the
# MOVIE_DATA_DIR environment variable to the CSV folder to run this elsewhere.
ratings = pd.read_csv(
    os.path.join(
        os.environ.get("MOVIE_DATA_DIR", r"C:\Users\NITHIN\OneDrive\Desktop\movie recommendation"),
        "ratings.csv",
    )
)

In [4]:
# Load the links table (columns: movieId, imdbId, tmdbId).
# The folder defaults to the original author's machine-specific location; set the
# MOVIE_DATA_DIR environment variable to the CSV folder to run this elsewhere.
links = pd.read_csv(
    os.path.join(
        os.environ.get("MOVIE_DATA_DIR", r"C:\Users\NITHIN\OneDrive\Desktop\movie recommendation"),
        "links.csv",
    )
)

In [5]:
# Load the tags table (columns: userId, movieId, tag, timestamp).
# The folder defaults to the original author's machine-specific location; set the
# MOVIE_DATA_DIR environment variable to the CSV folder to run this elsewhere.
tags = pd.read_csv(
    os.path.join(
        os.environ.get("MOVIE_DATA_DIR", r"C:\Users\NITHIN\OneDrive\Desktop\movie recommendation"),
        "tags.csv",
    )
)

In [6]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Preview the first five rows of the links table.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [10]:
# Report the (rows, columns) shape of each loaded table, one line per table.

print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ("Movies shape:", movies),
            ("Ratings shape:", ratings),
            ("Tags shape:", tags),
            ("Links shape:", links),
        )
    ),
    sep="\n",
)


Movies shape: (9742, 3)
Ratings shape: (100836, 4)
Tags shape: (3683, 4)
Links shape: (9742, 3)


In [11]:
# Count missing values per column.
# The per-table summaries are printed unlabelled, in this order:
# movies, ratings, tags, links.

print(
    *(frame.isna().sum() for frame in (movies, ratings, tags, links)),
    sep="\n",
)


movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64
movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [12]:
# Print the column dtypes, non-null counts and memory usage of each table
# (order: movies, ratings, tags, links).

for _table in (movies, ratings, tags, links):
    _table.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-nul

In [13]:
# Cell 1: Import ML libraries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [14]:
# Build one text field per movie: the title, a single space, then the
# pipe-separated genres. This column is the input to the TF-IDF vectoriser.

movies['combined'] = movies['title'].add(" ").add(movies['genres'])
movies.loc[:, ['title', 'genres', 'combined']].head()


Unnamed: 0,title,genres,combined
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [15]:
# Turn the combined title+genres text into TF-IDF vectors, dropping
# English stop words. One row per movie, one column per vocabulary term.

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['combined'])

tfidf_matrix.shape


(9742, 9060)

In [16]:
# Cosine similarity between every pair of movies. When only one matrix is
# given, it is compared against itself.
# NOTE: the result has one row and one column per movie, so its size grows
# quadratically with the number of movies.

cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape


(9742, 9742)

In [17]:
# Cell 5: Create recommendation function

def recommend_movie(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to the first movie whose title contains ``title``.

    Parameters
    ----------
    title : str
        Text searched for inside ``movies['title']``. It is matched literally
        (not as a regular expression) and case-insensitively, so real titles
        containing characters such as ``(``, ``)`` or ``*`` work as-is.
        When several titles match, the first matching row is the query movie.
    cosine_sim : array-like
        Pairwise similarity matrix; row ``i`` must hold the scores of the movie
        at position ``i`` of ``movies`` against every movie.
    top_n : int, default 10
        Maximum number of recommendations to return; must be non-negative.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, never including the query
        movie itself. Ties keep their original row order.

    Raises
    ------
    IndexError
        If no title contains ``title``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    # regex=False: treat the query as plain text so "Toy Story (1995)" or
    # "M*A*S*H" are not misread as regular-expression syntax.
    is_match = movies['title'].str.contains(title, case=False, na=False, regex=False)

    # Work with row positions (not index labels) because cosine_sim is
    # addressed positionally.
    match_positions = np.flatnonzero(is_match.to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"No movie title contains {title!r}")
    idx = match_positions[0]

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Stable sort on the negated scores: descending similarity, with ties
    # kept in ascending row order.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query movie explicitly rather than assuming it sorts first;
    # another row with an identical score could otherwise take its place.
    ranked = ranked[ranked != idx][:top_n]

    return movies['title'].iloc[ranked]


In [20]:
# Example query: recommendations for the first movie whose title
# contains "harry potter".

recommend_movie(title="harry potter")


6062           Harry Potter and the Goblet of Fire (2005)
4076       Harry Potter and the Chamber of Secrets (2002)
6522     Harry Potter and the Order of the Phoenix (2007)
5166      Harry Potter and the Prisoner of Azkaban (2004)
7465    Harry Potter and the Deathly Hallows: Part 1 (...
7435                                         Stone (2010)
7644    Harry Potter and the Deathly Hallows: Part 2 (...
7078        Harry Potter and the Half-Blood Prince (2009)
6078                             Family Stone, The (2005)
6380                                   Miss Potter (2006)
Name: title, dtype: object