In [30]:
import pandas as pd
import numpy as np
# Load the movie catalogue from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="movies.csv")

# Preview the first seven rows to confirm the file parsed as expected.
df.head(n=7)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance


In [14]:
# Inspect the column labels of the loaded frame before selecting a subset.
df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [16]:
# Keep only the two text columns used downstream and, in the same expression,
# turn the '|' genre separator into a space so each genre acts like a word.
# Building the result with .assign() produces a fresh frame, so nothing is
# written into a selection of the original frame (no chained-assignment
# ambiguity), and re-running the cell gives the same result.
df = (
    df[['title', 'genres']]
    .assign(genres=lambda frame: frame['genres'].str.replace('|', ' ', regex=False))
)
df.head(7)


Unnamed: 0,title,genres
0,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,Jumanji (1995),Adventure Children Fantasy
2,Grumpier Old Men (1995),Comedy Romance
3,Waiting to Exhale (1995),Comedy Drama Romance
4,Father of the Bride Part II (1995),Comedy
5,Heat (1995),Action Crime Thriller
6,Sabrina (1995),Comedy Romance


In [17]:
import re

def clean_text(text):
    """Normalize a title or genre string into lowercase, space-separated words.

    The text is lowercased, a parenthesised four-digit year such as "(1995)"
    is removed, every character that is not a letter is replaced by a space,
    runs of whitespace are collapsed, and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the None/NaN that pandas
        uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing usable remains.
    """
    # Missing cells reach this function as None or float NaN, neither of which
    # has .lower(); return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\(\d{4}\)', '', text)  # remove year
    # Replace non-word characters, digits and underscores with a space. Unlike
    # an ASCII-only class this keeps accented and other non-ASCII letters,
    # so a title such as "Amelie" written with an accent stays one word.
    text = re.sub(r'[\W\d_]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Add a normalized companion column for each raw text column by running
# clean_text over every value.
df['title_clean'] = df['title'].map(clean_text)
df['genres_clean'] = df['genres'].map(clean_text)


In [21]:
# Join the cleaned title and cleaned genres (single space between them) into
# the one text field that gets vectorized.
df['content'] = df['title_clean'].str.cat(df['genres_clean'], sep=' ')
df.loc[:, ['title', 'content']].head()

Unnamed: 0,title,content
0,Toy Story (1995),toy story adventure animation children comedy ...
1,Jumanji (1995),jumanji adventure children fantasy
2,Grumpier Old Men (1995),grumpier old men comedy romance
3,Waiting to Exhale (1995),waiting to exhale comedy drama romance
4,Father of the Bride Part II (1995),father of the bride part ii comedy


In [22]:
# Preview the frame, now including the title_clean, genres_clean and content columns.
df.head()

Unnamed: 0,title,genres,title_clean,genres_clean,content
0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,toy story,adventure animation children comedy fantasy,toy story adventure animation children comedy ...
1,Jumanji (1995),Adventure Children Fantasy,jumanji,adventure children fantasy,jumanji adventure children fantasy
2,Grumpier Old Men (1995),Comedy Romance,grumpier old men,comedy romance,grumpier old men comedy romance
3,Waiting to Exhale (1995),Comedy Drama Romance,waiting to exhale,comedy drama romance,waiting to exhale comedy drama romance
4,Father of the Bride Part II (1995),Comedy,father of the bride part ii,comedy,father of the bride part ii comedy


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Vectorize the combined text. English stop words are dropped and the
# vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# One row per movie, in the same order as the rows of df.
tfidf_matrix = tfidf.fit_transform(df['content'])

In [26]:
# Lookup table: movie title -> row position in df (and therefore in
# tfidf_matrix). Positions are stored rather than index labels because the
# consumers index positionally (tfidf_matrix[idx] and .iloc).
indices = pd.Series(range(len(df)), index=df['title'])
# Series.drop_duplicates() compares VALUES, and the row positions are already
# unique, so it never removed a repeated title. De-duplicate on the index
# instead, keeping the first occurrence, so a title lookup returns one integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(movie_title, n=10):
    """Return the titles of the ``n`` movies most similar to ``movie_title``.

    Similarity is the cosine similarity between the query movie's row of
    ``tfidf_matrix`` and every row of the matrix.

    Parameters
    ----------
    movie_title : str
        Title exactly as it appears in ``df['title']``, e.g. "Heat (1995)".
    n : int, default 10
        Number of recommendations to return. Values below zero are treated
        as zero.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles ordered from most to least similar. The query
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``movie_title`` is not present in the ``indices`` lookup.
    """
    if movie_title not in indices.index:
        raise KeyError(f"Movie title not found: {movie_title!r}")

    idx = indices[movie_title]
    # A title that occurs more than once in the lookup comes back as a Series
    # of positions; use the first so exactly one matrix row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Compute similarity ONLY for one movie
    sim_scores = cosine_similarity(
        tfidf_matrix[idx],
        tfidf_matrix
    ).flatten()

    ranked = sim_scores.argsort()[::-1]
    # Remove the query row explicitly. Skipping "the first result" is only
    # correct when the query sorts first, which is not guaranteed when other
    # rows tie with it (identical content, or an all-zero vector).
    ranked = ranked[ranked != idx]
    top_indices = ranked[:max(n, 0)]
    return df['title'].iloc[top_indices]


In [28]:
# Example query: the ten closest matches for Toy Story.
recommend("Toy Story (1995)")


3021                    Toy Story 2 (1999)
59767                   Toy Story 4 (2019)
14813                   Toy Story 3 (2010)
24064    Toy Story That Time Forgot (2014)
20497           Toy Story of Terror (2013)
4823                       Toy, The (1982)
2071         NeverEnding Story, The (1984)
44838      The Story of the Voyages (1983)
37712                Toy Reanimator (2002)
22634    Toy Story Toons: Small Fry (2011)
Name: title, dtype: object

In [29]:
# Example query: the ten closest matches for Heat (1995).
recommend("Heat (1995)")

24305         Java Heat (2013)
19925         Heat, The (2013)
28258              Heat (1986)
26060      The Big Heat (1988)
48401              Heat (1963)
30995        Black Heat (1976)
22964         Dead Heat (2002)
46552          The Heat (2006)
25877    Heat Lightning (1934)
28076    Catch the Heat (1987)
Name: title, dtype: object