In [1]:
import pandas as pd

In [2]:
# Load the movie metadata into a DataFrame.
# NOTE(review): '/content/...' looks like an environment-specific absolute path
# (presumably Colab) — adjust it when running the notebook elsewhere.
data = pd.read_csv('/content/movies.csv')

In [3]:
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,movie_id,title,overview,release_date,genres,tmdb_rating,imdb_rating,box_office,awards,runtime,created_at
0,16250,Love Comes Softly,Nineteen year old pioneer woman Marty has rece...,2003-04-13,"TV Movie,Western,Romance,Drama",6.7,7.3,,3 wins,88 min,2024-11-19 22:39:30.262276+00
1,1103825,War of the Worlds: The Attack,Three young astronomers fight to survive a dea...,2023-04-21,"Science Fiction,Mystery,Thriller",5.1,3.2,,1 nomination,85 min,2024-11-19 22:39:31.814267+00
2,39106,Dragon Ball Z: Bio-Broly,"Jaga Bada, Mr. Satan's old sparring partner, h...",1994-07-09,"Animation,Action,Science Fiction",5.741,5.8,,,47 min,2024-11-19 22:39:36.201249+00
3,1051896,Arcadian,"In the near future, on a decimated Earth, Paul...",2024-04-12,"Action,Horror,Thriller,Science Fiction",6.1,5.5,"$828,919",1 nomination,92 min,2024-11-19 07:05:45.184981+00
4,1124641,Classified,Operating alone in the field for more than 20 ...,2024-09-19,"Action,Thriller",5.6,3.8,,,105 min,2024-11-19 07:05:45.08505+00


# **Data Preprocessing**

In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      1450 non-null   int64  
 1   title         1450 non-null   object 
 2   overview      1441 non-null   object 
 3   release_date  1450 non-null   object 
 4   genres        1445 non-null   object 
 5   tmdb_rating   1450 non-null   float64
 6   imdb_rating   1270 non-null   float64
 7   box_office    885 non-null    object 
 8   awards        1029 non-null   object 
 9   runtime       1313 non-null   object 
 10  created_at    1450 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 124.7+ KB


In [5]:
# Keep only the text columns used further down (title, overview, genres);
# every other column is discarded.
data = data.drop(
    ['movie_id', 'release_date', 'tmdb_rating', 'imdb_rating',
     'box_office', 'awards', 'runtime', 'created_at'],
    axis=1,
)

In [6]:
# Count the missing entries in each remaining column.
data.isna().sum()

Unnamed: 0,0
title,0
overview,9
genres,5


In [7]:
# For columns that have some, but fewer than 10 %, missing values, drop the
# rows where those columns are missing. Note: cols_to_drop names the columns
# that are checked for nulls — no column is removed here.
cols_to_drop = data.columns[data.isna().mean().between(0, 0.1, inclusive='neither')]
data = data.dropna(subset=cols_to_drop)

In [8]:
# Confirm that no missing values remain after the row drop.
print(data.isnull().sum())

title       0
overview    0
genres      0
dtype: int64


In [9]:
import re
import nltk

def clean_text(text):
    """Normalise free text before it is vectorised.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed (this covers punctuation and digits), and runs of
    whitespace - including newlines and the gaps left behind by removed
    tokens - are collapsed to a single space.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. a NaN float for a missing value)
        is treated as having no text.

    Returns
    -------
    str
        The cleaned text without leading or trailing whitespace; an empty
        string for non-string input.
    """
    # Missing values have no text to clean; avoid AttributeError on .lower()
    if not isinstance(text, str):
        return ''

    # convert into lowercase
    text = text.lower()

    # remove punctuation, digits and any other non-letter character; digits are
    # already covered by this pattern, so no separate digit pass is needed.
    # Characters are deleted (not replaced by a space), so "satan's" -> "satans".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # collapse newlines and repeated whitespace into single spaces, then trim
    # (removing a standalone token such as "20" would otherwise leave "  ")
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [10]:
# Overwrite the overview column with its cleaned version (element-wise).
data['overview'] = data['overview'].map(clean_text)

In [11]:
# Inspect the cleaned overviews.
data['overview']

Unnamed: 0,overview
0,nineteen year old pioneer woman marty has rece...
1,three young astronomers fight to survive a dea...
2,jaga bada mr satans old sparring partner has i...
3,in the near future on a decimated earth paul a...
4,operating alone in the field for more than ye...
...,...
1445,after the second impact tokyo is being attacke...
1446,a mysterious woman recruits bank teller ludwig...
1447,a young beautiful career woman rents a backwoo...
1448,the herdman kids are undeniably the worst kids...


In [12]:
def clean_genres(text):
    """Turn a comma-separated genre string into space-separated tokens.

    Each genre is lower-cased and has its inner spaces removed so that a
    multi-word genre becomes a single token, e.g.
    "Science Fiction,Action" -> "sciencefiction action".
    """
    genre_tokens = (genre.replace(" ", "") for genre in text.lower().split(","))
    return " ".join(genre_tokens)

In [13]:
# Overwrite the genres column with its tokenised version (element-wise).
data['genres'] = data['genres'].map(clean_genres)

In [14]:
# Inspect the cleaned genre tokens.
data['genres']

Unnamed: 0,genres
0,tvmovie western romance drama
1,sciencefiction mystery thriller
2,animation action sciencefiction
3,action horror thriller sciencefiction
4,action thriller
...,...
1445,animation sciencefiction action drama
1446,action crime comedy
1447,horror thriller
1448,comedy drama family


# **Feature Extraction**

In [15]:
#combine overview and genre to create a new feature
def create_mix(x):
    """Join a row's overview and genres into one whitespace-trimmed string.

    `x` is any mapping-like row that can be indexed with 'overview' and
    'genres' (here: a DataFrame row passed by `DataFrame.apply(axis=1)`).
    """
    combined = "{} {}".format(x['overview'], x['genres'])
    return combined.strip()

# Build the 'mix' column row by row (axis=1 passes each row to create_mix).
data['mix'] = data.apply(create_mix, axis=1)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser that ignores scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

In [17]:
# Fit the vectoriser on the combined text and convert every row of 'mix'
# into a TF-IDF row vector
tfidf_matrix = vectorizer.fit_transform(data['mix'])

# Shape is (number of documents, vocabulary size)
print(tfidf_matrix.shape)

(1438, 10750)


# **Content Based Recommenders**

In [18]:
#import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix. A plain dot product (linear_kernel) is
# sufficient because TfidfVectorizer L2-normalises each row by default
# (norm='l2'), so the dot product of two rows already equals their cosine.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
# Rows were dropped during preprocessing, so the index has gaps. Renumber it
# 0..n-1 so that a row's label equals its position in tfidf_matrix/cosine_sim.
# drop=True discards the old index instead of inserting it as an 'index'
# column, which also keeps this cell safe to re-run.
data = data.reset_index(drop=True)

# Reverse lookup: movie title -> row position. When several rows share a title,
# keep only the first so that indices[title] always yields a single integer.
indices = pd.Series(data.index, index=data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [25]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Relies on two module-level objects: `indices` (Series mapping title ->
    row position) and `data` (DataFrame with a 'title' column whose row order
    matches `cosine_sim`).

    Parameters
    ----------
    title : str
        Title of the query movie; must be present in `indices`.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row i holds the scores of movie i
        against every movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the `top_n` most similar movies, best match first. The
        query movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    # Get the index of the movie that matches the title
    idx = indices[title]
    # Several rows with the same title make the lookup return a Series
    # instead of a scalar; fall back to the first match.
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all *other* movies with that movie.
    # The query row is excluded explicitly rather than assuming it sorts first:
    # with tied scores (duplicate entries, all-zero vectors) it may not.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort by similarity, best first (stable, so ties keep row order),
    # and keep the top_n entries
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Get the movie indices
    movie_indices = [i for i, _ in sim_scores]

    # Return the titles of the most similar movies
    return data['title'].iloc[movie_indices]

In [26]:
# Example query: recommendations for 'Despicable Me 4'.
get_recommendations('Despicable Me 4', cosine_sim)

Unnamed: 0,title
529,Despicable Me 2
775,Despicable Me 3
244,Minions: The Rise of Gru
254,Despicable Me
95,Migration
172,Original Sin
444,No Time to Spy: A Loud House Movie
91,Focus
773,Alice Through the Looking Glass
1028,No Manches Frida 2: Paradise Lost


In [27]:
# Example query: recommendations for 'Ponyo'.
get_recommendations('Ponyo', cosine_sim)

Unnamed: 0,title
604,Pinocchio
253,The Little Mermaid
1378,The Secret World of Arrietty
79,The Predator
908,The NeverEnding Story
573,"Run, Tiger Run!"
225,Elevation
525,Shark Tale
463,The Good Dinosaur
1177,Castle in the Sky
