# Project Objective
This notebook builds a **content-based movie recommendation system** from a TMDB movie dataset, using each movie's genres and plot overview as its descriptive text.

In [None]:
import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the movie CSV. NOTE(review): '/content/' looks like a Colab
# upload directory -- adjust this single constant when running elsewhere.
DATA_PATH = '/content/dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10000, 9)

In [None]:
# Peek at the first five rows to see what each column holds.
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [None]:
# Column dtypes and non-null counts; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,3
original_language,0
overview,13
popularity,0
release_date,0
vote_average,0
vote_count,0


In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

0

## 📊 Tag WordCloud
Let's visualize the most frequent terms used in movie tags.

In [None]:
from wordcloud import WordCloud

# The `tags` column is only created in a later cell, so reading df['tags']
# here raises KeyError on a fresh kernel (Restart & Run All). Build the text
# directly from the source columns instead. fillna('') is needed because a few
# genres/overviews are missing and str.join cannot handle NaN.
tag_text = df['genre'].fillna('') + ' ' + df['overview'].fillna('')
text = ' '.join(tag_text)
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('WordCloud of Tags')
plt.show()

In [None]:
# Combine genres and overview into one text field per movie.
# - fillna(''): 3 genres and 13 overviews are missing, and NaN + str would make
#   the whole tag NaN for those movies.
# - ' ' separator: without it the last genre fuses with the first overview word
#   (e.g. "CrimeFramed"), which creates junk tokens and loses the genre.
df['tags'] = (df['genre'].fillna('') + ' ' + df['overview'].fillna('')).str.strip()
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,tags
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,"Drama,CrimeIn the continuing saga of the Corle..."


In [None]:
# Numeric/date metadata is not used for content similarity, and the raw
# genre/overview text is already combined into `tags`.
unused_cols = ['popularity', 'release_date', 'vote_average', 'vote_count', 'genre', 'overview']
df_c = df.drop(unused_cols, axis=1)

In [None]:
# Language is not part of the similarity text either.
# errors='ignore' keeps this self-referential cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
df_c = df_c.drop(columns='original_language', errors='ignore')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer: the vocabulary is capped at the most frequent terms
# and scikit-learn's built-in English stop words are removed.
MAX_FEATURES = 5000
cv = CountVectorizer(stop_words='english', max_features=MAX_FEATURES)

In [None]:
# Display the configured vectorizer.
cv

In [None]:
# Learn the vocabulary from the tags and turn each movie into a term-count row.
# fillna('') comes first because astype('U') would otherwise convert a missing
# tag into the literal text 'nan', which the vectorizer would count as a word.
# NOTE: .toarray() makes the matrix dense (rows x up to 5000 columns).
vec = cv.fit_transform(df_c['tags'].fillna('').values.astype('U')).toarray()

`astype('U')`: converts the `tags` values to NumPy's Unicode string dtype so that every entry handed to `CountVectorizer` is a string. The vectorizer raises an error on non-string entries such as `NaN`, which is why this conversion is commonly added when a text column may contain missing values. Be aware that the conversion turns a missing value into the literal text `'nan'` rather than an empty string. (It is a data-type conversion, not a change of character encoding.)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vec`:
# cos_sim[i][j] is the similarity between movie i and movie j.
cos_sim = cosine_similarity(vec)

In [None]:
# Scratch check: (row position, similarity) pairs for the first movie,
# ordered from most to least similar.
dist = sorted(enumerate(cos_sim[0]), key=lambda pair: pair[1], reverse=True)

In [None]:
def recommend(movies, top_n=5):
  """Print the titles of the movies most similar to the given one.

  Parameters
  ----------
  movies : str
      Exact title of the movie to look up in ``df_c['title']``.
  top_n : int, default 5
      How many similar titles to print.

  Returns
  -------
  None
      Results are printed, each preceded by a dashed separator line. If the
      title is not present, a short message is printed instead.
  """
  # Work with row positions, not index labels: cos_sim and .iloc are both
  # positional, so this stays correct even if df_c's index is not 0..n-1.
  matches = np.flatnonzero((df_c['title'] == movies).to_numpy())
  if matches.size == 0:
    print(f"Movie '{movies}' was not found in the dataset.")
    return
  mindex = matches[0]

  ranked = sorted(enumerate(cos_sim[mindex]), key=lambda pair: pair[1], reverse=True)
  # Drop the queried movie itself by position; it would otherwise normally be
  # returned as its own top "recommendation".
  neighbours = [pos for pos, _ in ranked if pos != mindex][:top_n]

  for pos in neighbours:
    print('-'*50)
    print(df_c['title'].iloc[pos])

"  for i in dist(0:5):\n    print(df_c['title'][dist[i][0]])"

In [None]:
# Example query: print the titles most similar to 'The Dark Knight Rises'.
recommend('The Dark Knight Rises')

--------------------------------------------------
The Dark Knight Rises
--------------------------------------------------
Batman: The Long Halloween, Part One
--------------------------------------------------
Batman: The Long Halloween, Part Two
--------------------------------------------------
Batman: Bad Blood
--------------------------------------------------
Batman: Gotham by Gaslight


## Example Recommendation Output
Let's test the recommendation function with a popular movie.

In [None]:
# Second example query; the title must match the dataset's `title` value exactly.
recommend('Avatar')

## Summary
- Built a content-based recommendation system
- Used CountVectorizer and cosine similarity
- Future work could involve hybrid models or collaborative filtering techniques