#                         Movie Recommender System

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



### Loading Datasets

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

**Merging Movies and Credits Data**

In [3]:
# Merged the 'movies' and 'credits' datasets on the 'title' column to combine movie details with cast and crew information
movies = movies.merge(credits,on='title')

**Selecting Relevant Columns**

In [4]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [5]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [6]:
movies.columns

Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew'], dtype='object')

In [7]:
# Rename the '_x' columns back to their original names
movies.rename(columns={'movie_id_x': 'movie_id', 'cast_x': 'cast', 'crew_x': 'crew'}, inplace=True)

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [8]:
movies.shape

(4809, 7)

### Data Cleaning

**1. Handling Missing Data**

In [9]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
movies = movies.dropna(subset=['overview'])

**2. Checking for Duplicates**

In [11]:
movies.duplicated().sum()

0

### Exploratory Data Analysis (EDA)

**1. Basic Information**

In [12]:
print(movies.info())  
print(movies.describe()) 

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   overview  4806 non-null   object
 3   genres    4806 non-null   object
 4   keywords  4806 non-null   object
 5   cast      4806 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB
None
            movie_id
count    4806.000000
mean    56922.559509
std     88309.447559
min         5.000000
25%      9009.750000
50%     14615.500000
75%     58476.750000
max    447027.000000


**2. Data Distribution**

In [29]:
genres_exploded = movies.explode('genres')

# Count occurrences of each genre
genres_count = genres_exploded['genres'].value_counts().reset_index()
genres_count.columns = ['Genre', 'Number of Movies']

# Plot the genre counts using Plotly
fig = px.bar(genres_count, 
             x='Genre', 
             y='Number of Movies', 
             title='Number of Movies per Genre', 
             color='Genre',  
             text='Number of Movies',  
             color_discrete_sequence=px.colors.qualitative.Set3)  

fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Number of Movies',
    template='none',  
    plot_bgcolor='white',
    paper_bgcolor='white', 
    xaxis_tickangle=-45,   
    height=600,
    width=1000,
    font=dict(size=14),    
    title_font=dict(size=20) 
)

fig.update_traces(texttemplate='%{text}', textposition='outside')

# Show the plot
fig.show()

### Data Preprocessing

 **1. Convert Stringified Columns to Lists**

In [14]:
def convert(text):
    if isinstance(text, str): 
        text = ast.literal_eval(text)
    return [i['name'] if isinstance(i, dict) else i for i in text]

# Apply conversion to 'genres' and 'keywords' columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

**2. Processing Cast (Top 3 Actors)**

In [15]:
def convert_cast(text):
    L = []
    counter = 0
    if isinstance(text, str):
        try:
            text = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return [] 

    if isinstance(text, list):
        for i in text:
            if isinstance(i, dict) and 'name' in i:
                if counter < 3:  
                    L.append(i['name'])
                    counter += 1
                else:
                    break
    return L

# Apply the function to the 'cast' column
movies['cast'] = movies['cast'].apply(convert_cast)

**3. Processing Crew (Director Only)**

In [16]:
def fetch_director(text):
    if isinstance(text, str):
        try:
            text = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return []
    
    if isinstance(text, list):
        for i in text:
            if isinstance(i, dict) and i.get('job') == 'Director':
                return [i['name']]  
    return [] 

# Apply the function to the 'crew' column
movies['crew'] = movies['crew'].apply(fetch_director)

**4. Removing Spaces from Tags**

In [17]:
def collapse(L):
    return [i.replace(" ", "") for i in L]

movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

**5. Tokenizing Overview Column**

In [18]:
def split_if_string(x):
    if isinstance(x, str):
        return x.split()
    return x 

# Apply the function to the 'overview' column
movies['overview'] = movies['overview'].apply(split_if_string)

### Creating Tags Column

In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_movies_df = movies[['movie_id','title','tags']]

### Converting Tags to Strings and Lowercase

In [21]:
new_movies_df.loc[:, 'tags'] = new_movies_df['tags'].apply(lambda x: " ".join(x))

In [22]:
new_movies_df.loc[:, 'tags'] = new_movies_df['tags'].apply(lambda x: x.lower())

### Preprocessing Text Data

In [23]:
# Download stopwords 
nltk.download('stopwords')

ps = PorterStemmer()

def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Split the text into tokens
    text = text.split()
    
    # Remove stopwords and apply stemming
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    
    # Join the words back into a single string
    return ' '.join(text)

# Apply the preprocessing function to the 'tags' column
new_movies_df.loc[:, 'tags'] = new_movies_df['tags'].apply(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### TF-IDF Vectorization

In [24]:
# Initialize TfidfVectorizer with maximum features and stop words
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Apply TF-IDF vectorization on the 'tags' column
vector = tfidf.fit_transform(new_movies_df['tags']).toarray()

### Calculating Cosine Similarity

In [25]:
similarity = cosine_similarity(vector)

### Recommendation Function

In [34]:
from IPython.display import display, HTML

def recommend(movie):
    try:
        # Find the index of the movie
        index = new_movies_df[new_movies_df['title'] == movie].index[0]  
        distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])  

        # Build the HTML content
        html = f"""
        <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background-color: #f4f4f9; padding: 20px; border-radius: 15px; box-shadow: 0 0 15px rgba(0, 0, 0, 0.1); max-width: 600px; margin: auto;">
            <h2 style="color: #2c3e50; text-align: center;">🎬 Movies Similar to '{movie}':</h2>
            <ul style="list-style-type: none; padding: 0;">
        """
        for i in distances[1:6]:  
            title = new_movies_df.iloc[i[0]].title
            html += f'''
            <li style="padding: 15px; border-bottom: 2px solid #ecf0f1; color: #34495e; font-size: 18px; transition: background-color 0.3s;">
                <strong style="color: #3498db;">{title}</strong> 
                <span style="float: right; color: #e74c3c;">Similarity: {i[1]:.2f}</span>
            </li>'''
        html += "</ul></div>"

        # Display the HTML content
        display(HTML(html))
    except IndexError:
        display(HTML(f"<div style='font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; color: #e74c3c; text-align: center;'><h2>'{movie}' not found in the dataset.</h2></div>"))

# Testing the Recommender
recommend('Avatar')

In [27]:
import joblib

# Save TF-IDF Vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Save Cosine Similarity Matrix
joblib.dump(similarity, 'cosine_similarity_matrix.pkl')

['cosine_similarity_matrix.pkl']

In [28]:
new_movies_df.to_csv('new_movies_df.csv', index=False)