# 1. Movie Recommender System – Data Preparation
This section prepares the dataset by importing, merging, cleaning, and selecting relevant features.


In [None]:
# Import the required libraries

import numpy as np 
import pandas as pd

#  Load Datasets
We use TMDB datasets containing movie metadata and credits.


In [None]:
# Read the two TMDB CSV files (movie metadata and credits) from relative paths
movies_raw = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge Datasets
Merge movies and credits on the 'title' column to unify metadata.


In [None]:
# Join the metadata and credits frames on their shared 'title' column (inner join by default).
# NOTE(review): if titles are not unique in either file, this join produces extra
# cross-matched rows; joining on the movie id would be safer — confirm against the data.
movies = movies_raw.merge(credits, on = 'title')

In [None]:
# Preview the first row of the merged frame to check the combined columns
movies.head(1)

# Select Relevant Features
Keep only the columns required for building the recommender system.


In [None]:
# Keep only the columns used to build the recommender.
# .copy() makes `movies` an independent frame, so the later in-place dropna and
# column assignments do not act on a derived slice (avoids SettingWithCopyWarning).
movies = movies[['movie_id','title','genres','keywords','overview','cast','crew']].copy()

# Handle Missing Values and duplicate values
Drop rows with missing values to ensure clean data and remove duplicate values 


In [None]:
# Remove rows that contain any missing value.
# Reassigning instead of inplace=True avoids mutating a frame derived from a
# column selection, which pandas flags with SettingWithCopyWarning.
movies = movies.dropna()

In [None]:
# Count the missing values remaining in each column
movies.isnull().sum()

In [None]:
# Remove duplicate rows.
# drop_duplicates() returns a new frame; without assigning it back the
# duplicates would stay in `movies`.
movies = movies.drop_duplicates()

In [None]:
# Count the rows that exactly duplicate an earlier row
movies.duplicated().sum()

# 2.Text preprocessing
In this step we convert the JSON-like strings into lists of names.

In [None]:
import ast 

In [None]:
# Helper that converts a JSON-like string into a list of names using ast
def convert(obj):
    """Extract the ``name`` value of every entry in a stringified list of dicts.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            ``'name'`` key.

    Returns:
        List of the ``'name'`` values, in their original order.
    """
    entries = ast.literal_eval(obj)
    names = []
    for entry in entries:
        names.append(entry['name'])
    return names

In [None]:
# Turn the stringified genre and keyword lists into plain lists of names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [None]:
# Helper for the cast column: keep only the first few names, because using the whole cast list would skew the recommendations
def convert_cast(obj, limit=4):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            ``'name'`` key.
        limit: Maximum number of leading entries to keep. Defaults to 4, the
            previously hard-coded value; ``None`` keeps every entry.

    Returns:
        List of at most ``limit`` names, in their original order.
    """
    leading_entries = ast.literal_eval(obj)[:limit]
    return [entry['name'] for entry in leading_entries]
        

In [None]:
# Replace each stringified cast list with the names returned by convert_cast
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# Function to fetch the name of the director of the movie
def fetch_director(obj):
    """Collect the names of all crew entries whose job is ``'Director'``.

    Args:
        obj: String holding a Python-literal list of dicts, each with
            ``'name'`` and ``'job'`` keys.

    Returns:
        List of director names in their original order; empty if no entry
        has the job ``'Director'``.
    """
    directors = []
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            directors.append(member['name'])
    return directors
            

In [None]:
# Replace each stringified crew list with the list of director names
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
# Split each overview string into a list of words so it can be concatenated
# with the other list-valued columns.
movies['overview'] = movies['overview'].apply(str.split)

In [None]:
# Preview the first rows after the list conversions
movies.head()

In [None]:
# Remove the spaces inside every name so that a multi-word entity (a person,
# genre or keyword) becomes a single token and is not split into unrelated
# words, which would mislead the recommendations.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
# Inspect the first row to check the converted data
movies.head(1)

In [None]:
# Concatenate the five list-valued columns (genres, keywords, cast, crew, overview)
# into a single 'tags' column, so similarity is computed on one combined field.
movies['tags'] = movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['overview']

In [None]:
# Display the combined tags column
movies['tags']

In [None]:
# Keep only the identifier, the title and the combined tags.
# .copy() makes the result an independent frame, so the later assignment to
# movies['tags'] does not act on a derived slice (avoids SettingWithCopyWarning).
movies = movies[['movie_id','title','tags']].copy()

In [None]:
# Display the reduced frame (movie_id, title, tags)
movies

# Text Normalization

In [None]:
# Join each tags list into one space-separated string and lowercase it so it can be vectorized
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x).lower())

In [None]:
# Preview the first row to check the joined, lowercased tags
movies.head(1)

# 3.Vectorization
Here we convert the text into numerical vectors using TF-IDF.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 terms and filter scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

# Learn the vocabulary from the tag strings and build the TF-IDF matrix.
# NOTE(review): this fit runs before the stemming cell further down, so `tfidf`
# is built from the unstemmed tags — confirm that this ordering is intended.
tfidf = vectorizer.fit_transform(movies["tags"])


In [None]:
# Display the TF-IDF matrix returned by fit_transform
tfidf

In [None]:
# List the vocabulary terms learned by the vectorizer
vectorizer.get_feature_names_out()

# Stemming 

In [None]:
import nltk

In [None]:
# The vocabulary contains inflected variants of the same word (e.g. 'action' and
# 'actions') that are treated as different terms, which makes the data longer than needed.
# Stemming with NLTK's Porter stemmer reduces such variants to a common base form.

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
#function for stemming
def stemming(text):
    """Stem every whitespace-separated token of ``text``.

    Args:
        text: String whose tokens are separated by whitespace.

    Returns:
        The stemmed tokens joined back together with single spaces.
    """
    stemmed_tokens = [ps.stem(token) for token in text.split()]
    return " ".join(stemmed_tokens)
        

In [None]:
# Replace each tag string with its stemmed form.
# NOTE(review): `tfidf` was already computed in an earlier cell from the unstemmed
# tags and is not refitted here, so the pickled matrix does not reflect this
# stemming — confirm whether the vectorizer should be fitted after this step.
movies['tags'] = movies['tags'].apply(stemming)

# Dumping Files For UI On Streamlit

In [None]:
import pickle

In [None]:
# Persist the processed movie table for the Streamlit UI.
# The context manager guarantees the file is flushed and closed after the dump.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)

In [None]:
# Persist the TF-IDF matrix for the Streamlit UI.
# The context manager guarantees the file is flushed and closed after the dump.
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Persist the fitted vectorizer for the Streamlit UI.
# The context manager guarantees the file is flushed and closed after the dump.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Display the final frame
movies