# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the dataset

In [None]:
# Read the two CSV files into DataFrames. The paths are relative, so this cell
# must be run from the directory that contains the data/ folder.
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [None]:
# view first 2 rows of movies
movies.head(2)

In [None]:
# view first 2 rows of credits
credits.head(2)

In [None]:
# check how many rows and columns are available in the movies dataset
movies.shape

In [None]:
# check how many rows and columns are available in the credits dataset
credits.shape

# Merge Datasets

In [None]:
# Join credits onto movies by the movie id instead of by title. A title join is
# many-to-many whenever two films share a title, which adds extra rows that pair
# one film's metadata with another film's cast and crew.
# credits' own 'title' column is dropped first so the result keeps a single
# 'title' column (otherwise pandas would suffix them as title_x / title_y).
# NOTE(review): assumes movies has an 'id' column holding the same key as
# credits['movie_id'] — confirm against the CSV headers.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [None]:
# view first 2 rows of new movies dataset
movies.head(2)

In [None]:
# check how many rows and columns are available in the new movies dataset
movies.shape

# Preprocess the data

In [None]:
# check which columns are in the new movies dataset
movies.columns

In [None]:
# keep only the columns needed for the recommender; .copy() returns an
# independent frame, so the later in-place edits and column assignments
# do not operate on a slice of the merged table
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'vote_count']].copy()

In [None]:
# print new movie dataset
movies

In [None]:
# check number of rows and columns
movies.shape

Remove missing values

In [None]:
# check missing values
movies.isnull().sum()

In [None]:
# drop rows with missing values and renumber the index 0..n-1, so that index
# labels line up with row positions in the arrays built from this frame later
# (the count vectors and the similarity matrix are positional)
movies = movies.dropna().reset_index(drop=True)

In [None]:
# check again missing values in here
movies.isnull().sum()

In [None]:
# check number of rows and columns (without missing values)
movies.shape

Remove duplicate values

In [None]:
# check duplicated movies in here
movies.duplicated().sum()

Convert Genres

In [None]:
# get first genres
movies.iloc[0]['genres']

In [None]:
# check type of genres
type(movies.iloc[0]['genres'])

In [None]:
import ast # it can convert string to list

# create convert function
def convert(text):
    """Return the 'name' of every entry in a stringified list of dicts.

    `text` is parsed with ast.literal_eval, so it must be a valid Python
    literal; each parsed entry must have a 'name' key.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


In [None]:
# apply the function of genres
movies['genres'] = movies['genres'].apply(convert)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Keywords

In [None]:
# get first keywords
movies.iloc[0]['keywords']

In [None]:
# apply the function of keywords
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Cast

In [None]:
# get first cast
movies.iloc[0]['cast']

In [None]:
# create convert_cast function
def convert_cast(text, limit=3):
    """Return the names of the first `limit` entries of a stringified cast list.

    text:  string holding a Python list literal of dicts, each with a 'name' key.
    limit: maximum number of names to return (default 3, the original behaviour).

    Raises ValueError if `limit` is negative.
    """
    if limit < 0:
        raise ValueError("limit must be non-negative")
    # slice before reading names so entries past the limit are never visited
    return [member['name'] for member in ast.literal_eval(text)[:limit]]

In [None]:
# apply the function of cast
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Crew

In [None]:
# get first crew
movies.iloc[0]['crew']

In [None]:
# create fetch_directory function
def fetch_directory(text):
    """Return a one-element list with the first director's name, or [] if none.

    `text` is a stringified list of crew dicts; the first entry whose 'job'
    equals 'Director' supplies the name.
    """
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            # only the first director is kept
            return [member['name']]
    return []

In [None]:
# apply the function of crew
movies['crew'] = movies['crew'].apply(fetch_directory)

In [None]:
# view first 2 rows of movies
movies.head(2)

Data splitting

In [None]:
# get first overview
movies.iloc[0]['overview']

In [None]:
#split data from the 'overview' column
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
# view first 2 rows of movies
movies.head(2)

In [None]:
# get first overview
movies.iloc[0]['overview']

In [None]:
# view first few rows
movies.head()

Remove Spaces 

In [None]:
# create remove_space function
def remove_space(word):
    """Return a new list with every space character removed from each string in `word`."""
    return [item.replace(" ", "") for item in word]

In [None]:
# call remove_space function
# strip spaces inside every entry of the list-valued columns
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(remove_space)

In [None]:
# view first few rows
movies.head()

In [None]:
#Concatenate the columns & create a new column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# view first few rows
movies.head()

In [None]:
movies.iloc[0]['tags']

In [None]:
# create a new data frame with only the columns the recommender needs;
# .copy() makes it independent of `movies`, so assigning to new_df['tags']
# afterwards does not trigger pandas' SettingWithCopyWarning
new_df = movies[['movie_id', 'title', 'tags', 'vote_count']].copy()

In [None]:
# view first few rows of new data frame
new_df.head()

In [None]:
# convert the tags column to string format
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(map(str, x)) if isinstance(x, list) else str(x))

In [None]:
# view first few rows
new_df.head()

In [None]:
# access the tags column in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# convert the tags column to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [None]:
# view first few rows
new_df.head()

In [None]:
# again access the tags column in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# view first few rows of new_df
new_df.head()

# Generating Embeddings

In [None]:
# import libraries 
import nltk
from nltk.stem import PorterStemmer

In [None]:
# create an instance of the Porter Stemmer(tool of stemming words)
ps = PorterStemmer()

In [None]:
# create stem function
def stem(text):
    """Stem every whitespace-separated token of `text` and rejoin them with single spaces.

    Relies on the module-level Porter stemmer instance `ps`.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# applying stemming function
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
# access the tags column in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# import CountVectorizer (bag-of-words token counts)
from sklearn.feature_extraction.text import CountVectorizer
# cap the vocabulary at 5000 features and drop scikit-learn's built-in English stop words
cv = CountVectorizer(max_features=5000, stop_words='english')  # create instance

In [None]:
# convert 'tags' into a numerical vector
vector = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# call the vector
vector

In [None]:
# representing the number of rows and columns in the array
vector.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# calculate the cosine similarity between vectors
similary = cosine_similarity(vector)

In [None]:
similary

In [None]:
# returns a tuple of (number of rows, number of columns)
similary.shape

In [None]:
# retrieve the index label of the first row whose title is 'Spider-Man'
new_df[new_df['title'] == 'Spider-Man'].index[0]