In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Read movie data from CSV files

movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

### Merge movie and credit data on the movie title

In [None]:
movies =movies.merge(credits,on ='title')

# Select relevant columns

movies= movies[['movie_id','title','overview','keywords','genres','cast','crew']]
# Remove rows with missing values
movies = movies.dropna()

### Convert string representation of lists to actual lists

In [4]:
import ast

def convert(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
        
    return l

### Apply conversion to 'genres', 'keywords', and 'cast' columns

In [5]:
movies['genres'] =movies['genres'].apply(convert)
movies['keywords'] =movies['keywords'].apply(convert)
movies['cast'] =movies['cast'].apply(convert)

### Function to fetch the director from the crew list

In [6]:
def fetch_director(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job']=='Director':
            l.append(i['name'])
            break
        
    return l

### Apply the function to the 'crew' column

In [7]:
movies['crew'] =movies['crew'].apply(fetch_director)

### Split the 'overview' column into a list of words

In [None]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

### Function to remove spaces from words in a list

In [8]:
def remove_spc(word):
    l=[]
    for i in word:
        l.append(i.replace(" ",""))
    return l

### Remove spaces from 'cast', 'crew', 'keywords', and 'genres' columns

In [9]:
movies['cast']=movies['cast'].apply(remove_spc)
movies['crew']=movies['crew'].apply(remove_spc)
movies['keywords']=movies['keywords'].apply(remove_spc)
movies['genres']=movies['genres'].apply(remove_spc)

### Combine 'overview', 'keywords', 'genres', 'cast', and 'crew' columns into 'tags' column

In [10]:
movies['tags']= movies['overview'] + movies['keywords'] + movies['genres'] + movies['cast']+ movies['crew']

# Select relevant columns for the new DataFrame
new_df = movies[['movie_id','title','tags']]

### Function to lemmatize text

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(word) for word in text]

### Apply lemmatization to the 'tags' column

In [12]:
movies['tags'] = movies['tags'].apply(lemmatize_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


### Convert words in 'tags' back to a single string

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

### Convert tags to lowercase

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

### Function to lemmatize text in each tag

In [13]:
def stems(text):
    l = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(l)


### Apply lemmatization to 'tags' column in the new DataFrame

In [14]:
new_df['tags'] = new_df['tags'].apply(stems)
new_df.iloc[0]['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following order and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi joeldavidmoore cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyte scottlawrence kellykilgour jamespatrickpitt seanpatrickmurphy peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblain-rozgay joncurry lukehawker woodyschultz petermensah soniayee jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibby jodielandau julielamm cullenb.madden josephbradymadden frankietorres austinwilson sarawilson tamicawashington-miller lucybriant nathanme

### Get TF-IDF vectors for the 'tags' column

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(new_df['tags']).toarray()

### Calculate cosine similarity

In [None]:
similarity_tfidf = cosine_similarity(tfidf_matrix)

In [16]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity_tfidf[index])),reverse=True,key = lambda x:x[1])
    for i in distances[1:10]:
        print(new_df.iloc[i[0]].title)

In [17]:
recommend('Batman')

Batman
Batman & Robin
Batman
Batman Returns
The Dark Knight Rises
Batman Forever
Batman Begins
Batman v Superman: Dawn of Justice
The Dark Knight
