## Content Based Recommender

### 1. Importing Modules

In [1]:
import pandas as pd
import numpy as np

### 2. Loading Dataset

In [5]:
movies = pd.read_csv('data/movies.csv')
credits = pd.read_csv('data/credits.csv')

In [7]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [9]:
movies.shape

(4803, 20)

In [8]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
credits.shape

(4803, 4)

### 3. Working Dataframe

In [11]:
df = movies.merge(credits, on='title')

In [12]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [13]:
df.shape

(4809, 23)

In [14]:
# genres id keywords overview title cast crew

df = df[['genres','id','keywords','overview','title','cast','crew']]

In [16]:
df.head()

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### 4. Preprocessing Steps

In [17]:
df.isnull().sum()

genres      0
id          0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

In [18]:
df.dropna(inplace=True)

In [20]:
df.duplicated().sum()

0

In [23]:
import ast

def convert(obj):
    L = []
    # obj are in string format
    for i in ast.literal_eval(obj):
        L.append(i['name'])
        
    return L

In [26]:
df['genres'] = df['genres'].apply(convert)

In [29]:
df['keywords'] = df['keywords'].apply(convert)

In [45]:
def convert3(obj):
    L = []
    counter = 0
    # obj are in string format
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else :
            break
        
    return L

In [48]:
df['cast'] = df['cast'].apply(convert3)

In [50]:
def fetch_director(obj):
    L = []
    # obj are in string format
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
        
    return L

In [52]:
df['crew'] = df['crew'].apply(fetch_director)

In [53]:
df['overview'] = df['overview'].apply(lambda x:x.split())

In [55]:
# Removing Spaces between Words

df['genres'] = df['genres'].apply(lambda x: [i.replace(" ","") for i in x])
df['keywords'] = df['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
df['cast'] = df['cast'].apply(lambda x: [i.replace(" ","") for i in x])
df['crew'] = df['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [58]:
df.head(1)

Unnamed: 0,genres,id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [59]:
df['tags'] = df['crew'] + df['cast'] + df['genres'] + df['keywords'] + df['overview']

In [62]:
new_df = df[['id','title','tags']]

In [69]:
new_df['tags'].iloc[0]

['JamesCameron',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [73]:
# Saving our new dataframe
new_df.to_csv('data/new.csv', index=False)

In [2]:
new_df = pd.read_csv('data/new.csv')

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_tags(text):
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Stem the tokens
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]

    # Join the stemmed tokens into a string
    stemmed_text = ' '.join(stemmed_tokens)

    return stemmed_text


In [6]:
new_df['tags'] = new_df['tags'].apply(preprocess_tags)

### 5. Vectorization of Tags

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [8]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [9]:
vectors.shape

(4806, 5000)

In [10]:
for x in cv.get_feature_names_out():
    print(x)

007
10
100
10yearold
11
12
12yearold
13
14
15
16
16yearold
17
17yearold
18
18th
18thcenturi
1910
1920
1930
1940
1950
1960
1970
1974
1976
1980
1985
1990
1999
19th
19thcenturi
20
2000
2003
2009
20th
24
25
30
300
3d
40
50
60
70
911
aaron
aaroneckhart
abandon
abbi
abduct
abigailbreslin
abil
abl
aboard
aborigin
absenc
abus
academ
academi
accept
access
accid
accident
acclaim
accompani
accomplish
account
accus
ace
achiev
acquaint
act
action
actionhero
actionpack
activ
activist
actor
actress
actual
ad
adam
billi
billionair
billmurray
billnighi
billpaxton
billpullman
billybobthornton
billycrudup
billycryst
biographi
biolog
bird
birth
birthday
bisexu
bishop
bit
bite
bitter
bizarr
black
blackandwhit
blackmag
blackmail
blackpeopl
blacksmith
blade
blame
blend
blind
bliss
blizzard
block
blond
blood
bloodi
bloodsplatt
bloodthirsti
blossom
blow
blue
bo
board
boat
bob
bobbi
bobbyfarrelli
bobhoskin
bodi
bodyguard
bold
bollywood
bomb
bond
bone
book
boot
border
bore
boredom
born
borrow
boss
boston
botch
b

In [11]:
new_df['tags'].iloc[0]

'jamescameron samworthington zoesaldana sigourneyweav action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d 22nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil'

### 6. Calculating distance 

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
similarity = cosine_similarity(vectors)

In [14]:
similarity[0]

array([1.        , 0.0860309 , 0.08471737, ..., 0.0656218 , 0.02533473,
       0.        ])

In [15]:
# 1 means closet similar movie

### 7. Movie Recommendation

In [21]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    print(movies_list)
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [29]:
recommend('Avatar')

[(2409, 0.26310068027921696), (539, 0.25038669783359574), (507, 0.24722569302909875), (1204, 0.24455799402225925), (1216, 0.2435123110112404)]
Aliens
Titan A.E.
Independence Day
Predators
Aliens vs Predator: Requiem


In [25]:
import pickle

In [28]:
pickle.dump(new_df.to_dict(),open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [30]:
type(similarity)

numpy.ndarray