# Movie Recommendation Model

In [1]:
# import dependencies
import json
import os

import pandas as pd

In [2]:
# Load the two TMDB CSV files from the working directory.
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

# Join on the movie id instead of the title. Film titles are not guaranteed to
# be unique, and joining on a non-unique key pairs every credits row with every
# same-titled movies row, silently duplicating films.
# Assumes the movies file exposes the id as an `id` column matching
# credits.movie_id -- TODO confirm against the CSV header.
# `title` is dropped from the right-hand side so the result keeps a single
# `title` column rather than title_x / title_y.
# validate='one_to_one' makes pandas raise if either key turns out to repeat.
df = credits.merge(
    movies.drop(columns='title'),
    left_on='movie_id',
    right_on='id',
    validate='one_to_one',
)

In [3]:
# Remove unwanted columns, keeping only those used to build the tags.
# .copy() makes the selection an independent frame, so the column assignments
# in the following cells cannot trigger pandas' SettingWithCopyWarning.
df = df[['movie_id','title','overview','genres','keywords','cast','crew']].copy()
df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
# Function that maps through the json string and fetches the desired parameters
def myFunc(obj, parameter):
    """Pull one field out of every entry of a JSON-encoded list of objects.

    `obj` is a JSON string that decodes to a sequence of mappings. For each
    entry, the value stored under `parameter` is converted with `str` and its
    spaces are replaced by underscores, so a multi-word value stays one token.
    Returns the converted values as a list, in input order.
    """
    return [
        str(entry[parameter]).replace(' ', '_')
        for entry in json.loads(obj)
    ]

In [5]:
# Fetching names of cast, genre and keywords
# Each column holds a JSON string; swap it for the list of `name` values.
# `limit` caps how many names are kept: 5 for cast, None (no cap) otherwise.
for column, limit in (('cast', 5), ('genres', None), ('keywords', None)):
    df[column] = df[column].apply(
        lambda raw, limit=limit: myFunc(raw, 'name')[:limit]
    )

In [6]:
# fetch names of writers and directors
def getCrew(obj):
    """Collect the names of directors and writers from a JSON crew list.

    `obj` is a JSON string decoding to a sequence of mappings that each carry
    a 'job' and a 'name'. Entries whose job is exactly 'Director' or 'Writer'
    contribute their name (spaces replaced by underscores); all other jobs are
    ignored. Returns ``[directors, writers]``, each a list in input order.
    """
    directors, writers = [], []
    buckets = {'Director': directors, 'Writer': writers}
    for member in json.loads(obj):
        bucket = buckets.get(member['job'])
        if bucket is not None:
            bucket.append(str(member['name']).replace(' ', '_'))
    return [directors, writers]

In [7]:
# Parse each crew JSON string into [directors, writers].
crew = df['crew'].apply(getCrew)

# Combine directors and writers into one list per film, removing repeats (the
# same name can appear under both jobs). dict.fromkeys keeps first-seen order;
# the previous set() round-trip produced an order that varies between
# interpreter runs because string hashing is randomised.
df['Director'] = crew.apply(lambda names: list(dict.fromkeys(names[0] + names[1])))

# The raw crew JSON is no longer needed. Reassign rather than mutate in place.
df = df.drop(columns=['crew'])

In [8]:
def _split_overview(text):
    """Split an overview into whitespace-separated tokens.

    A missing value (NaN/None) yields an empty list; passing it through str()
    would otherwise produce the spurious token 'nan'. A value that is already
    a list is returned unchanged so re-running the cell does not corrupt it.
    """
    if isinstance(text, list):
        return text
    if pd.isna(text):
        return []
    return str(text).split()


df['overview'] = df['overview'].apply(_split_overview)

In [9]:
df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,Director
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science_Fiction]","[culture_clash, future, space_war, space_colon...","[Sam_Worthington, Zoe_Saldana, Sigourney_Weave...",[James_Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug_abuse, exotic_island, east_india_...","[Johnny_Depp, Orlando_Bloom, Keira_Knightley, ...",[Gore_Verbinski]


In [10]:
# Row-wise list concatenation: each film's tag list is its overview tokens
# followed by its genres, keywords, cast names and the 'Director' column.
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['Director']

In [11]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [12]:
# Stemming: pass every token of each tag list through the Porter stemmer.
df['tags'] = df['tags'].apply(lambda x: list(map(ps.stem, x)))

In [13]:
# Collapse each token list into a single space-separated, lower-cased string.
df['tags'] = df['tags'].apply(lambda x: ' '.join(x).lower())

In [14]:
# Drop the per-feature columns now that their content lives in `tags`.
new_df = df.drop(columns=['overview','genres','keywords', 'cast', 'Director'])
new_df.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."


In [15]:
# Save the output. Create the target directory first so the cell also works
# when output/ does not exist yet (to_csv does not create missing folders).
os.makedirs('output', exist_ok=True)
new_df.to_csv('output/movies_dataframe.csv')

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser capped at 5000 vocabulary terms, using
# scikit-learn's built-in English stop-word list.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [17]:
# Learn the vocabulary from the tag strings and build the term-count matrix;
# .toarray() turns the sparse result into a dense array.
# NOTE(review): cosine_similarity appears to accept sparse input as well, so
# the dense conversion may be avoidable -- confirm before changing.
vector = cv.fit_transform(new_df['tags']).toarray()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Pairwise cosine similarity between every pair of rows in `vector`.
similarity = cosine_similarity(vector)

In [20]:
def getMostRelated(obj, top_n=30, exclude=None):
    """Return the positions of the highest-scoring entries of `obj`.

    Parameters
    ----------
    obj : sequence of numbers
        Scores of one item against every item.
    top_n : int, default 30
        Maximum number of positions to return.
    exclude : int or None, default None
        Position of the item the scores belong to. When given, exactly that
        position is left out. When None, the top-ranked entry is dropped
        instead, which assumes the item ranks first against itself. That
        assumption breaks when an earlier position has an equal score or when
        every score is zero, so pass `exclude` whenever the position is known.

    Returns
    -------
    list of int
        Positions ordered by descending score. Equal scores keep ascending
        position order because the sort is stable.
    """
    ranked = sorted(enumerate(obj), key=lambda pair: pair[1], reverse=True)
    if exclude is None:
        ranked = ranked[1:]
    else:
        ranked = [pair for pair in ranked if pair[0] != exclude]
    return [position for position, _ in ranked[:top_n]]

In [21]:
# Replace the score matrix with, per film, the ranked positions of its nearest
# neighbours. `similarity` is rebound from a matrix to a list of lists, so this
# cell cannot be run twice without first re-running the cosine_similarity cell.
similarity = [getMostRelated(scores) for scores in similarity.tolist()]

In [22]:
# recommendation function
def recommend(movie, n=6):
    """Print the titles of the films ranked closest to `movie`.

    Reads the notebook-level `new_df` (for titles) and `similarity` (the
    per-film lists of neighbour positions).

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df['title']``. If several rows share
        the title, the first one is used.
    n : int, default 6
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of `new_df` carries that title.
    """
    # Use row positions throughout: `similarity` is a plain list and titles are
    # fetched with .iloc, both positional. An index label only coincides with
    # the position while new_df keeps a default 0..n-1 index.
    matches = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f'movie {movie!r} not found in new_df')
    neighbours = similarity[matches[0]]
    print(f'{"Recommendations":.^41}')
    for position in neighbours[:n]:
        print(new_df['title'].iloc[position])

## Recommendations

In [23]:
recommend('8 Mile')

.............Recommendations.............
Hustle & Flow
Tupac: Resurrection
The R.M.
You Got Served
Gran Torino
We Are Your Friends


### Storing data

In [24]:
import joblib

In [25]:
# Persist the per-film neighbour lists to disk.
joblib.dump(similarity, 'output/similarity')

['output/similarity']