<a href="https://colab.research.google.com/github/ShuyanCao/Ass2_Jianyu.Zhang_Shuyan.Cao/blob/master/ass.2/python/CB_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Datasets

In [0]:
import pandas as pd
import numpy as np
#loading the movie dataset
movies = pd.read_csv('/content/tmdb_5000_movies.csv')

In [0]:
#loading the credit dataset
credits = pd.read_csv('/content/tmdb_5000_credits.csv')
credits.columns = ['id', 'title', 'cast', 'crew']

In [0]:
#combine the 2 datasets together 
movies = movies.merge(credits, on = 'id')


# Cleaning Data and Add Features 


In [0]:
import warnings
warnings.filterwarnings('ignore')

# inlude the necessary columns from the dataset. 
data = movies[['id', 'original_title', 'genres', 'keywords', 'overview',  'cast', 'crew']]

# define the features in the datasets
from ast import literal_eval
features = ['keywords','genres', 'cast', 'crew']
for i in features:
    data[i] = movies[i].apply(literal_eval)
    
 # Extract list of genres
def list_genres(x):
    list = [d['name'] for d in x]
    return(list)
data['genres'] = data['genres'].apply(list_genres)

# Extract top 3 cast members
def list_cast(x):
    list = [d['name'] for d in x]
    if len(list) > 3:
        list = list[:3]
    return(list)
data['cast'] = data['cast'].apply(list_cast)

# Extract top 5 keywords
def list_keywords(x):
    list = [d['name'] for d in x]
    if len(list) > 5:
        list = list[:5]
    return(list)
data['keywords'] = data['keywords'].apply(list_keywords)

# Extract director
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
data['director'] = data['crew'].apply(get_director)

# Drop the column crew 
data = data.drop('crew', axis = 1)

# Clean features of the dataset
def clean_feat(x):
    if isinstance(x, list):
        return [i.lower().replace(" ","") for i in x]
    else:
        if isinstance(x, str):
            return x.lower().replace(" ", "")
        else:
            return ''
          

 #redefine the features in the data 
features = ['keywords', 'genres', 'cast', 'director']
for i in features:
    data[i] = data[i].apply(clean_feat)
    


In [0]:
data.head()

Unnamed: 0,id,original_title,genres,keywords,overview,cast,director
0,19995,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...","[samworthington, zoesaldana, sigourneyweaver]",jamescameron
1,285,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","Captain Barbossa, long believed to be dead, ha...","[johnnydepp, orlandobloom, keiraknightley]",goreverbinski
2,206647,Spectre,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6]",A cryptic message from Bond’s past sends him o...,"[danielcraig, christophwaltz, léaseydoux]",sammendes
3,49026,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretiden...",Following the death of District Attorney Harve...,"[christianbale, michaelcaine, garyoldman]",christophernolan
4,49529,John Carter,"[action, adventure, sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...","John Carter is a war-weary, former military ca...","[taylorkitsch, lynncollins, samanthamorton]",andrewstanton


In [0]:
missing = data.columns[data.isnull().any()]
data[missing].isnull().sum().to_frame()

Unnamed: 0,0
overview,3


In [0]:
# Replace NaN from overview with an empty string
data['overview'] = data['overview'].fillna('')

In [0]:

! pip install rake_nltk
from rake_nltk import Rake
# Initialize empty column
data['createtags'] = ''

# function to get keywords from a text
def get_keywords(x):
    plot = x
    
    # initialize Rake using english stopwords from NLTK, and all punctuation characters
    rake = Rake()
    
    # extract keywords from text
    rake.extract_keywords_from_text(plot)
    
    # get dictionary with keywords and scores
    scores = rake.get_word_degrees()
    
    # return new keywords as list, ignoring scores
    return(list(scores.keys()))

# Apply function to generate keywords
data['createtags'] = data['overview'].apply(get_keywords)



In [0]:

data_keys = pd.DataFrame() 

data_keys['title'] = data['original_title']
data_keys['keywords'] = ''

def  tags_package(x):
    return(' '.join(x['genres']) + ' ,' + ' '.join(x['keywords']) + ' ,' +  ' '.join(x['cast']) + 
           ' ,' + ' '.join(x['director']) + ' ,' + ' '.join(x['createtags']))
data_keys['keywords'] = data.apply(tags_package, axis = 1)

data_keys.head()

Unnamed: 0,title,keywords
0,Avatar,"action adventure fantasy sciencefiction ,cultu..."
1,Pirates of the Caribbean: At World's End,"adventure fantasy action ,ocean drugabuse exot..."
2,Spectre,"action adventure crime ,spy basedonnovel secre..."
3,The Dark Knight Rises,"action crime drama thriller ,dccomics crimefig..."
4,John Carter,"action adventure sciencefiction ,basedonnovel ..."


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# create count matrix
cv = CountVectorizer()
cv_mx = cv.fit_transform(data_keys['keywords'])


from sklearn.metrics.pairwise import cosine_similarity
# create cosine similarity matrix
cosine_sim = cosine_similarity(cv_mx, cv_mx)

print(cosine_sim)

[[1.         0.10527936 0.06299408 ... 0.         0.         0.        ]
 [0.10527936 1.         0.09284767 ... 0.02438299 0.         0.        ]
 [0.06299408 0.09284767 1.         ... 0.02188441 0.         0.        ]
 ...
 [0.         0.02438299 0.02188441 ... 1.         0.05938557 0.03959038]
 [0.         0.         0.         ... 0.05938557 1.         0.06818182]
 [0.         0.         0.         ... 0.03959038 0.06818182 1.        ]]


In [0]:
# create list of indices for later matching
indices = pd.Series(data_keys.index, index = data_keys['title'])

# Result Testing(Recommendation)

In [0]:
def movies_recommendation(title, n = 10, cosine_sim = cosine_sim):
    movies = []
    
    # retrieve matching movie title index
    if title not in indices.index:
        print("No such movie found")
        return
    else:
        idx = indices[title]
    
    # cosine similarity scores of movies in descending order
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    # top n most similar movies indexes
    # use 1:n because 0 is the same movie entered
    top_n_idx = list(scores.iloc[1:n].index)
        
    return data_keys['title'].iloc[top_n_idx]

In [0]:
print(movies_recommendation('The Avengers' ,n=12))

7                   Avengers: Age of Ultron
174                     The Incredible Hulk
511                                   X-Men
85      Captain America: The Winter Soldier
79                               Iron Man 2
26               Captain America: Civil War
169      Captain America: The First Avenger
31                               Iron Man 3
68                                 Iron Man
4401                    The Helix... Loaded
182                                 Ant-Man
Name: title, dtype: object
