In [162]:
import pandas as pd
import numpy as np

In [163]:
movie_df = pd.read_csv('./dataclean/movie_df.csv')
movie_df.columns

Index(['movie_id', 'title', 'release_date', 'popularity', 'budget', 'revenue',
       'vote_average', 'vote_count', 'rating_avg', 'rating_count',
       'release_year'],
      dtype='object')

In [164]:
movie_df.sort_values(by= 'vote_count', ascending= False).head(10)

Unnamed: 0,movie_id,title,release_date,popularity,budget,revenue,vote_average,vote_count,rating_avg,rating_count,release_year
3604,27205,Inception,2010-07-14,29.108149,160000000,825532800.0,8.1,14075.0,3.7,5,2010
3123,155,The Dark Knight,2008-07-16,123.167259,185000000,1004558000.0,8.3,12269.0,3.4,1319,2008
4825,157336,Interstellar,2014-11-05,32.213481,165000000,675120000.0,8.1,11187.0,2.5,4,2014
4953,118340,Guardians of the Galaxy,2014-07-30,53.291601,170000000,773328600.0,7.9,10014.0,4.0,1,2014
839,550,Fight Club,1999-10-15,63.869599,63000000,100853800.0,8.3,9678.0,3.08,3477,1999
729,603,The Matrix,1999-03-30,33.366332,63000000,463517400.0,7.9,9079.0,2.96,622,1999
3146,1726,Iron Man,2008-04-30,22.073099,140000000,585174200.0,7.4,8951.0,2.96,2477,2008
1339,120,The Lord of the Rings: The Fellowship of the Ring,2001-12-18,32.070725,93000000,871368400.0,8.0,8892.0,2.88,166,2001
80,680,Pulp Fiction,1994-09-10,140.950236,8000000,213928800.0,8.3,8670.0,3.55,1246,1994
88,278,The Shawshank Redemption,1994-09-23,51.645403,25000000,28341470.0,8.5,8358.0,3.02,1178,1994


We can see that there are many popular movies in this list, let's check if our users agree

In [165]:
movie_df.sort_values(by= 'rating_count', ascending= False).head(10)

Unnamed: 0,movie_id,title,release_date,popularity,budget,revenue,vote_average,vote_count,rating_avg,rating_count,release_year
1145,318,The Million Dollar Hotel,2000-02-09,4.938231,8000000,0.0,5.9,76.0,4.43,91082,2000
1661,296,Terminator 3: Rise of the Machines,2003-07-02,20.818907,200000000,435000000.0,5.9,2177.0,4.17,87901,2003
974,593,Солярис,1972-03-20,11.059785,0,0.0,7.7,364.0,4.15,84078,1972
256,260,The 39 Steps,1935-06-01,5.865697,0,0.0,7.4,217.0,4.13,77045,1935
1374,480,Monsoon Wedding,2001-08-30,10.625504,0,0.0,6.8,59.0,3.66,74355,2001
76,527,Once Were Warriors,1994-09-02,4.025276,0,2201126.0,7.6,106.0,4.27,67662,1994
82,110,Trois couleurs : Rouge,1994-05-27,7.832755,0,0.0,7.8,246.0,4.02,66512,1994
2941,2959,License to Wed,2007-07-04,7.102076,35000000,69307224.0,5.3,258.0,4.23,60024,2007
1358,150,48 Hrs.,1982-12-07,15.297121,12000000,78868508.0,6.5,364.0,3.88,57416,1982
1773,780,La passion de Jeanne d'Arc,1928-04-21,8.903953,0,0.0,8.2,159.0,3.4,57232,1928


We cannot see much of any famous movies here! Looks like the data is not the genuine stuff that generated the original API statistics, the data provided in this dataset is not leaning towards the mainstream audience  
It is fortunate that ALS works well with sparse data distributions, now the challenge introduces itself to recommend movies for a user with one rating or with one search or fewer  
We can use the clean data we have to generate recommendations based on movies similarities between movies

In [166]:
directors = pd.read_csv('./dataclean/directors.csv')
directors.head()

Unnamed: 0,movie_id,name
0,862,John Lasseter
1,8844,Joe Johnston
2,949,Michael Mann
3,710,Martin Campbell
4,1408,Renny Harlin


As established earlier some movies have multiple directors we need that grouped together

In [167]:
directors_gp = directors.groupby('movie_id').name.apply(list).reset_index(name='directors')
print(directors_gp.shape)
directors_gp.head()

(7429, 2)


Unnamed: 0,movie_id,directors
0,2,[Aki Kaurismäki]
1,3,[Aki Kaurismäki]
2,5,"[Allison Anders, Alexandre Rockwell, Robert Ro..."
3,6,[Stephen Hopkins]
4,11,[George Lucas]


In [168]:
cast = pd.read_csv('./dataclean/actors.csv')
cast.head()

Unnamed: 0,movie_id,actor_id,name,order,gender
0,862,31,Tom Hanks,0,2
1,862,12898,Tim Allen,1,2
2,862,7167,Don Rickles,2,2
3,862,12899,Jim Varney,3,2
4,862,12900,Wallace Shawn,4,2


Let's explore the cast 

In [169]:
# stats over the whole dataframe
cast.order.describe()

count    111222.000000
mean         14.295112
std          19.543161
min           0.000000
25%           4.000000
50%           8.000000
75%          17.000000
max         313.000000
Name: order, dtype: float64

50% of the values are below 8, a staggering movie has over 300 credits, let's look further into this 

In [170]:
cast.groupby('movie_id').order.describe()['max'].describe()

count    7246.000000
mean       14.623654
std        14.713386
min         0.000000
25%         7.000000
50%        11.000000
75%        17.000000
max       313.000000
Name: max, dtype: float64

As seen in stats, some movies have up to 300 actors credited, yet 50% of the movies have less than 11 actors creditted, now I want to focus on the money makers the drawers to watching the movie, so I am going to cut short to 8 actors.

In [171]:
cast = cast[cast.order < 8]
cast.order.describe()

count    51306.000000
mean         3.311153
std          2.253398
min          0.000000
25%          1.000000
50%          3.000000
75%          5.000000
max          7.000000
Name: order, dtype: float64

In [172]:
cast_gp = cast.groupby('movie_id').name.apply(list).reset_index(name='cast')
print(cast_gp.shape)
cast_gp.head()

(7241, 2)


Unnamed: 0,movie_id,cast
0,2,"[Turo Pajala, Susanna Haavisto, Matti Pellonpä..."
1,3,"[Matti Pellonpää, Kati Outinen, Sakari Kuosman..."
2,5,"[Tim Roth, Antonio Banderas, Jennifer Beals, M..."
3,6,"[Emilio Estevez, Cuba Gooding Jr., Denis Leary..."
4,11,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe..."


In [173]:
keywords = pd.read_csv('./dataclean/keywords.csv')
keywords.head()

Unnamed: 0,movie_id,keyword_id,keyword
0,862,931,jealousy
1,862,4290,toy
2,862,5202,boy
3,862,6054,friendship
4,862,9713,friends


In [174]:
keywords_gp = keywords.groupby('movie_id').keyword.apply(list).reset_index(name='keywords')
print(keywords_gp.shape)
keywords_gp.head()

(5687, 2)


Unnamed: 0,movie_id,keywords
0,2,"[underdog, prison, factory worker, prisoner, h..."
1,3,"[salesclerk, helsinki, garbage, independent film]"
2,5,"[hotel, new year's eve, witch, bet, hotel room..."
3,6,"[chicago, drug dealer, boxing match, escape, o..."
4,11,"[android, galaxy, hermit, death star, lightsab..."


In [175]:
genres = pd.read_csv('./dataclean/genres.csv')
genres.head()

Unnamed: 0,movie_id,genre_id,genre
0,862,16,Animation
1,862,35,Comedy
2,862,10751,Family
3,8844,12,Adventure
4,8844,14,Fantasy


In [176]:
genres_gp = genres.groupby('movie_id').genre.apply(list).reset_index(name= 'genres')
print(genres_gp.shape)
genres_gp.head()

(7143, 2)


Unnamed: 0,movie_id,genres
0,2,"[Drama, Crime]"
1,3,"[Drama, Comedy]"
2,5,"[Crime, Comedy]"
3,6,"[Action, Thriller, Crime]"
4,11,"[Adventure, Action, Science Fiction]"


In [177]:
genres_gp.to_csv('./dataclean/genres_group.csv')

In [178]:
movies_gp = pd.merge(cast_gp, directors_gp, on= 'movie_id', how= 'outer')
print(movies_gp.shape)
movies_gp.head()

(7474, 3)


Unnamed: 0,movie_id,cast,directors
0,2,"[Turo Pajala, Susanna Haavisto, Matti Pellonpä...",[Aki Kaurismäki]
1,3,"[Matti Pellonpää, Kati Outinen, Sakari Kuosman...",[Aki Kaurismäki]
2,5,"[Tim Roth, Antonio Banderas, Jennifer Beals, M...","[Allison Anders, Alexandre Rockwell, Robert Ro..."
3,6,"[Emilio Estevez, Cuba Gooding Jr., Denis Leary...",[Stephen Hopkins]
4,11,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...",[George Lucas]


In [179]:
temp = pd.merge(keywords_gp, genres_gp, on= 'movie_id', how= 'left')
temp.head()

Unnamed: 0,movie_id,keywords,genres
0,2,"[underdog, prison, factory worker, prisoner, h...","[Drama, Crime]"
1,3,"[salesclerk, helsinki, garbage, independent film]","[Drama, Comedy]"
2,5,"[hotel, new year's eve, witch, bet, hotel room...","[Crime, Comedy]"
3,6,"[chicago, drug dealer, boxing match, escape, o...","[Action, Thriller, Crime]"
4,11,"[android, galaxy, hermit, death star, lightsab...","[Adventure, Action, Science Fiction]"


In [180]:
temp = pd.merge(temp, movie_df[['movie_id', 'title', 'vote_average']], on='movie_id', how='outer')
temp.head()

Unnamed: 0,movie_id,keywords,genres,title,vote_average
0,2,"[underdog, prison, factory worker, prisoner, h...","[Drama, Crime]",Ariel,7.1
1,3,"[salesclerk, helsinki, garbage, independent film]","[Drama, Comedy]",Varjoja paratiisissa,7.1
2,5,"[hotel, new year's eve, witch, bet, hotel room...","[Crime, Comedy]",Four Rooms,6.5
3,6,"[chicago, drug dealer, boxing match, escape, o...","[Action, Thriller, Crime]",Judgment Night,6.4
4,11,"[android, galaxy, hermit, death star, lightsab...","[Adventure, Action, Science Fiction]",Star Wars,8.1


In [181]:
movies_gp = pd.merge(movies_gp, temp, on= 'movie_id', how= 'outer')
movies_gp.head()

Unnamed: 0,movie_id,cast,directors,keywords,genres,title,vote_average
0,2,"[Turo Pajala, Susanna Haavisto, Matti Pellonpä...",[Aki Kaurismäki],"[underdog, prison, factory worker, prisoner, h...","[Drama, Crime]",Ariel,7.1
1,3,"[Matti Pellonpää, Kati Outinen, Sakari Kuosman...",[Aki Kaurismäki],"[salesclerk, helsinki, garbage, independent film]","[Drama, Comedy]",Varjoja paratiisissa,7.1
2,5,"[Tim Roth, Antonio Banderas, Jennifer Beals, M...","[Allison Anders, Alexandre Rockwell, Robert Ro...","[hotel, new year's eve, witch, bet, hotel room...","[Crime, Comedy]",Four Rooms,6.5
3,6,"[Emilio Estevez, Cuba Gooding Jr., Denis Leary...",[Stephen Hopkins],"[chicago, drug dealer, boxing match, escape, o...","[Action, Thriller, Crime]",Judgment Night,6.4
4,11,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...",[George Lucas],"[android, galaxy, hermit, death star, lightsab...","[Adventure, Action, Science Fiction]",Star Wars,8.1


In [182]:
movies_gp.isna().sum()

movie_id           0
cast             260
directors         72
keywords        1814
genres          1887
title              6
vote_average       6
dtype: int64

how='outer': This parameter specifies the type of join to perform. In this case, an outer join is used. An outer join returns all rows from both DataFrames, filling in missing values with NaN where the data is absent in either DataFrame.

how='left': This parameter specifies the type of join to perform. In this case, a left join is used. A left join returns all rows from the left DataFrame (cast_gp) and the matched rows from the right DataFrame (directors_gp). If a row in the left DataFrame doesn't have a match in the right DataFrame, the resulting row will contain NaN values for the columns from the right DataFrame.

In [183]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    # should leave nulls alone
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]

In [184]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'directors', 'genres']

for feature in features:
    movies_gp[feature] = movies_gp[feature].apply(clean_data)

In [185]:
movies_gp.head(2)

Unnamed: 0,movie_id,cast,directors,keywords,genres,title,vote_average
0,2,"[turopajala, susannahaavisto, mattipellonpää, ...",[akikaurismäki],"[underdog, prison, factoryworker, prisoner, he...","[drama, crime]",Ariel,7.1
1,3,"[mattipellonpää, katioutinen, sakarikuosmanen,...",[akikaurismäki],"[salesclerk, helsinki, garbage, independentfilm]","[drama, comedy]",Varjoja paratiisissa,7.1


In [186]:
# Eliminate NA values for soup to work
features = ['cast', 'keywords', 'directors', 'genres']
for att in features:
    movies_gp[att] = movies_gp[att].fillna('')

In [187]:
movies_gp.isna().sum()

movie_id        0
cast            0
directors       0
keywords        0
genres          0
title           6
vote_average    6
dtype: int64

In [188]:
movies_gp.dropna(inplace= True)

In [189]:
# Function joins the words in each list 
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['directors']) + ' ' + ' '.join(x['genres'])

In [190]:
movies_gp['soup'] = movies_gp.apply(create_soup, axis= 1)

In [191]:
movies_gp.head(2)

Unnamed: 0,movie_id,cast,directors,keywords,genres,title,vote_average,soup
0,2,"[turopajala, susannahaavisto, mattipellonpää, ...",[akikaurismäki],"[underdog, prison, factoryworker, prisoner, he...","[drama, crime]",Ariel,7.1,underdog prison factoryworker prisoner helsink...
1,3,"[mattipellonpää, katioutinen, sakarikuosmanen,...",[akikaurismäki],"[salesclerk, helsinki, garbage, independentfilm]","[drama, comedy]",Varjoja paratiisissa,7.1,salesclerk helsinki garbage independentfilm ma...


In [192]:
movies_gp.vote_average = movies_gp.vote_average.astype(float)

Counting the frequency of each word in each movie

In [193]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_gp['soup'])

In [194]:
count_matrix.shape

(7495, 43427)

There are nearly 40k unique features in these movies

Cosine similarity is the angle between two vectors, if two vectors have similar directions on a 2D grid, the angle between them gets smaller and the cosine of it nears 1

In [195]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [196]:
cosine_sim.shape

(7495, 7495)

In [197]:
# Getting a title of a movie 
movie_df[movie_df.title.str.contains('Toy')]

Unnamed: 0,movie_id,title,release_date,popularity,budget,revenue,vote_average,vote_count,rating_avg,rating_count,release_year
0,862,Toy Story,1995-10-30,21.946943,30000000,373554033.0,7.7,5415.0,3.6,374,1995
870,25898,Babes in Toyland,1934-12-14,7.768271,0,0.0,6.6,28.0,3.88,184,1934
3565,26962,Puppet Master vs Demonic Toys,2004-12-18,1.945492,0,0.0,2.7,12.0,3.08,6,2004
3655,47670,The Christmas Toy,1986-12-06,0.923952,0,0.0,4.8,7.0,3.64,45,1986
3769,108869,Toys in the Attic,1963-07-31,0.925414,0,0.0,7.5,2.0,2.17,3,1963
4751,84152,Dollman vs. Demonic Toys,1993-10-13,1.05016,0,0.0,4.2,12.0,3.85,8025,1993
5215,70984,"Silent Night, Deadly Night 5: The Toy Maker",1991-11-06,1.240357,0,0.0,5.1,11.0,3.29,328,1991


In [198]:
# Construct a reverse map of indices and movie titles
movie_bytitle = pd.Series(movies_gp.index, index=movies_gp['title'])

In [199]:
def get_similar_movies(title, n= 10):
    idx = movie_bytitle[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_gp['title'].iloc[movie_indices]

In [200]:
get_similar_movies('Batman Begins')

111                          The Dark Knight
316                           Batman & Robin
3608    3 Ninjas: High Noon at Mega Mountain
212                                   Batman
5537            Teenage Mutant Ninja Turtles
285                           Batman Returns
1019                          State of Grace
2832                                  Wanted
2977                             Harry Brown
3963                           Hero at Large
Name: title, dtype: object

The movies similarites are not decisive enough to get recommendations, I think the quality of the movie should get into play

In [201]:
def get_recommendation(title, n= 10):
    idx = movie_bytitle[title]

    sim_scores = np.array(cosine_sim[idx])
    ratings = np.array(movies_gp.vote_average)/10

    f_score = (ratings * sim_scores) / (ratings + sim_scores)

    f_score = list(enumerate(f_score))
    # Sort the movies based on the similarity scores
    f_score = sorted(f_score, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    f_score = f_score[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in f_score]

    # Return the top 10 most similar movies
    return movies_gp['title'].iloc[movie_indices]
    

In [202]:
get_recommendation('Batman Begins')

  f_score = (ratings * sim_scores) / (ratings + sim_scores)


111     The Dark Knight
316      Batman & Robin
212              Batman
285      Batman Returns
1019     State of Grace
3720             仁義なき戦い
193         The Killing
1172           Superman
803        The Prestige
315      Batman Forever
Name: title, dtype: object

In [203]:
get_recommendation('Inception')

  f_score = (ratings * sim_scores) / (ratings + sim_scores)


145     Star Trek: Generations
68                    Predator
905    Blood: The Last Vampire
152    Star Trek: Insurrection
59                        Tron
448                    Солярис
803               The Prestige
136            Minority Report
913                        綠草地
997                 The Island
Name: title, dtype: object