# Movie clustering

## imports

In [1]:
import pandas as pd # for data cleaning
import numpy as np # matrix/vector operations
import matplotlib.pyplot as plt # visualizations
from sklearn.cluster import KMeans # for kmeans

In [2]:
ls

Movie clustering.ipynb  movie_clusters.csv
movie.csv               [34mtest[m[m/


## upload data

In [3]:
m = pd.read_csv("movie.csv")
m.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [13]:
list(m[m['movie_title'].isin(['12 Angry Men'])].loc[:,'plot_keywords'])

['courtroom|dialogue driven|dialogue driven storyline|jury|murder']

In [20]:
m[m['movie_title'].isin(['Luther'])]['content_rating']

1485    TV-MA
Name: content_rating, dtype: object

## data cleaning

### split the genres and plot_keywords into lists by  "|". Then join them together into terms column. 

In [4]:
m['terms'] = m['genres'].str.split("|") +  m['plot_keywords'].str.split("|")

In [5]:
m.head()[['movie_title','terms']]

Unnamed: 0,movie_title,terms
0,Avatar,"[Action, Adventure, Fantasy, Sci-Fi, avatar, f..."
1,Pirates of the Caribbean: At World's End,"[Action, Adventure, Fantasy, goddess, marriage..."
2,Spectre,"[Action, Adventure, Thriller, bomb, espionage,..."
3,The Dark Knight Rises,"[Action, Thriller, deception, imprisonment, la..."
4,Star Wars: Episode VII - The Force Awakens,


### We convert the lists of terms into rows in three steps
1. set index to movie title, select the 'terms' column.
2. expand the lists so that each element is in its own column by applying pd.Series to each list in the terms column.
3. apply the stack method to stack stack the columns into one column and reset index to avoid a 'multi-level' index

In [6]:
movie_term = m.set_index(['movie_title'])['terms'].apply(pd.Series).stack().reset_index()
movie_term.head(10)

Unnamed: 0,movie_title,level_1,0
0,Avatar,0,Action
1,Avatar,1,Adventure
2,Avatar,2,Fantasy
3,Avatar,3,Sci-Fi
4,Avatar,4,avatar
5,Avatar,5,future
6,Avatar,6,marine
7,Avatar,7,native
8,Avatar,8,paraplegic
9,Pirates of the Caribbean: At World's End,0,Action


### fix column names

In [7]:
movie_term.columns = ['movie_title','term_number','terms']

In [8]:
movie_term.head()

Unnamed: 0,movie_title,term_number,terms
0,Avatar,0,Action
1,Avatar,1,Adventure
2,Avatar,2,Fantasy
3,Avatar,3,Sci-Fi
4,Avatar,4,avatar


## Some basic statistics

In [9]:
#number of terms
len(set(movie_term['terms']))

8108

In [10]:
#number of movies
len(set(movie_term['movie_title']))

4764

In [11]:
# terms and their counts
term_counts = movie_term['terms'].value_counts() 
term_counts.head()

Drama       2461
Comedy      1813
Thriller    1330
Action      1094
Romance     1068
Name: terms, dtype: int64

In [12]:
# number of terms that only apear once
len(term_counts[term_counts == 1])

5045

## define top terms 

There are over 8000 terms used as plot keywords. 5045 of these terms only apear once. We need to restrict to a subset of terms.

In [13]:
number_of_terms = 200
top_terms = list(movie_term['terms'].value_counts()[0:200].index)

In [14]:
movie_term = movie_term[movie_term['terms'].isin(top_terms)]

## count the occurences of each term for each movie.

In [15]:
# crosstab will count the number of times a term occurs for each movie.
# This gives us a matrix where each row is a movie and each column is a term

features = pd.crosstab(index=movie_term['movie_title'],columns=movie_term['terms'])
features.head()

terms,1950s,1960s,1970s,1980s,Action,Adventure,Animation,Biography,Comedy,Crime,...,two word title,undercover,vampire,village,violence,vomiting,warrior,wedding,writer,zombie
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Horror,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10 Cloverfield Lane,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10 Things I Hate About You,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
102 Dalmatians,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10th & Wolf,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Tf-idf matrix

## Calculate Tf

Tf can be thought of as how important a term is to a movie.

$Tf(t) = ($Number of times term t appears in a movie$) / ($Total number of terms in the movie$)$

To calculate tf we will find the total number of terms for each movie by taking the sum of each row (axis=1) and dividing each column by that number. 

For example, if this is our feature matrix,

| Movie | term1 | term2 | term3 |
|-------|-------|-------|-------|
| m1    | 2     | 1     | 0     |
| m2    | 4     | 0     | 3     |
| m3    | 1     | 2     | 3     |


the sum of the rows is 

| total terms |
|-------|
| 3    |
| 7    | 
| 6    |

Now we divide this column by each column in the feature matrix

| Movie | term1 | term2 | term3 |
|-------|-------|-------|-------|
| m1    | 2/3     | 1/3     | 0/3     |
| m2    | 4/7     | 0/7     | 3/7     |
| m3    | 1/6     | 2/6     | 3/6     |

In [16]:
total_terms = features.sum(axis=1)
features_tf = features.div(total_terms, axis=0)

In [17]:
features_tf.head()

terms,1950s,1960s,1970s,1980s,Action,Adventure,Animation,Biography,Comedy,Crime,...,two word title,undercover,vampire,village,violence,vomiting,warrior,wedding,writer,zombie
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Horror,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102 Dalmatians,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10th & Wolf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate idf

Idf (Inverse Document Frequency) is 

$IDF(t) = log_e($total number of movies$ /$ Number of movies containing term $t )$

first we calculate the number of movies containing each term by finding the sum of each column of the features matrix. Then we add a column with the total number of movies.

In [18]:
movies_for_each_term = pd.DataFrame(features.sum())
movies_for_each_term.head()

Unnamed: 0_level_0,0
terms,Unnamed: 1_level_1
1950s,22
1960s,24
1970s,21
1980s,24
Action,1094


In [19]:
movies_for_each_term['total_movies'] = len(features.index)
movies_for_each_term.head()

Unnamed: 0_level_0,0,total_movies
terms,Unnamed: 1_level_1,Unnamed: 2_level_1
1950s,22,4764
1960s,24,4764
1970s,21,4764
1980s,24,4764
Action,1094,4764


In [20]:
movies_for_each_term['idf'] = np.log(movies_for_each_term['total_movies']/movies_for_each_term[0])

In [21]:
idf = movies_for_each_term['idf']

In [22]:
idf.head(12)

terms
1950s          5.377800
1960s          5.290789
1970s          5.424320
1980s          5.290789
Action         1.471247
Adventure      1.691196
Animation      3.005011
Biography      2.809361
Comedy         0.966105
Crime          1.727142
Documentary    3.814883
Drama          0.660520
Name: idf, dtype: float64

## calculate tf-idf matrix

We have our tf matrix. For example,

| Movie | term1 | term2 | term3 |
|-------|-------|-------|-------|
| m1    | 2/3     | 1/3     | 0/3     |
| m2    | 4/7     | 0/7     | 3/7     |
| m3    | 1/6     | 2/6     | 3/6     |

and our idf

|term|idf|
|----|---|
|term1|5|
|term2|4|
|term3|3|

To find tf-idf we multiply our idf by each row of the tf


In [23]:
features_tf_idf = features_tf.multiply(idf)

In [24]:
features_tf_idf.head()

terms,1950s,1960s,1970s,1980s,Action,Adventure,Animation,Biography,Comedy,Crime,...,two word title,undercover,vampire,village,violence,vomiting,warrior,wedding,writer,zombie
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Horror,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241526,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102 Dalmatians,0.0,0.0,0.0,0.0,0.0,0.338239,0.0,0.0,0.193221,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10th & Wolf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.345428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# cosine similarity matrix

The cosine similarity for movies $i$ and $j$ is defined as the cosine of the angle between their vectors.

$$cos(\theta) = \frac{i \cdot j}{|i||j|}$$

To calculate this from our tf-idf matrix we 

1. divide each row by its norm (square-root of the sum of the squares)
2. then when we take the product of the matrix and its transpose we get cosine similarity

In [25]:
matrix_rows_normed = features_tf_idf.div(np.sqrt(np.square(features_tf_idf).sum(axis=1)),axis=0)

In [26]:
cosine_similarity = matrix_rows_normed.dot(matrix_rows_normed.T)

In [27]:
cosine_similarity.head()

movie_title,#Horror,10 Cloverfield Lane,10 Things I Hate About You,102 Dalmatians,10th & Wolf,11:14,12 Angry Men,12 Monkeys,12 Rounds,12 Years a Slave,...,Zoolander,Zoolander 2,Zoom,Zulu,[Rec],[Rec] 2,eXistenZ,xXx,xXx: State of the Union,Æon Flux
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Horror,1.0,0.259408,0.015904,0.0,0.048982,0.018401,0.018962,0.192613,0.038758,0.017092,...,0.0,0.0,0.0,0.055209,0.372371,0.105701,0.186038,0.04751,0.047637,0.0
10 Cloverfield Lane,0.259408,1.0,0.011795,0.0,0.036326,0.013646,0.014063,0.227123,0.028743,0.012676,...,0.0,0.0,0.086964,0.040944,0.276158,0.07839,0.230871,0.035235,0.035329,0.078492
10 Things I Hate About You,0.015904,0.011795,1.0,0.02928,0.013261,0.073993,0.024289,0.014509,0.0,0.021893,...,0.038325,0.208759,0.0,0.014947,0.0,0.0,0.0,0.0,0.0,0.0
102 Dalmatians,0.0,0.0,0.02928,1.0,0.0,0.033877,0.0,0.063905,0.0,0.0,...,0.025749,0.140256,0.178865,0.0,0.0,0.0,0.0,0.071834,0.072025,0.0
10th & Wolf,0.048982,0.036326,0.013261,0.0,1.0,0.120246,0.123915,0.044687,0.091533,0.014252,...,0.0,0.0,0.0,0.112557,0.0,0.0,0.038848,0.039615,0.112503,0.0


# apply K-means



in order to apply k-means we need to choose the number of clusters
also, k-means done not work with pandas data frames. we have to 
convert our data to a numpy array.

In [28]:
#import
from sklearn.cluster import KMeans

In [40]:
number_of_clusters = 40
kmeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(np.array(cosine_similarity))

### Kmeans will return a set of cluster labels for each movie

In [41]:
kmeans.labels_

array([ 5,  5, 38, ...,  1, 11, 37], dtype=int32)

In [42]:
features['cluster'] = kmeans.labels_

In [43]:
cluster = features[['cluster']]

In [44]:
cluster.head()

terms,cluster
movie_title,Unnamed: 1_level_1
#Horror,5
10 Cloverfield Lane,5
10 Things I Hate About You,38
102 Dalmatians,7
10th & Wolf,6


## now we merge the cluster labels with our original data set, m

In [45]:
m.set_index('movie_title',inplace=True)

KeyError: 'movie_title'

In [46]:
m['cluster'] = cluster

## Now we sort the movies by their cluster and imdb score
This is the data we can use for our movie recomendation system

In [47]:
clusters = m[['imdb_score','cluster']].sort_values(by=['cluster','imdb_score'],ascending=False)

In [49]:
clusters[clusters['cluster'] == 34]

Unnamed: 0_level_0,imdb_score,cluster
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
The Prestige,8.5,34.0
2001: A Space Odyssey,8.3,34.0
Blade Runner,8.2,34.0
Solaris,8.1,34.0
The Man from Earth,8.0,34.0
Moon,7.9,34.0
Watchmen,7.7,34.0
Minority Report,7.7,34.0
Dark City,7.7,34.0
Ex Machina,7.7,34.0


In [50]:
clusters.to_csv("movie_clusters.csv")