# Data preprocessing

---


In [47]:
# Project-relative locations of the prepared feature table (input) and of the
# combined distance matrix written at the end of the notebook (output).
# Forward slashes are used because they are accepted as separators on Windows
# as well as on Linux/macOS, whereas a backslash is only a separator on Windows.
INPUT_PATH = '../data/processed/prepared_features.pkl'

OUTPUT_PATH = '../data/processed/similarity_matrix.pkl'

# Imports


In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import ast
import nltk
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, cosine_distances
from sklearn.preprocessing import StandardScaler

import joblib

In [3]:
# Helper functions
def stem(text, stemmer=None):
    """Return ``text`` with every whitespace-separated token replaced by its stem.

    Parameters
    ----------
    text : str
        The string to process. It is split on whitespace, so punctuation
        attached to a word stays part of that token.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. When omitted a new ``PorterStemmer`` is created, which
        matches the previous behaviour; pass an existing instance to reuse it
        across many calls.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    if stemmer is None:
        stemmer = PorterStemmer()

    # A generator avoids the temporary list that previously shadowed the
    # built-in name `list`.
    return ' '.join(stemmer.stem(word) for word in text.split())

In [4]:
# Load the prepared feature table from INPUT_PATH.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
movies = pd.read_pickle(INPUT_PATH)

In [5]:
# Preview the first rows to check which columns were loaded.
movies.head()

Unnamed: 0,movie_id,title,release_year,runtime,vote_average,tags
0,19995,Avatar,2009,162.0,7.2,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,2007,169.0,6.9,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,2015,148.0,6.3,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,2012,165.0,7.6,following the death of district attorney harve...
4,49529,John Carter,2012,132.0,6.1,"john carter is a war-weary, former military ca..."


## Stem the tags so that inflected forms of a word collapse into one token


In [6]:
# Sanity check: run the Porter stemmer directly on a single inflected word.
ps = PorterStemmer()
text = 'loving'
ps.stem(text)

'love'

In [7]:
# Check the helper on several inflections of one word; the displayed result
# should contain the same stem three times.
test = 'love loved loving'
stem(test)

'love love love'

In [8]:
# Replace every tags string with its stemmed version.
# This overwrites the column in place, so re-running this cell without
# reloading `movies` applies the stemmer to already-stemmed text.
movies['tags'] = movies['tags'].apply(stem)

In [9]:
# Show the stemmed tags of the row labelled 0 as a spot check.
movies.loc[0, 'tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict samworthington zoesaldana sigourneyweav stephenlang cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d jamescameron'

## Count-vectorize the tags column


In [10]:
# Bag-of-words model: drop English stop words and keep at most 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)
# fit_transform yields a sparse count matrix; it is densified because later
# cells wrap it in a DataFrame.
tag_counts_sparse = cv.fit_transform(movies['tags'])
word_vector = tag_counts_sparse.toarray()

In [11]:
# Expect (number of rows in movies['tags'], number of vocabulary terms kept).
word_vector.shape

(4799, 5000)

In [12]:
# Inspect the vocabulary learned by the vectorizer.
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [13]:
# word_vector.max()

In [14]:
# Label each count column with the vocabulary term it represents.
vocabulary_terms = cv.get_feature_names_out()
word_vector_df = pd.DataFrame(data=word_vector, columns=vocabulary_terms)

In [15]:
# Display the labelled count matrix (pandas truncates the rendering of large frames).
word_vector_df

Unnamed: 0,000,007,10,100,11,12,13,14,15,16,...,zhangziyi,zion,zoe,zoesaldana,zombi,zombieapocalyps,zombies,zone,zoo,zooeydeschanel
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4794,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---


## Get movie details df


In [16]:
# Numeric attributes that describe a movie independently of its tags.
DETAIL_COLUMNS = ['release_year', 'runtime', 'vote_average']
movies_dettails = movies[DETAIL_COLUMNS]

In [17]:
# Standardize each detail column so that no single feature dominates the
# Euclidean distance computed later.
scaler = StandardScaler().fit(movies_dettails)
movies_dettails_scaled = scaler.transform(movies_dettails)

In [18]:
# Expect one row per movie and one column per selected detail feature.
movies_dettails_scaled.shape

(4799, 3)

---


# Measure similarity


### 1) For the word vector


Since most entries of the word-count matrix are 0s and 1s, we use cosine distance (1 − cosine similarity) to compare films: a distance of 0 means identical tag vectors, and larger values mean less similar films.


In [19]:
# cosine_distances returns 1 - cosine similarity, so despite the variable
# name this matrix holds distances: 0 means identical tag vectors and larger
# values mean less similar movies.
similarity_word_m = cosine_distances(word_vector)
similarity_word_m

array([[0.        , 0.91773873, 0.91528263, ..., 0.95500787, 1.        ,
        1.        ],
       [0.91773873, 0.        , 0.94115101, ..., 0.97655964, 1.        ,
        0.97451764],
       [0.91528263, 0.94115101, 0.        , ..., 0.97585977, 1.        ,
        1.        ],
       ...,
       [0.95500787, 0.97655964, 0.97585977, ..., 0.        , 0.9581879 ,
        0.9581879 ],
       [1.        , 1.        , 1.        , ..., 0.9581879 , 0.        ,
        0.90909091],
       [1.        , 0.97451764, 1.        , ..., 0.9581879 , 0.90909091,
        0.        ]])

In [20]:
# Label both axes with the movie titles so distances can be looked up by name.
movie_titles = movies['title']
sim_word_m = pd.DataFrame(
    similarity_word_m,
    index=movie_titles,
    columns=movie_titles,
)
sim_word_m.head()

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,0.0,0.917739,0.915283,0.926995,0.812683,0.891611,0.960064,0.854905,0.940765,0.904327,...,1.0,1.0,0.957761,0.947368,1.0,0.980748,0.953171,0.955008,1.0,1.0
Pirates of the Caribbean: At World's End,0.917739,0.0,0.941151,0.961965,0.926807,0.887062,0.979194,0.874012,0.938279,0.900311,...,1.0,1.0,0.977994,0.97258,1.0,0.959879,1.0,0.97656,1.0,0.974518
Spectre,0.915283,0.941151,0.0,0.941244,0.924622,0.930214,0.978573,0.87025,0.936436,0.897335,...,0.91556,1.0,1.0,1.0,0.982679,0.958682,1.0,0.97586,1.0,1.0
The Dark Knight Rises,0.926995,0.961965,0.941244,0.0,0.967522,0.939862,0.944605,0.932913,0.958918,0.800938,...,0.972713,0.972713,0.94141,0.945246,0.97761,0.933238,1.0,0.968796,0.949116,0.915193
John Carter,0.812683,0.926807,0.924622,0.967522,0.0,0.90356,0.9467,0.806351,0.920943,0.893593,...,0.964993,1.0,0.924835,0.976585,0.856379,0.845833,1.0,0.979984,0.912961,0.956481


In [43]:
# Smallest tag distance first: the movie itself (0.0), then its closest matches.
sim_word_m['Avatar'].sort_values(ascending=True)

title
Avatar                                0.000000
Aliens vs Predator: Requiem           0.713230
Aliens                                0.736899
Independence Day                      0.742159
Falcon Rising                         0.746083
                                        ...   
National Lampoon's Loaded Weapon 1    1.000000
The New Guy                           1.000000
St. Vincent                           1.000000
Chloe                                 1.000000
My Date with Drew                     1.000000
Name: Avatar, Length: 4799, dtype: float64

---


## 2) For movie details


We use Euclidean distance here because the detail features are continuous (standardized) numbers; as with the word vectors, a smaller distance means the films are more similar.


In [22]:
# Pairwise Euclidean distances between the standardized detail rows,
# labelled by movie title on both axes.
title_labels = movies['title']
similarity_det_m = euclidean_distances(movies_dettails_scaled)
sim_det_m = pd.DataFrame(
    similarity_det_m,
    index=title_labels,
    columns=title_labels,
)

In [23]:
# Preview the title-labelled detail distances.
sim_det_m.head()

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,0.0,0.431266,1.09201,0.435223,1.638271,1.504143,2.754682,1.052274,0.433008,1.466087,...,3.371754,7.025095,3.215676,3.797776,3.657899,3.875938,3.587957,1.896951,3.114592,3.296073
Pirates of the Caribbean: At World's End,0.431266,0.0,1.239716,0.735354,1.818139,1.57378,3.096907,1.438483,0.84028,1.47725,...,3.591054,6.982609,3.378432,4.085358,3.980664,4.091565,3.831077,2.226809,3.329825,3.541821
Spectre,1.09201,1.239716,0.0,1.350295,0.767971,0.829361,2.355128,0.896979,1.067607,0.528354,...,2.731166,6.060514,2.751758,3.308445,3.279492,3.509664,2.831296,1.383321,2.28602,2.694297
The Dark Knight Rises,0.435223,0.735354,1.350295,0.0,1.932234,1.880833,2.890745,1.119818,0.608029,1.745248,...,3.644403,7.378854,3.536898,3.997418,3.8108,4.143579,3.824843,2.05927,3.373098,3.545154
John Carter,1.638271,1.818139,0.767971,1.932234,0.0,0.535595,1.798644,1.112432,1.456701,0.962563,...,1.971976,5.591765,2.038181,2.610129,2.649452,2.80784,2.09177,0.929065,1.5443,1.952585


In [24]:
# Smallest detail distance first: the movie itself (0.0), then the movies
# closest to it in release year, runtime and vote average.
sim_det_m['Avatar'].sort_values(ascending=True)

title
Avatar                                 0.000000
Enter the Void                         0.044328
Watchmen                               0.174059
The Curious Case of Benjamin Button    0.212162
The Girl with the Dragon Tattoo        0.239581
                                         ...   
Dude Where's My Dog?                   9.404744
Light from the Darkroom                9.404744
Sharkskin                              9.408539
Hum To Mohabbat Karega                 9.424046
The Big Swap                           9.437809
Name: Avatar, Length: 4799, dtype: float64

---


# Combine the two distance matrices, giving the word matrix a higher weight


In [25]:
# How many times more the tag-based distance counts than the detail distance.
WORD_WEIGHT = 7
# Weighted sum of the two distance matrices; smaller values still mean
# "more similar".
final_sim_m = sim_det_m + (WORD_WEIGHT * sim_word_m)

In [45]:
# Preview the combined (weighted) distance matrix.
final_sim_m.head()

title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,On The Downlow,Sanctuary: Quite a Conundrum,Bang,Primer,Cavite,El Mariachi,Newlyweds,"Signed, Sealed, Delivered",Shanghai Calling,My Date with Drew
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,0.0,6.855437,7.498988,6.924186,7.327051,7.745422,9.475129,7.036607,7.018364,7.796376,...,10.371754,14.025095,9.920004,10.429355,10.657899,10.741173,10.260152,8.582007,10.114592,10.296073
Pirates of the Caribbean: At World's End,6.855437,0.0,7.827773,7.46911,8.305792,7.783211,9.951263,7.556566,7.408231,7.779427,...,10.591054,13.982609,10.22439,10.893415,10.980664,10.810721,10.831077,9.062726,10.329825,10.363444
Spectre,7.498988,7.827773,0.0,7.939005,7.240326,7.340857,9.205136,6.98873,7.622658,6.809696,...,9.140086,13.060514,9.751758,10.308445,10.158242,10.220435,9.831296,8.21434,9.28602,9.694297
The Dark Knight Rises,6.924186,7.46911,7.939005,0.0,8.704884,8.459863,9.502976,7.650207,7.320452,7.351813,...,10.453392,14.187843,10.126771,10.61414,10.654069,10.676247,10.824843,8.84084,10.01691,9.951507
John Carter,7.327051,8.305792,7.240326,8.704884,0.0,6.860518,8.425543,6.756888,7.903302,7.217713,...,8.726927,12.591765,8.512029,9.446226,8.644103,8.728667,9.09177,7.788953,7.935028,8.647949


In [34]:
# Ten smallest combined distances for one title; the first entry is the
# movie itself at distance 0.
final_sim_m['The Dark Knight Rises'].sort_values(ascending=True).head(10)

title
The Dark Knight Rises                  0.000000
The Dark Knight                        4.818996
Batman Begins                          6.055661
Prisoners                              6.497527
The Hateful Eight                      6.569987
Zero Dark Thirty                       6.715625
American Gangster                      6.727918
The Girl with the Dragon Tattoo        6.734661
Zodiac                                 6.805525
The Curious Case of Benjamin Button    6.919108
Name: The Dark Knight Rises, dtype: float64

---


### Test recommendation based on similarity matrix


In [40]:
def recommend(movie, n=10):
    """Return the ``n`` entries of ``final_sim_m`` closest to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up among the columns of ``final_sim_m``.
    n : int, default 10
        Number of rows to return. The queried movie itself is included: it is
        normally the first row, at distance 0.

    Returns
    -------
    pandas.Series
        Combined distances indexed by title, sorted ascending (smaller means
        more similar).

    Raises
    ------
    KeyError
        If ``movie`` is not a column of ``final_sim_m``.
    """
    if movie not in final_sim_m.columns:
        raise KeyError(f"Unknown movie title: {movie!r}")

    distances = final_sim_m[movie]
    # When several movies share the same title, column selection returns a
    # DataFrame (one column per match) and sort_values() without `by` would
    # fail; fall back to the first matching column.
    if isinstance(distances, pd.DataFrame):
        distances = distances.iloc[:, 0]

    return distances.sort_values(ascending=True).head(n)

In [44]:
# Try the helper on one title; the first row is the queried movie itself.
recommend('The Dark Knight Rises')

title
The Dark Knight Rises                  0.000000
The Dark Knight                        4.818996
Batman Begins                          6.055661
Prisoners                              6.497527
The Hateful Eight                      6.569987
Zero Dark Thirty                       6.715625
American Gangster                      6.727918
The Girl with the Dragon Tattoo        6.734661
Zodiac                                 6.805525
The Curious Case of Benjamin Button    6.919108
Name: The Dark Knight Rises, dtype: float64

---


# Save the similarity matrix


In [48]:
# Persist the combined matrix to OUTPUT_PATH; joblib.dump returns the list of
# file names it wrote, which is what this cell displays.
joblib.dump(final_sim_m, OUTPUT_PATH)

['..\\data\\processed\\similarity_matrix.pkl']