## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline

In [None]:
# Base directory (relative to the working directory) that holds the MovieLens
# CSV files. Later cells build file names with `path + '<name>'`, so the
# trailing slash is required.
# NOTE(review): this looks like a mounted Google Drive folder -- confirm the
# drive is mounted before running the notebook from a fresh kernel.
path = 'drive/MyDrive/ADM/movielens/'

## Loading Tag and Ratings Data

In [None]:
# Read the tag assignments and report how many distinct movies carry tags.
tags_data = pd.read_csv(path + 'tags.csv')
print("Unique movies in tags.csv: {}".format(tags_data['movieId'].nunique(dropna=False)))
print("Shape of tags_data: {}".format(tags_data.shape))

# Read the ratings and keep a single row per movie (the first one in file
# order). From here on `ratings_data` serves as the list of rated movies; the
# userId/rating/timestamp values on each surviving row are incidental.
ratings_data = pd.read_csv(path + 'ratings.csv').drop_duplicates(subset='movieId')
print("Unique movies in ratings.csv: {}".format(ratings_data['movieId'].nunique(dropna=False)))

19545 unique movies in tags.csv
the tags data has (465564, 4) shape
26744 unique movies in ratings.csv


## Preprocessing

In [None]:
# Map every raw movieId to a sequential, continuous 0-based index
# (positions follow the order in which the ids appear in ratings_data).
movieID_unique = ratings_data['movieId'].unique()
movie2index = dict(zip(movieID_unique, range(len(movieID_unique))))

In [None]:
# Preview the raw tag rows (still keyed by the original movieId values)
# before they are merged with the ratings and re-indexed.
tags_data.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078
5,65,668,bollywood,1368149876
6,65,898,screwball comedy,1368150160
7,65,1248,noir thriller,1368149983
8,65,1391,mars,1368150055
9,65,1617,neo-noir,1368150217


In [None]:
# Re-key tags_data by the sequential movie index.
# NOTE: this cell overwrites tags_data, so it must run exactly once after the
# loading cell; re-running it would merge already re-indexed ids against raw ids.
#
# The right join keeps at least one row for every movie in ratings_data, so
# movies that were never tagged still appear (with a missing tag).
tags_data = pd.merge(tags_data, ratings_data, on="movieId", how="right")

# After the right join every movieId comes from ratings_data, which is what
# movie2index was built from, so each lookup has to resolve. A dict passed to
# .map() yields NaN for unknown keys, hence the explicit check.
tags_data.movieId = tags_data.movieId.map(movie2index)
assert tags_data.movieId.notna().all(), "movieId missing from movie2index"

# This count describes the merged frame (tagged and untagged rated movies),
# not the raw tags.csv file, so it is labelled accordingly.
print("Unique movies after merging tags with ratings: {}".format(len(tags_data.movieId.unique())))
print(tags_data.shape)
tags_data.head()

26744 unique movies in tags.csv
(472537, 7)


Unnamed: 0,userId_x,movieId,tag,timestamp_x,userId_y,rating,timestamp_y
0,1629.0,0,time travel,1394473000.0,1,3.5,1112486027
1,1741.0,0,adapted from:book,1178997000.0,1,3.5,1112486027
2,1741.0,0,board game,1182730000.0,1,3.5,1112486027
3,1741.0,0,childhood recaptured,1178997000.0,1,3.5,1112486027
4,1741.0,0,game,1178997000.0,1,3.5,1112486027


In [None]:
# Load the movie metadata and re-key it by the sequential movie index.
movies_data = pd.read_csv(path + 'movies.csv')
print("Unique movies in movies.csv: {}".format(len(movies_data.movieId.unique())))
print("Unique movies in ratings.csv: {}".format(len(ratings_data.movieId.unique())))

# The inner join drops every movie that has no row in ratings_data, so all
# remaining ids are guaranteed to be keys of movie2index.
movies_data = pd.merge(movies_data, ratings_data, on="movieId", how="inner")
movies_data.movieId = movies_data.movieId.map(movie2index)
assert movies_data.movieId.notna().all(), "movieId missing from movie2index"
print("Unique movies after merging movies with ratings: {}".format(len(movies_data.movieId.unique())))

movies_data = movies_data.set_index('movieId')

# Turn the '|'-separated genre list into space-separated words and strip
# hyphens so a hyphenated genre name stays a single word.
# regex=False is explicit because "|" is the regex alternation operator and
# the default of `regex` differs between pandas versions; both patterns are
# meant as literal text.
movies_data['genres'] = movies_data['genres'].str.replace(pat="|", repl=" ", regex=False)
movies_data['genres'] = movies_data['genres'].str.replace(pat="-", repl="", regex=False)
movies_data.query('title == "Primer (2004)"')

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in movies.csv


Unnamed: 0_level_0,title,genres,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3006,Primer (2004),Drama SciFi,96,4.5,1402187423


In [None]:
# Build one tag "document" per movie.
# Missing values are replaced by empty strings first so that movies without
# any tag can still be joined; their document ends up as just "{}".
tags_data = tags_data.fillna("")
tags_data = (
    tags_data.groupby('movieId')['tag']
    .apply(lambda movie_tags: "{%s}" % ' '.join(movie_tags))
    .reset_index()
)
movieID = tags_data['movieId']
print("Unique movies: {}".format(len(movieID)))
tags_data.sample(10)

There are 26744 unique movies


Unnamed: 0,movieId,tag
1330,1330,{Can't remember Samuel L. Jackson Reginald Hud...
22121,22121,{}
3263,3263,{zombies campy gross out lawn mower zombies cu...
22937,22937,{}
15739,15739,{}
4124,4124,{Own adultery Bechdel Test:Fail comedy about w...
4389,4389,{Betamax}
25399,25399,{Israel Judaism Palestine religion}
15841,15841,{}
20942,20942,{}


In [None]:
# Attach the movie metadata to each tag document and build the final text.
# movies_data is indexed by the sequential movie index, hence left_index=True
# against the movieId column of tags_data; the right join keeps every tag
# document even if no metadata row matches.
tags_data = pd.merge(movies_data, tags_data, left_index=True, right_on='movieId', how='right')

# document = "<tags> <genres>", built column-wise instead of with a row-wise
# apply. A movie without genre metadata contributes an empty genre string
# rather than failing on a non-string value.
tags_data['document'] = tags_data['tag'] + ' ' + tags_data['genres'].fillna('')
tags_data.head(3)

Unnamed: 0,title,genres,userId,rating,timestamp,movieId,tag,document
0,Jumanji (1995),Adventure Children Fantasy,1,3.5,1112486027,0,{time travel adapted from:book board game chil...,{time travel adapted from:book board game chil...
1,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery SciFi,1,3.5,1112484676,1,{children Santa Claus dystopia abused children...,{children Santa Claus dystopia abused children...
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery SciFi Thriller,1,3.5,1112484819,2,{post-apocalyptic psychology time travel Brad ...,{post-apocalyptic psychology time travel Brad ...


In [None]:
# Spot-check the row (title, genres, tags, document) for movie index 19849.
tags_data.loc[tags_data['movieId'] == 19849]

Unnamed: 0,title,genres,userId,rating,timestamp,movieId,tag,document
19849,"Waiting Game, The (2000)",Comedy,29879,2.5,1161368462,19849,{},{} Comedy


## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each movie document into a TF-IDF vector over single-word terms.
TFIDF = TfidfVectorizer(
    # Unigrams only. n-gram sizes start at 1, so the previous lower bound of 0
    # was not a meaningful value.
    ngram_range=(1, 1),
    # A float min_df is a fraction of the documents: terms appearing in fewer
    # than 0.01% of the movie documents are dropped from the vocabulary.
    min_df=0.0001,
    stop_words='english')
TFIDF_matrix = TFIDF.fit_transform(tags_data['document'])

# NOTE: .toarray() turns the sparse result into a dense (n_movies x n_terms)
# array, which is memory hungry for a large vocabulary. Columns are positional
# integers; TFIDF.vocabulary_ maps each term to its column.
TFIDF_df = pd.DataFrame(TFIDF_matrix.toarray(), index=tags_data.index.tolist())
print(TFIDF_df.shape)
TFIDF_df.head(3)

(26744, 9697)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9657,9658,9659,9660,9661,9662,9663,9664,9665,9666,9667,9668,9669,9670,9671,9672,9673,9674,9675,9676,9677,9678,9679,9680,9681,9682,9683,9684,9685,9686,9687,9688,9689,9690,9691,9692,9693,9694,9695,9696
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# TFIDF.vocabulary_ maps each term to its column in the TF-IDF matrix.
# Displaying the whole mapping floods the notebook, so report its size and
# show only the terms behind the first 20 columns.
print("Vocabulary size: {}".format(len(TFIDF.vocabulary_)))
pd.Series(TFIDF.vocabulary_, name='column').sort_values().head(20)

{'time': 8713,
 'travel': 8855,
 'adapted': 202,
 'book': 1059,
 'board': 1020,
 'game': 3383,
 'childhood': 1553,
 'recaptured': 7141,
 'herds': 3960,
 'cgi': 1462,
 'animals': 452,
 'scary': 7624,
 'fantasy': 3003,
 'robin': 7355,
 'williams': 9452,
 'joe': 4606,
 'johnston': 4617,
 'kid': 4799,
 'flick': 3188,
 'jungle': 4680,
 'adventure': 226,
 'children': 1555,
 'dynamic': 2613,
 'action': 186,
 'kirsten': 4838,
 'dunst': 2588,
 'childish': 1554,
 'filmed': 3125,
 'bc': 815,
 'bad': 706,
 'based': 788,
 'chris': 1589,
 'van': 9128,
 'allsburg': 342,
 'magic': 5374,
 'monkey': 5883,
 'saturn': 7593,
 'award': 675,
 'best': 904,
 'special': 8145,
 'effects': 2679,
 'supporting': 8428,
 'actress': 191,
 'clv': 1712,
 'horrifying': 4116,
 'horror': 4117,
 'genre': 3470,
 'kids': 4804,
 'santa': 7567,
 'claus': 1652,
 'dystopia': 2620,
 'abused': 168,
 'surreal': 8437,
 'bleak': 984,
 'dark': 2134,
 'visually': 9238,
 'appealing': 510,
 'atmospheric': 615,
 'dreamlike': 2523,
 'fancif

In [None]:
# Persist the dense TF-IDF frame next to the source CSV files.
TFIDF_df.to_pickle(os.path.join(path, 'TFIDF_matrix.pkl'))