In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [61]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [62]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [63]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [64]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [65]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [128]:
movie_genres

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller',
 'Comedy Drama Romance',
 'Comedy Horror',
 'Adventure Animation Children',
 'Drama',
 'Action Adventure Romance',
 'Crime Drama',
 'Drama Romance',
 'Comedy',
 'Comedy',
 'Action Comedy Crime Drama Thriller',
 'Comedy Crime Thriller',
 'Crime Drama Horror Mystery Thriller',
 'Action Crime Thriller',
 'Drama SciFi',
 'Drama Romance',
 'Drama',
 'Children Drama',
 'Drama Romance',
 'Adventure Drama Fantasy Mystery SciFi',
 'Crime Drama',
 'Drama',
 'Mystery SciFi Thriller',
 'Children Drama',
 'Crime Drama',
 'Children Comedy',
 'Comedy Romance',
 'Drama',
 'Drama War',
 'Action Crime Drama',
 'Drama',
 'Action Adventure Fantasy',
 'Comedy Drama Thriller',
 'Drama Romance',
 'Mystery Thriller',
 'Animation Children Drama Musical Romance',
 'Drama R

In [67]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [82]:
# print(X_train_counts)

In [69]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [84]:
# print(X_train_tfidf)

In [72]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [73]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [74]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]], dtype=int64))

In [75]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
6723,58972,Nim's Island (2008),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
7496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
9717,188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [93]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [94]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [95]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [112]:
movies_with_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683 entries, 0 to 9732
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    3683 non-null   int64  
 1   title      3683 non-null   object 
 2   genres     3683 non-null   object 
 3   userId     3683 non-null   float64
 4   tag        3683 non-null   object 
 5   timestamp  3683 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 201.4+ KB


In [96]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [117]:
movies_with_tags.groupby('title').count()

Unnamed: 0_level_0,movieId,genres,userId,tag,timestamp
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(500) Days of Summer (2009),8,8,8,8,8
...And Justice for All (1979),1,1,1,1,1
10 Cloverfield Lane (2016),2,2,2,2,2
10 Things I Hate About You (1999),1,1,1,1,1
101 Dalmatians (1996),2,2,2,2,2
...,...,...,...,...,...
Zero Dark Thirty (2012),5,5,5,5,5
Zombieland (2009),6,6,6,6,6
Zoolander (2001),6,6,6,6,6
Zulu (1964),1,1,1,1,1


In [97]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0


In [98]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [99]:
movies_with_tags.tag.unique().shape

(1590,)

In [100]:
movies_with_tags.dropna(inplace=True)

In [101]:
movies_with_tags.title.unique().shape

(1572,)

In [122]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,artistic,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,Funny,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,humorous,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,inspiring,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,intelligent,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,quirky,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,romance,1279956000.0
7075,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,477.0,Zooey Deschanel,1279956000.0


In [121]:
for movie, group in tqdm(movies_with_tags.groupby('title')):
    print(group.tag.values, movie)
    break

  0%|          | 0/1572 [00:00<?, ?it/s]

['artistic' 'Funny' 'humorous' 'inspiring' 'intelligent' 'quirky'
 'romance' 'Zooey Deschanel'] (500) Days of Summer (2009)


In [102]:
tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [127]:
movies_with_tags.groupby('title').tag

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000022249BCFC10>

In [103]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [123]:
movies[:5]

['(500) Days of Summer (2009)',
 '...And Justice for All (1979)',
 '10 Cloverfield Lane (2016)',
 '10 Things I Hate About You (1999)',
 '101 Dalmatians (1996)']

In [104]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [105]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [106]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [107]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

822


In [108]:
tag_strings[822]

'L.A.'

In [109]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [110]:
res

(array([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[661, 822, 947, 955, 954, 953, 951, 950, 949, 959]], dtype=int64))

In [111]:
for i in res[1][0]:
    print(movies[i])

In a Lonely Place (1950)
Magnolia (1999)
Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
Night and Day (1946)
Nicholas Nickleby (2002)
Niagara (1953)
Never Been Kissed (1999)
Network (1976)
Net, The (1995)
Night of the Hunter, The (1955)
