In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Essentially 2 types of recommending systems, first, one based on content filtering, second, collaborative filtering. The project aims to achieve both if possible. 

1) In content filtering, the similarities between difffernt prodects are calculated over the base of the attributes of the products. For example, in a film recomending system based on content, the similarity is calculated based on genre, film actors, film directors, etc. 

2) Collaborative filtering, takes advantage of the power of volume. The backbone intuition is that if user A likes product Y and X, and user B likes product X, there are many possibilities that he will like Y too. Same example... lets supose we have a large number of users, that have assigned the same ratings to films X and Y. New user arrives and assigns same rating to Y, but still he has not seen X. The collaborative filtering system will recommend X to that user. It has two approaches; one based on the user and another based on the article/product. The collaborative filtering based on the article bases on the similarity between aricles. The one based on user is on the similarity of users. 

To make a recommendation system based on votes follow the link sent by Ras

Basados en la memoria: Se utilizan técnicas estadísticas para aproximar usuarios a los artículos. Correlación Pearson, Lasimilitud de Coseno, La Distancia Euclidiana. En los enfoques con modelos, es necesario crear usuarios con técnicas de machine learning como la regresión, agrupación o clasificación.

In [2]:
dfmov = pd.read_csv('../../data/imdb_movies_clean_1st.csv')

In [3]:
dfmov.isnull().sum()

Unnamed: 0            0
imdb_title_id         0
original_title        0
year                  0
genre                 0
duration              0
country               0
language              0
director              0
writer                0
production_company    0
actors                0
description           0
duration_sets         0
dtype: int64

This is an example of a content filtering recommending system specifically looking at the films names and description.

In [5]:
rec_cont = dfmov[['original_title','description']]

In [6]:
rec_cont = rec_cont[rec_cont[['original_title','description']] != 0]

In [7]:
print(rec_cont[['original_title','description']] != 0)

       original_title  description
0                True         True
1                True         True
2                True         True
3                True         True
4                True         True
...               ...          ...
85850            True         True
85851            True         True
85852            True         True
85853            True         True
85854            True         True

[85855 rows x 2 columns]


In [8]:
rec_cont

Unnamed: 0,original_title,description
0,Miss Jerry,The adventures of a female reporter in the 1890s.
1,The Story of the Kelly Gang,True story of notorious Australian outlaw Ned ...
2,Den sorte drøm,Two men of high rank are both wooing the beaut...
3,Cleopatra,The fabled queen of Egypt's affair with Roman ...
4,L'Inferno,Loosely adapted from Dante's Divine Comedy and...
...,...,...
85850,Le lion,A psychiatric hospital patient pretends to be ...
85851,De Beentjes van Sint-Hildegard,A middle-aged veterinary surgeon believes his ...
85852,Padmavyuhathile Abhimanyu,0
85853,Sokagin Çocuklari,0


## Content(Title/Description)-Based Recommender

In [9]:
take_out = (rec_cont != 0).any(axis=1)

In [10]:
rec_contn = rec_cont.loc[take_out]

In [11]:
rec_cont

Unnamed: 0,original_title,description
0,Miss Jerry,The adventures of a female reporter in the 1890s.
1,The Story of the Kelly Gang,True story of notorious Australian outlaw Ned ...
2,Den sorte drøm,Two men of high rank are both wooing the beaut...
3,Cleopatra,The fabled queen of Egypt's affair with Roman ...
4,L'Inferno,Loosely adapted from Dante's Divine Comedy and...
...,...,...
85850,Le lion,A psychiatric hospital patient pretends to be ...
85851,De Beentjes van Sint-Hildegard,A middle-aged veterinary surgeon believes his ...
85852,Padmavyuhathile Abhimanyu,0
85853,Sokagin Çocuklari,0


In [12]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [13]:
matrix = tf.fit_transform(rec_cont['description'])

In [14]:
matrix.shape
# 1920179 different types of voabularies or words in the dataset of 85855 movies

(85855, 1920179)

In [15]:
tf.get_feature_names()[6000:6010]

['1930s 50s movie',
 '1930s american',
 '1930s american scandalous',
 '1930s american socialite',
 '1930s amoral',
 '1930s amoral blonde',
 '1930s amsterdam',
 '1930s arab',
 '1930s arab states',
 '1930s area']

In [16]:
cosine_similarities = linear_kernel(matrix,matrix)

In [17]:
cosine_similarities[1]

array([0., 1., 0., ..., 0., 0., 0.])

In [18]:
movie_title = rec_cont['original_title']

In [19]:
indices = pd.Series(rec_cont.index, index=rec_cont['original_title']).drop_duplicates()

In [20]:
indices[:10]

original_title
Miss Jerry                                             0
The Story of the Kelly Gang                            1
Den sorte drøm                                         2
Cleopatra                                              3
L'Inferno                                              4
From the Manger to the Cross; or, Jesus of Nazareth    5
Madame DuBarry                                         6
Quo Vadis?                                             7
Independenta Romaniei                                  8
Richard III                                            9
dtype: int64

In [21]:
def movie_recommend(original_title, cosine_similarities=cosine_similarities):
    
    '''
    Fuction computes recommendation given a movie title and description
    '''

    idx = indices[original_title]

    sim_scores = list(enumerate(cosine_similarities[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:31]

    movie_indices = [i[0] for i in sim_scores]

    return movie_title.iloc[movie_indices]

In [24]:
movie_recommend('The Godfather').head(10)
# 10  recommended movies through the film The Godfather using original title and description columns

53975                                    Yangjamoolrihak
45869                                  Romanzo criminale
77259                                       Moving Parts
61578                                         Blood Ties
33539                                              Belly
15497    I familiari delle vittime non saranno avvertiti
43377                                       Sabita naifu
41165                                            Bookies
22349                                 Year of the Dragon
4968                                         Crime, Inc.
Name: original_title, dtype: object

In [25]:
movie_recommend('The Dark Knight Rises').head(10)

43935                 Batman Begins
58269               William Vincent
82580                  Batman Ninja
30399                Batman & Robin
81987    Batman: Gotham by Gaslight
24426                        Batman
48078               The Dark Knight
82239                         Joker
26413                Batman Returns
73755         The Lego Batman Movie
Name: original_title, dtype: object

In [26]:
movie_recommend('American Pie').head(10)

62887                Date and Switch
67690                       Blockers
46938              Another Gay Movie
21993                      Hot Moves
63249                Very Good Girls
16630    Es war nicht die Nachtigall
32869                American Virgin
25549                        Rockula
17386               Cherry Hill High
73450                 The Honor Farm
Name: original_title, dtype: object

In [66]:
movie_recommend('').head(10)

## Content (film) - Based Recommender

The recommending system can be finer with the addition of other features to our system like directors, actors, genres,...

In [28]:
dfmov.drop('Unnamed: 0', axis=1)

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,country,language,director,writer,production_company,actors,description,duration_sets
0,tt0000009,Miss Jerry,1894,Romance,45,USA,0,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,0 < 1h
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia,0,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,1h < 1h30m
2,tt0001892,Den sorte drøm,1911,Drama,53,"Germany, Denmark",0,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,0 < 1h
3,tt0002101,Cleopatra,1912,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,1h30m < 2h
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,1h < 1h30m
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,95,"France, Belgium",French,Ludovic Colbeau-Justin,"Alexandre Coquelle, Matthieu Le Naour",Monkey Pack Films,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",A psychiatric hospital patient pretends to be ...,1h30m < 2h
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",103,Netherlands,"German, Dutch",Johan Nijenhuis,"Radek Bajgar, Herman Finkers",Johan Nijenhuis & Co,"Herman Finkers, Johanna ter Steege, Leonie ter...",A middle-aged veterinary surgeon believes his ...,1h30m < 2h
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,130,India,Malayalam,Vineesh Aaradya,"Vineesh Aaradya, Vineesh Aaradya",RMCC Productions,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",0,2h < 2h30m
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",98,Turkey,Turkish,Ahmet Faik Akinci,"Ahmet Faik Akinci, Kasim Uçkan",Gizem Ajans,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",0,1h30m < 2h


In [29]:
rec_film = dfmov[['original_title', 'actors', 'director', 'writer', 'genre']]
# The dataframe used for this recommender system

In [30]:
type(rec_film)

pandas.core.frame.DataFrame

In [31]:
#rec_film.update('"' + rec_film[['actors', 'director', 'writer', 'genre']].astype(str) + '"')
#print(rec_film)

In [32]:
import re
import nltk

In [33]:
#rec_film['title'] = rec_film['original_title'].str.lower()
#rec_film['title'] = rec_film['title'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
#rec_film['title'] = rec_film['title'].apply(lambda x: re.sub('\s+', ' ', x))

In [34]:
# covert lowercase and remove punctuations, spaces...
rec_film['actors_clean'] = rec_film['actors'].str.lower()
rec_film['actors_clean'] = rec_film['actors_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
rec_film['actors_clean'] = rec_film['actors_clean'].apply(lambda x: re.sub('\s+', ' ', x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['actors_clean'] = rec_film['actors'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['actors_clean'] = rec_film['actors_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['actors_clean'] = rec_film['actors_clean'

In [35]:
rec_film['writers'] = rec_film['writer'].str.lower()
rec_film['writers'] = rec_film['writers'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
rec_film['writers'] = rec_film['writers'].apply(lambda x: re.sub('\s+', ' ', x))
rec_film['writers']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['writers'] = rec_film['writer'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['writers'] = rec_film['writers'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['writers'] = rec_film['writers'].apply(lambda x: re.sub(

0                             alexander black
1                                charles tait
2        urban gad gebhard sch tzler perasini
3                            victorien sardou
4                             dante alighieri
                         ...                 
85850    alexandre coquelle matthieu le naour
85851             radek bajgar herman finkers
85852         vineesh aaradya vineesh aaradya
85853           ahmet faik akinci kasim u kan
85854                     coral cruz pep puig
Name: writers, Length: 85855, dtype: object

In [36]:
rec_film['director_clean'] = rec_film['director'].str.lower()
rec_film['director_clean'] = rec_film['director_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
rec_film['director_clean'] = rec_film['director_clean'].apply(lambda x: re.sub('\s+', ' ', x))
rec_film['director_clean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['director_clean'] = rec_film['director'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['director_clean'] = rec_film['director_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['director_clean'] = rec_film['dir

0                           alexander black
1                              charles tait
2                                 urban gad
3                         charles l gaskill
4        francesco bertolini adolfo padovan
                        ...                
85850                ludovic colbeau justin
85851                       johan nijenhuis
85852                       vineesh aaradya
85853                     ahmet faik akinci
85854                             laura jou
Name: director_clean, Length: 85855, dtype: object

In [37]:
rec_film['genre_clean'] = rec_film['genre'].str.lower()
rec_film['genre_clean'] = rec_film['genre_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
rec_film['genre_clean'] = rec_film['genre_clean'].apply(lambda x: re.sub('\s+', ' ', x))
rec_film['genre_clean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['genre_clean'] = rec_film['genre'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['genre_clean'] = rec_film['genre_clean'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['genre_clean'] = rec_film['genre_clean'].appl

0                        romance
1          biography crime drama
2                          drama
3                  drama history
4        adventure drama fantasy
                  ...           
85850                     comedy
85851               comedy drama
85852                      drama
85853               drama family
85854                      drama
Name: genre_clean, Length: 85855, dtype: object

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/guillermo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
#rec_film['title'] = rec_film['title'].apply(lambda x: nltk.word_tokenize(x))
#rec_film['title']

In [40]:
# tokenize
rec_film['writers'] = rec_film['writers'].apply(lambda x: nltk.word_tokenize(x))
rec_film['writers']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['writers'] = rec_film['writers'].apply(lambda x: nltk.word_tokenize(x))


0                                 [alexander, black]
1                                    [charles, tait]
2        [urban, gad, gebhard, sch, tzler, perasini]
3                                [victorien, sardou]
4                                 [dante, alighieri]
                            ...                     
85850     [alexandre, coquelle, matthieu, le, naour]
85851               [radek, bajgar, herman, finkers]
85852           [vineesh, aaradya, vineesh, aaradya]
85853           [ahmet, faik, akinci, kasim, u, kan]
85854                       [coral, cruz, pep, puig]
Name: writers, Length: 85855, dtype: object

In [41]:
rec_film['actors_clean'] = rec_film['actors_clean'].apply(lambda x: nltk.word_tokenize(x))
rec_film['actors_clean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['actors_clean'] = rec_film['actors_clean'].apply(lambda x: nltk.word_tokenize(x))


0        [blanche, bayliss, william, courtenay, chaunce...
1        [elizabeth, tait, john, tait, norman, campbell...
2        [asta, nielsen, valdemar, psilander, gunnar, h...
3        [helen, gardner, pearl, sindelar, miss, fieldi...
4        [salvatore, papa, arturo, pirovano, giuseppe, ...
                               ...                        
85850    [dany, boon, philippe, katerine, anne, serra, ...
85851    [herman, finkers, johanna, ter, steege, leonie...
85852    [anoop, chandran, indrans, sona, nair, simon, ...
85853    [ahmet, faik, akinci, belma, mamati, metin, ke...
85854    [maria, morera, colomer, biel, rossell, pelfor...
Name: actors_clean, Length: 85855, dtype: object

In [42]:
rec_film['director_clean'] = rec_film['director_clean'].apply(lambda x: nltk.word_tokenize(x))
rec_film['director_clean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['director_clean'] = rec_film['director_clean'].apply(lambda x: nltk.word_tokenize(x))


0                             [alexander, black]
1                                [charles, tait]
2                                   [urban, gad]
3                          [charles, l, gaskill]
4        [francesco, bertolini, adolfo, padovan]
                          ...                   
85850                 [ludovic, colbeau, justin]
85851                         [johan, nijenhuis]
85852                         [vineesh, aaradya]
85853                      [ahmet, faik, akinci]
85854                               [laura, jou]
Name: director_clean, Length: 85855, dtype: object

In [43]:
rec_film['genre_clean'] = rec_film['genre_clean'].apply(lambda x: nltk.word_tokenize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['genre_clean'] = rec_film['genre_clean'].apply(lambda x: nltk.word_tokenize(x))


In [44]:
from nltk.corpus import stopwords

In [45]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/guillermo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
#stop_words = nltk.corpus.stopwords.words('english')
#plot = []
#for sentence in rec_film['title']:
 #   temp = []
  #  for word in sentence:
   #     if word not in stop_words or len(word) >= 3:
    #        temp.append(word)
   # plot.append(temp)
    
#plot

In [47]:
#rec_film['title'] = plot

In [48]:
# remove stop words
stop_words = nltk.corpus.stopwords.words('english')
plot = []
for sentence in rec_film['actors_clean']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) >= 3:
            temp.append(word)
    plot.append(temp)
    
plot

[['blanche', 'bayliss', 'william', 'courtenay', 'chauncey', 'depew'],
 ['elizabeth',
  'tait',
  'john',
  'tait',
  'norman',
  'campbell',
  'bella',
  'cola',
  'will',
  'coyne',
  'sam',
  'crewes',
  'jack',
  'ennis',
  'john',
  'forde',
  'vera',
  'linden',
  'mr',
  'marshall',
  'mr',
  'mckenzie',
  'frank',
  'mills',
  'ollie',
  'wilson'],
 ['asta',
  'nielsen',
  'valdemar',
  'psilander',
  'gunnar',
  'helsengreen',
  'emil',
  'albes',
  'hugo',
  'flink',
  'mary',
  'hagen'],
 ['helen',
  'gardner',
  'pearl',
  'sindelar',
  'miss',
  'fielding',
  'miss',
  'robson',
  'helene',
  'costello',
  'charles',
  'sindelar',
  'mr',
  'howard',
  'james',
  'r',
  'waite',
  'mr',
  'osborne',
  'harry',
  'knowles',
  'mr',
  'paul',
  'mr',
  'brady',
  'mr',
  'corker'],
 ['salvatore',
  'papa',
  'arturo',
  'pirovano',
  'giuseppe',
  'de',
  'liguoro',
  'pier',
  'delle',
  'vigne',
  'augusto',
  'milla',
  'attilio',
  'motta',
  'emilise',
  'beretta'],
 ['r

In [49]:
rec_film['actors_clean'] = plot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['actors_clean'] = plot


In [50]:
rec_film['actors_clean']

0        [blanche, bayliss, william, courtenay, chaunce...
1        [elizabeth, tait, john, tait, norman, campbell...
2        [asta, nielsen, valdemar, psilander, gunnar, h...
3        [helen, gardner, pearl, sindelar, miss, fieldi...
4        [salvatore, papa, arturo, pirovano, giuseppe, ...
                               ...                        
85850    [dany, boon, philippe, katerine, anne, serra, ...
85851    [herman, finkers, johanna, ter, steege, leonie...
85852    [anoop, chandran, indrans, sona, nair, simon, ...
85853    [ahmet, faik, akinci, belma, mamati, metin, ke...
85854    [maria, morera, colomer, biel, rossell, pelfor...
Name: actors_clean, Length: 85855, dtype: object

In [51]:
stop_words = nltk.corpus.stopwords.words('english')
plot = []
for sentence in rec_film['writers']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) >= 3:
            temp.append(word)
    plot.append(temp)
    
plot

[['alexander', 'black'],
 ['charles', 'tait'],
 ['urban', 'gad', 'gebhard', 'sch', 'tzler', 'perasini'],
 ['victorien', 'sardou'],
 ['dante', 'alighieri'],
 ['gene', 'gauntier'],
 ['norbert', 'falk', 'hanns', 'kr', 'ly'],
 ['henryk', 'sienkiewicz', 'enrico', 'guazzoni'],
 ['aristide', 'demetriade', 'petre', 'liciu'],
 ['james', 'keane', 'william', 'shakespeare'],
 ['axel', 'garde', 'gerhart', 'hauptmann'],
 ['marcel', 'allain', 'louis', 'feuillade'],
 ['nils', 'krok'],
 ['marcel', 'allain', 'louis', 'feuillade'],
 ['emiliano', 'bonetti', 'g', 'monleone'],
 ['alfred', 'machin'],
 ['marcel', 'allain', 'louis', 'feuillade'],
 ['w', 'griffith', 'h', 'e', 'aitken'],
 ['hanns', 'heinz', 'ewers', 'hanns', 'heinz', 'ewers'],
 [],
 ['edward', 'george', 'bulwer', 'lytton', 'mario', 'caserini'],
 ['salvatore', 'di', 'giacomo', 'francesca', 'bertini'],
 ['edgar', 'allan', 'poe', 'w', 'griffith'],
 ['william', 'h', 'clifford', 'thomas', 'h', 'ince'],
 ['gabriele', 'annunzio', 'titus', 'livius'],
 [

In [52]:
rec_film['writers'] = plot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['writers'] = plot


In [53]:
stop_words = nltk.corpus.stopwords.words('english')
plot = []
for sentence in rec_film['director_clean']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) >= 3:
            temp.append(word)
    plot.append(temp)
    

In [54]:
rec_film['director_clean'] = plot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['director_clean'] = plot


In [55]:
stop_words = nltk.corpus.stopwords.words('english')
plot = []
for sentence in rec_film['genre_clean']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) >= 3:
            temp.append(word)
    plot.append(temp)
    

In [56]:
rec_film['genre_clean'] = plot

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['genre_clean'] = plot


In [57]:
# Combining all columns data
columns = ['actors_clean', 'director_clean', 'writers', 'genre_clean']
l = []
for i in range(len(rec_film)):
    words = ''
    for col in columns:
        words += ' '.join(rec_film[col][i]) + ' '
    l.append(words)
l

['blanche bayliss william courtenay chauncey depew alexander black alexander black romance ',
 'elizabeth tait john tait norman campbell bella cola will coyne sam crewes jack ennis john forde vera linden mr marshall mr mckenzie frank mills ollie wilson charles tait charles tait biography crime drama ',
 'asta nielsen valdemar psilander gunnar helsengreen emil albes hugo flink mary hagen urban gad urban gad gebhard sch tzler perasini drama ',
 'helen gardner pearl sindelar miss fielding miss robson helene costello charles sindelar mr howard james r waite mr osborne harry knowles mr paul mr brady mr corker charles l gaskill victorien sardou drama history ',
 'salvatore papa arturo pirovano giuseppe de liguoro pier delle vigne augusto milla attilio motta emilise beretta francesco bertolini adolfo padovan dante alighieri adventure drama fantasy ',
 'r henderson bland percy dyer gene gauntier alice hollister samuel morgan james ainsley robert g vignola george kellog j p mcgowan sidney olcot

In [58]:
rec_film['clean_input'] = l
rec_film = rec_film[['original_title', 'clean_input']]
rec_film.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_film['clean_input'] = l


Unnamed: 0,original_title,clean_input
0,Miss Jerry,blanche bayliss william courtenay chauncey dep...
1,The Story of the Kelly Gang,elizabeth tait john tait norman campbell bella...
2,Den sorte drøm,asta nielsen valdemar psilander gunnar helseng...
3,Cleopatra,helen gardner pearl sindelar miss fielding mis...
4,L'Inferno,salvatore papa arturo pirovano giuseppe de lig...


In [59]:
rec_film

Unnamed: 0,original_title,clean_input
0,Miss Jerry,blanche bayliss william courtenay chauncey dep...
1,The Story of the Kelly Gang,elizabeth tait john tait norman campbell bella...
2,Den sorte drøm,asta nielsen valdemar psilander gunnar helseng...
3,Cleopatra,helen gardner pearl sindelar miss fielding mis...
4,L'Inferno,salvatore papa arturo pirovano giuseppe de lig...
...,...,...
85850,Le lion,dany boon philippe katerine anne serra samuel ...
85851,De Beentjes van Sint-Hildegard,herman finkers johanna ter steege leonie ter b...
85852,Padmavyuhathile Abhimanyu,anoop chandran indrans sona nair simon britto ...
85853,Sokagin Çocuklari,ahmet faik akinci belma mamati metin ke eci bu...


In [60]:
rec_film.to_csv('../../data/clean_title_input_movies.csv')

## Feature Extraction

The major difference between CountVectorizer() and TF-IDF is the inverse document frequency (IDF) component which is present in later and not in the former.

One key difference is that you use the CountVectorizer() instead of TF-IDF. This is because you do not want to down-weight the actor/director's presence if he or she has acted or directed in relatively more movies. It doesn't make much intuitive sense to down-weight them in this context

### TF-IDF

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
tfid = TfidfVectorizer()
features = tfid.fit_transform(rec_film['clean_input'])

In [63]:
# create cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(features, features)
print(cosine_sim)

MemoryError: Unable to allocate 54.9 GiB for an array with shape (85855, 85855) and data type float64

### CountVectorizer

In [63]:
#count = CountVectorizer(stop_words='english')
#c_matrix = count.fit_transform(rec_film['clean_input'])

In [65]:
#cosine_sim2 = cosine_similarity(c_matrix, c_matrix)
#print(cosine_sim2)

## Movie Recommendation

In [50]:
index = pd.Series(rec_film['original_title'])
index.head()

0                     Miss Jerry
1    The Story of the Kelly Gang
2                 Den sorte drøm
3                      Cleopatra
4                      L'Inferno
Name: original_title, dtype: object

In [57]:
def recommend_movies(title):
    '''
    Based on the index we are getting similarity matrix representation and calculation
    '''
    movies = []
    idx = index[index == title].index[0]
    #print(idx)
    score = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # The higher the score the better
    top10 = list(score.iloc[1:11].index)
    #print(top10)
    
    for i in top10:
        movies.append(rec_film['original_title'][i])
    return movies

In [58]:
recommend_movies('Cleopatra')

['The Story of the Kelly Gang',
 'My Ain Folk',
 'Dark Floors',
 'Pathfinders: In the Company of Strangers',
 'Mighty Joe Young',
 'Narco Valley',
 'Riot',
 'D.C. Cab',
 'Karate baka ichidai',
 'Another Forever']

In [59]:
index[index == 'Cleopatra'].index[0]

3

In [60]:
pd.Series(cosine_sim[3]).sort_values(ascending=False)
# We ignore the first one as its the film doing the reccomend of

3        1.000000
1        0.246818
16161    0.208614
52059    0.183434
56142    0.178100
           ...   
31644    0.000000
62780    0.000000
62777    0.000000
62775    0.000000
0        0.000000
Length: 85855, dtype: float64

In [61]:
recommend_movies('Avatar')

['Aliens',
 'Beyond Glory',
 'Vantage Point',
 'Like Father Like Son',
 'Go Tell It on the Mountain',
 'Some Girl',
 'Cosmos',
 'Fast & Furious',
 'Space Mutiny',
 'Man on a Ledge']

In [76]:
import pickle

In [78]:
rec_film.to_pickle('../../data/input_clean_pic.pkl')

In [None]:
recommend_movies('')