In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import re

In [2]:
df = pd.read_csv('movies_db.csv')
df.head()

Unnamed: 0,Actores,Aspect Ratio:,Box Office (Gross USA):,Director:,Genre:,Nombre,Original Language:,Producer:,Production Co:,Rating:,Release Date (Streaming):,Release Date (Theaters):,Runtime:,Sound Mix:,View the collection:,Writer:
0,"Natalie Portman, Jennifer Jason Leigh, Gina Ro...",Scope (2.35:1),$32.7M,Alex Garland,"Adventure, Sci Fi, Mystery And Thriller",Annihilation (2018),English,"Scott Rudin, Andrew Macdonald, Allon Reich, El...","Scott Rudin Productions, DNA Films",R (Language|Bloody Images|Some Sexuality|Viole...,"Apr 22, 2018","Feb 23, 2018 Wide",1h 55m,Dolby Digital,,Alex Garland
1,"Bob Hoskins, Christopher Lloyd, Joanna Cassidy...",,$152.8M,Robert Zemeckis,Comedy,Who Framed Roger Rabbit (1988),English,"Robert Watts, Frank Marshall",Touchstone Pictures,PG,"Dec 13, 2015","Jun 21, 1988 Wide",1h 44m,Surround,,"Jeffrey Price, Peter S. Seaman"
2,"Leonardo DiCaprio, Tom Hanks, Christopher Walk...",,$164.4M,Steven Spielberg,"Crime, Drama, Comedy",Catch Me If You Can (2002),English,"Walter F. Parkes, Steven Spielberg","Amblin Entertainment, DreamWorks SKG, Parkes/M...",PG-13 (Some Sexual Content|Brief Language),"Aug 1, 2013","Dec 25, 2002 Wide",2h 20m,Surround,,Jeff Nathanson
3,"Steve McQueen, Robert Vaughn, Jacqueline Bisse...",Flat (1.85:1),,Peter Yates,"Drama, Crime",Bullitt (1968),English,Philip D'Antoni,Solar Productions,PG,"Sep 1, 2008","Oct 17, 1968 Wide",1h 53m,Stereo,,"Alan R. Trustman, Harry Kleiner"
4,"Matthew Broderick, Jeremy Irons, James Earl Jo...",Flat (1.85:1),$422.8M,"Roger Allers, Rob Minkoff","Adventure, Kids And Family, Animation, Musical",The Lion King (1994),English,Don Hahn,Walt Disney Pictures,G,"Aug 15, 2017","Jun 24, 1994 Wide",1h 27m,"Dolby SR, DTS, Dolby Stereo, SDDS, DTS-ES, Dol...",,"Irene Mecchi, Jonathan Roberts, Linda Woolverton"


In [3]:
df.rename(columns={'Director:': 'Director', 'Producer:':'Producer', 'Production Co:': 'Production_Co',\
                   'Rating:': 'Rating', 'Writer:':'Writer', 'Genre:':'Genre'}, inplace=True)

In [4]:
df = df[['Nombre', 'Actores', 'Director']]

In [5]:
df.head()

Unnamed: 0,Nombre,Actores,Director
0,Annihilation (2018),"Natalie Portman, Jennifer Jason Leigh, Gina Ro...",Alex Garland
1,Who Framed Roger Rabbit (1988),"Bob Hoskins, Christopher Lloyd, Joanna Cassidy...",Robert Zemeckis
2,Catch Me If You Can (2002),"Leonardo DiCaprio, Tom Hanks, Christopher Walk...",Steven Spielberg
3,Bullitt (1968),"Steve McQueen, Robert Vaughn, Jacqueline Bisse...",Peter Yates
4,The Lion King (1994),"Matthew Broderick, Jeremy Irons, James Earl Jo...","Roger Allers, Rob Minkoff"


In [6]:
def preprocesar_nombre(texto):
  #convierte a minúsculas
  texto = texto.lower()

  #quita puntuaciones y todo lo que no sea letra y números
  texto = re.sub('[^A-ZÜÖÄãüáéíóúa-z]+', ' ', texto)
  texto = re.sub('[0-9]+', '', texto)
  texto= re.sub('<.*?>', '', texto)
    
  texto = texto.strip()
  
  return(texto)

In [7]:
df['Nombre'] = df['Nombre'].apply(preprocesar_nombre)
df['Actores'] = df['Actores'].apply(preprocesar_nombre)
df['Director'] = df['Director'].apply(preprocesar_nombre)

In [8]:
#df['Actores'] = df['Actores'].str.lower()
#df['Director'] = df['Director'].str.lower()
#df['Genre'] = df['Genre'].str.lower()
#df['Producer'] = df['Producer'].str.lower()
#df['Production_Co'] = df['Production_Co'].str.lower()
#df['Writer'] = df['Writer'].str.lower()

In [9]:
df.fillna('', inplace=True)

In [10]:
df['Id'] = df.index + 1
df.head()

Unnamed: 0,Nombre,Actores,Director,Id
0,annihilation,natalie portman jennifer jason leigh gina rodr...,alex garland,1
1,who framed roger rabbit,bob hoskins christopher lloyd joanna cassidy s...,robert zemeckis,2
2,catch me if you can,leonardo dicaprio tom hanks christopher walken...,steven spielberg,3
3,bullitt,steve mcqueen robert vaughn jacqueline bisset ...,peter yates,4
4,the lion king,matthew broderick jeremy irons james earl jone...,roger allers rob minkoff,5


In [11]:
df['features'] = df['Nombre'] + ', ' + df['Actores'] + ', ' + df['Director']# + ', ' + df['Genre']\
#+ ', ' + df['Producer'] + ', ' + df['Production_Co'] + ', ' + df['Writer'] 

df.head()

Unnamed: 0,Nombre,Actores,Director,Id,features
0,annihilation,natalie portman jennifer jason leigh gina rodr...,alex garland,1,"annihilation, natalie portman jennifer jason l..."
1,who framed roger rabbit,bob hoskins christopher lloyd joanna cassidy s...,robert zemeckis,2,"who framed roger rabbit, bob hoskins christoph..."
2,catch me if you can,leonardo dicaprio tom hanks christopher walken...,steven spielberg,3,"catch me if you can, leonardo dicaprio tom han..."
3,bullitt,steve mcqueen robert vaughn jacqueline bisset ...,peter yates,4,"bullitt, steve mcqueen robert vaughn jacquelin..."
4,the lion king,matthew broderick jeremy irons james earl jone...,roger allers rob minkoff,5,"the lion king, matthew broderick jeremy irons ..."


In [12]:
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(df['features'])

In [15]:
#print(vectorizer.get_feature_names())
#print(cm)

In [16]:
cs = cosine_similarity(cm)

In [17]:
cs

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.05407381, ..., 0.0571662 , 0.04914732,
        0.        ],
       [0.        , 0.05407381, 1.        , ..., 0.11128298, 0.        ,
        0.05735393],
       ...,
       [0.        , 0.0571662 , 0.11128298, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.04914732, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.05735393, ..., 0.        , 0.        ,
        1.        ]])

In [18]:
cs.shape

(100, 100)

In [25]:
#title = 'the lego movie'
title = 'who framed roger rabbit'
movie_id = df[df.Nombre == title]['Id'].values[0]

In [26]:
movie_id

2

In [27]:
scores = list(enumerate(cs[movie_id]))

In [28]:
sorted_scores = sorted(scores, key = lambda x:x[1], reverse=True)

In [29]:
sorted_scores = sorted_scores[1:]

In [30]:
contador = 0

for item in sorted_scores:
    nombre = df[df.Id == item[0]]['Nombre'].values[0]
    print(contador+1, nombre)
    contador += 1
    if contador>6:
        break

1 the lion king
2 gun crazy deadly is the female
3 up
4 coco
5 mission impossible rogue nation
6 mad max fury road
7 the hurt locker
