# empezaremos por hacer eval(), explode(), json_normalize y merge a los datasets credits y movies

In [1]:
import pandas as pd
credits = pd.read_csv('credits.csv', low_memory=False)
movies = pd.read_csv('movies_dataset.csv', low_memory=False)
print(credits.shape)
print(movies.shape)

(45476, 3)
(45466, 24)


### Ambas columnas tienen en comun el id de la pelicula.
### Se eliminaran los nulos y duplicados en cada una, y luego se uniran las tablas conservando los valores que esten en comun

### Primero se hara con credits.csv

In [2]:
# revisando cuantas filas estan COMPLETAMENTE duplicadas
credits[credits.duplicated()==True].shape

(37, 3)

In [3]:
# eliminando filas COMPLETAMENTE DUPLICADAS
credits.drop_duplicates(inplace=True)
# Revisando la eliminacion
credits[credits.duplicated()==True].shape

(0, 3)

In [4]:
# Revisando duplicacion de id donde algun otro atributo es diferente
print(credits[credits.duplicated(subset=['id'], keep=False)==True].shape)

credits[credits.duplicated(subset=['id'], keep=False)==True].sort_values(by='id')
# se observa a primera vista que la diferencia se encuentra en la columna crew, al menos en el valor de la llave credit_id

(14, 3)


Unnamed: 0,cast,crew,id
5865,"[{'cast_id': 15, 'character': 'Chuck Barris', ...","[{'credit_id': '52fe43e2c3a36847f80760b5', 'de...",4912
33838,"[{'cast_id': 15, 'character': 'Chuck Barris', ...","[{'credit_id': '52fe43e2c3a36847f80760a9', 'de...",4912
26638,"[{'cast_id': 1, 'character': 'Gerard Carriere'...","[{'credit_id': '52fe47bfc3a368484e0d77bf', 'de...",69234
9576,"[{'cast_id': 1, 'character': 'Gerard Carriere'...","[{'credit_id': '5468acec22136e68c9000d53', 'de...",69234
38882,"[{'cast_id': 1004, 'character': 'Luke Oarum', ...","[{'credit_id': '52fe4a269251416c750df61d', 'de...",99080
16167,"[{'cast_id': 1004, 'character': 'Luke Oarum', ...","[{'credit_id': '52fe4a269251416c750df623', 'de...",99080
23533,"[{'cast_id': 3, 'character': 'Camille Claudel'...","[{'credit_id': '577ed5389251416976004432', 'de...",110428
4356,"[{'cast_id': 3, 'character': 'Camille Claudel'...","[{'credit_id': '52fe4ad6c3a36847f81e461b', 'de...",110428
30013,"[{'cast_id': 4, 'character': 'Mihoko Nakagawa'...","[{'credit_id': '56365ed9925141285701b06e', 'de...",132641
838,"[{'cast_id': 4, 'character': 'Mihoko Nakagawa'...","[{'credit_id': '52fe4b9ac3a368484e190d25', 'de...",132641


In [5]:
# son pocas peliculas, asi que para simplificar, eliminamos todos estos registros

credits.drop_duplicates(inplace=True,subset=['id'],keep=False)
credits.shape

(45425, 3)

In [6]:
type(credits['cast'][0])
# los valores en cast inicialmente son strings

str

In [7]:
type(credits['crew'][0])
# los valores en crew inicialmente son strings

str

* Para el modelo, de credits usaremos el cast (los 5 actores mas protagonicos) y el director.
* Por las limitaciones de las herramientas, se sabe que se hara una reduccion significativa del dataset.
* Por ende, si cualquiera de estos campos (cast or crew) es nulo o vacio, se eliminara del dataset.

In [8]:
# Vemos si hay listas vacias en cast. Aprovechamos que es un string para buscar de esta forma
credits[credits['cast']== '[]']

Unnamed: 0,cast,crew,id
137,[],"[{'credit_id': '52fe4ab0c3a368484e161d3d', 'de...",124639
240,[],"[{'credit_id': '52fe464ac3a36847f80f6d61', 'de...",43475
393,[],"[{'credit_id': '52fe4624c3a36847f80ef0a5', 'de...",42981
438,[],"[{'credit_id': '52fe448dc3a368484e029383', 'de...",24257
595,[],"[{'credit_id': '52fe4aacc3a368484e16115b', 'de...",124472
...,...,...,...
45447,[],"[{'credit_id': '5981a15c92514151e0011b51', 'de...",455661
45452,[],"[{'credit_id': '52fe4684c3a36847f81034f3', 'de...",44330
45458,[],"[{'credit_id': '52fe4a74c3a368484e1542e9', 'de...",122036
45462,[],"[{'credit_id': '539ef1090e0a263dd00000d7', 'de...",276895


In [9]:
# nos quedamos solo con aquellos que no son listas vacias
credits=credits[credits['cast']!= '[]']
credits.shape

(43011, 3)

In [10]:
# Hacemos lo mismo con crew
credits[credits['crew']== '[]']

Unnamed: 0,cast,crew,id
189,"[{'cast_id': 4, 'character': 'Himself', 'credi...",[],56088
614,"[{'cast_id': 1, 'character': 'Grace Rhodes', '...",[],123505
635,"[{'cast_id': 0, 'character': 'Joachim Krippo',...",[],339428
863,"[{'cast_id': 1, 'character': 'Ana Alonzo', 'cr...",[],253632
1107,"[{'cast_id': 1, 'character': 'Narrator (voice)...",[],79306
...,...,...,...
45086,"[{'cast_id': 0, 'character': ""Richard 'RJ' Hug...",[],298540
45131,"[{'cast_id': 1, 'character': 'Anne Kennedy', '...",[],21893
45261,"[{'cast_id': 1, 'character': 'Herr Karl', 'cre...",[],28469
45277,"[{'cast_id': 0, 'character': 'Herself', 'credi...",[],458618


In [11]:
credits=credits[credits['crew']!= '[]']
credits.shape

(42661, 3)

## Primera fase de limpieza de credits lista
## Ahora vamos con movies

* De movies nos quedaremos con la columna id, para usarla como union con credits, y aquellas columnas con informacion para el modelo
* las columnas que necesitamos son aquellas que puedan usarse para describir a las peliculas, por ellos las que conservaremos son: belongs_to_collection, genres, overview, production companies, tagline y title

In [12]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

# Reduccion hecha para facilitar el modelo
* El modelo seleccionado (similitud del coseno) evalua vectores creados a partir de la similitud de palabras, y algunos campos pueden tener palabras en otro idioma afectando negativemente el rendimiento del modelo, por ende nos quedaremos solo con las peliculas en idioma ingles


In [13]:
# tomamos el primer valor y vemos el tipo de dato
type(movies['original_language'][0])

str

In [14]:
movies['original_language'][:5]
# para el idioma ingles, el codigo es "en", el cual sera el que conservaremos

0    en
1    en
2    en
3    en
4    en
Name: original_language, dtype: object

In [15]:
movies = movies[movies['original_language']== 'en']

* Ahora procedemos a seleccionar las columnas de interes para el modelo

In [16]:
movies = movies[['belongs_to_collection', 'genres', 'id', 'overview','production_companies', 'tagline', 'title', ]]
movies.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title
0,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",,Toy Story
1,,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",Roll the dice and unleash the excitement!,Jumanji
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men


In [17]:
movies.shape

(32269, 7)

In [18]:
# buscando registros COMPLETAMENTE repetidos
movies[movies.duplicated()==True].shape

(18, 7)

In [19]:
# eliminando filas COMPLETAMENTE DUPLICADAS
movies.drop_duplicates(inplace=True)
# Revisando la eliminacion
movies.shape


(32251, 7)

In [20]:
# Revisando duplicacion de id donde algun otro atributo es diferente
print(movies[movies.duplicated(subset=['id'], keep=False)==True].shape)

(0, 7)


In [21]:
movies[movies['id'].isnull()].shape
# no hay valores nulos en id

(0, 7)

In [22]:
# buscando valores nulos en title
movies[movies['title'].isnull()].shape

(2, 7)

In [23]:
# como necesitamos input y output de titulos de peliculas, un valor nulo no aporta al modelo.
# se eliminaran los valores nulos en title para simplificar
movies.dropna(subset='title', inplace=True)
movies.shape

(32249, 7)

## Combinaremos movies y credits, donde solo conservaremos los id que coincidan

In [24]:
print(movies.shape)
print(credits.shape)

(32249, 7)
(42661, 3)


In [25]:
credits['id'].dtype
# comprobando que id en credits es un integer para posterior merge con movies

dtype('int64')

In [26]:
movies['id'].dtype

dtype('O')

In [27]:
# cambiamos el tipo de dato de "id" en movies a integer para poder combinar
movies['id']=movies['id'].astype(int)
movies['id'].dtype

dtype('int64')

In [28]:
# ahora si podemos hacer el merge
modelo_mc_merge = pd.merge(movies,credits,how='inner',on='id')
modelo_mc_merge.shape

(30025, 9)

In [29]:
modelo_mc_merge.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew
0,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",Roll the dice and unleash the excitement!,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


## Ahora a evaluar la tabla unida

In [30]:
modelo_mc_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30025 entries, 0 to 30024
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   belongs_to_collection  3064 non-null   object
 1   genres                 30025 non-null  object
 2   id                     30025 non-null  int64 
 3   overview               29967 non-null  object
 4   production_companies   30025 non-null  object
 5   tagline                17481 non-null  object
 6   title                  30025 non-null  object
 7   cast                   30025 non-null  object
 8   crew                   30025 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


## Sabemos que necesitamos un dataset reducido para trabajar en Render y "belongs_to_collection" se ve como una opcion de filtro para trabajar:
- Contiene una aceptable cantidad de datos (3064)
- Un atributo que ayuda bastante a la hora de asociar peliculas similares, lo cual resulta muy util para que el modelo de recomendacion sea acertado.

In [31]:
# creamos un dataset donde solo tomamos los registros donde "belongs_to_collection" no sean nulos
modelo_lite = modelo_mc_merge[modelo_mc_merge['belongs_to_collection'].notnull()].copy()

modelo_lite.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew
0,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
4,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,"[{'name': 'Sandollar Productions', 'id': 5842}...",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [32]:
# se deben resetear los index, ya que mas adelante sera necesario para la extraccion correcta de los valores de recomendacion
modelo_lite = modelo_lite.reset_index(drop=True)
modelo_lite.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew
0,"{'id': 10194, 'name': 'Toy Story Collection', ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,"{'id': 119050, 'name': 'Grumpy Old Men Collect...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
2,"{'id': 96871, 'name': 'Father of the Bride Col...","[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,"[{'name': 'Sandollar Productions', 'id': 5842}...",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


## Trasformaciones para generar el modelo

In [33]:
modelo_lite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3064 entries, 0 to 3063
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   belongs_to_collection  3064 non-null   object
 1   genres                 3064 non-null   object
 2   id                     3064 non-null   int64 
 3   overview               3064 non-null   object
 4   production_companies   3064 non-null   object
 5   tagline                2217 non-null   object
 6   title                  3064 non-null   object
 7   cast                   3064 non-null   object
 8   crew                   3064 non-null   object
dtypes: int64(1), object(8)
memory usage: 215.6+ KB


## Lo que se hará ahora es trabajar la info de cada columna y conservar solo lo que necesitamos

In [34]:
print(type(modelo_lite['belongs_to_collection'][1]))
modelo_lite['belongs_to_collection'][1]

# esta columna es un string
 

<class 'str'>


"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}"

* De aca necesitamos solo el nombre, por lo que se aplicara el siguiente codigo

In [35]:
# con la funcion eval() podemos convertir el diccionario que esta como texto en diccionario python
# luego extraemos el valor de name que es lo que usaremos para el modelo
def coleccion(diccionario):
    diccionario = eval(diccionario)
    return diccionario['name'] 

modelo_lite['belongs_to_collection'] = modelo_lite['belongs_to_collection'].apply(coleccion)
modelo_lite['belongs_to_collection'][:5]

0              Toy Story Collection
1         Grumpy Old Men Collection
2    Father of the Bride Collection
3             James Bond Collection
4                  Balto Collection
Name: belongs_to_collection, dtype: object

In [36]:
print(type(modelo_lite['genres'][0]))
print(modelo_lite['genres'][0])
print(type(modelo_lite['production_companies'][0]))
print(modelo_lite['production_companies'][0])


<class 'str'>
[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
<class 'str'>
[{'name': 'Pixar Animation Studios', 'id': 3}]


* De aca igualmente tomaremos solo los "name" y los escribiremos en una lista

In [37]:
# en este caso eval() genera una lista python, sobre la cual debemos iterar por cada elemento (diccionario) para
# extraer los nombres necesarios 
def extract_name(lista):
    return [d['name'] for d in eval(lista)]  # codigo escrito como list_comprehension para simplicidad

modelo_lite['genres'] = modelo_lite['genres'].apply(extract_name)
modelo_lite['production_companies'] = modelo_lite['production_companies'].apply(extract_name)

print(modelo_lite['genres'][:5])
print(modelo_lite['production_companies'][:5])

0       [Animation, Comedy, Family]
1                 [Romance, Comedy]
2                          [Comedy]
3     [Adventure, Action, Thriller]
4    [Family, Animation, Adventure]
Name: genres, dtype: object
0                            [Pixar Animation Studios]
1                       [Warner Bros., Lancaster Gate]
2         [Sandollar Productions, Touchstone Pictures]
3                    [United Artists, Eon Productions]
4    [Universal Pictures, Amblin Entertainment, Amb...
Name: production_companies, dtype: object


## Para el caso de cast, la lista esta ordernada segun el protagonismo. Tomaremos los primeros 5 valores que son los actores mas protagonicos de cada pelicula

In [38]:
modelo_lite['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [39]:
# similar a extract_name()
def casting(lista):
    return [d['name'] for d in eval(lista)[:5]] # eval(lista) es una lista, y el [:5] hace un slicing de los primeros 5

modelo_lite['cast'] = modelo_lite['cast'].apply(casting)

In [40]:
# probando que tenemos solo 5
modelo_lite['cast'][0]

['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim Varney', 'Wallace Shawn']

* Para crew, usaremos solo los directores

In [41]:
modelo_lite['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [42]:
def director(lista):
    return [d['name'] for d in eval(lista) if d['job']=='Director'] 
    # Por cada diccionario en la lista, si job = Director, entonces regresa el nombre

modelo_lite['crew'] = modelo_lite['crew'].apply(director)
modelo_lite['crew'][:5]

0      [John Lasseter]
1      [Howard Deutch]
2      [Charles Shyer]
3    [Martin Campbell]
4        [Simon Wells]
Name: crew, dtype: object

In [43]:
modelo_lite.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew
0,Toy Story Collection,"[Animation, Comedy, Family]",862,"Led by Woody, Andy's toys live happily in his ...",[Pixar Animation Studios],,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter]
1,Grumpy Old Men Collection,"[Romance, Comedy]",15602,A family wedding reignites the ancient feud be...,"[Warner Bros., Lancaster Gate]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch]
2,Father of the Bride Collection,[Comedy],11862,Just when George Banks has recovered from his ...,"[Sandollar Productions, Touchstone Pictures]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer]


# Lista la limpieza, ahora se comienza las transformaciones para generar el modelo

* Para las columnas belongs_to_collection, overview y tagline, haremos una lista con cada palabra como item 

In [44]:
modelo_lite[modelo_lite['tagline'].isna()].shape
# aqui se ve que los valores nulos en tagline estan guardados como float

(847, 9)

In [45]:
def spliter(texto):
    if type(texto) != str:
        texto = ''  # los nulos son typo nan (nulo float), esto genera un error, que se resolvio
                    # sustituyendo el nulo float por un string vacio
    return texto.split()

modelo_lite['belongs_to_collection'] = modelo_lite['belongs_to_collection'].apply(spliter)
modelo_lite['overview'] = modelo_lite['overview'].apply(spliter)
modelo_lite['tagline'] = modelo_lite['tagline'].apply(spliter)
modelo_lite['model_title'] = modelo_lite['title'].apply(spliter) # se creo una columna aparte del titulo para el modelo

modelo_lite.head(3)

Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew,model_title
0,"[Toy, Story, Collection]","[Animation, Comedy, Family]",862,"[Led, by, Woody,, Andy's, toys, live, happily,...",[Pixar Animation Studios],[],Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Toy, Story]"
1,"[Grumpy, Old, Men, Collection]","[Romance, Comedy]",15602,"[A, family, wedding, reignites, the, ancient, ...","[Warner Bros., Lancaster Gate]","[Still, Yelling., Still, Fighting., Still, Rea...",Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],"[Grumpier, Old, Men]"
2,"[Father, of, the, Bride, Collection]",[Comedy],11862,"[Just, when, George, Banks, has, recovered, fr...","[Sandollar Productions, Touchstone Pictures]","[Just, When, His, World, Is, Back, To, Normal....",Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer],"[Father, of, the, Bride, Part, II]"


### Ahora, para aquellos casos donde hayan espacios (como nombres propios), se eliminaran los espacios para hacer palabras compuestas.
* No se hizo con los titulos para mejorar las recomendaciones del modelo

In [46]:
def espacios_lista(lista):
    return [word.replace(" ","") for word in lista]

modelo_lite['genres'] = modelo_lite['genres'].apply(espacios_lista)
modelo_lite['production_companies'] = modelo_lite['production_companies'].apply(espacios_lista)
modelo_lite['cast'] = modelo_lite['cast'].apply(espacios_lista)
modelo_lite['crew'] = modelo_lite['crew'].apply(espacios_lista)

modelo_lite.head(3)


Unnamed: 0,belongs_to_collection,genres,id,overview,production_companies,tagline,title,cast,crew,model_title
0,"[Toy, Story, Collection]","[Animation, Comedy, Family]",862,"[Led, by, Woody,, Andy's, toys, live, happily,...",[PixarAnimationStudios],[],Toy Story,"[TomHanks, TimAllen, DonRickles, JimVarney, Wa...",[JohnLasseter],"[Toy, Story]"
1,"[Grumpy, Old, Men, Collection]","[Romance, Comedy]",15602,"[A, family, wedding, reignites, the, ancient, ...","[WarnerBros., LancasterGate]","[Still, Yelling., Still, Fighting., Still, Rea...",Grumpier Old Men,"[WalterMatthau, JackLemmon, Ann-Margret, Sophi...",[HowardDeutch],"[Grumpier, Old, Men]"
2,"[Father, of, the, Bride, Collection]",[Comedy],11862,"[Just, when, George, Banks, has, recovered, fr...","[SandollarProductions, TouchstonePictures]","[Just, When, His, World, Is, Back, To, Normal....",Father of the Bride Part II,"[SteveMartin, DianeKeaton, MartinShort, Kimber...",[CharlesShyer],"[Father, of, the, Bride, Part, II]"


* Creamos nueva columna con todas las palabras en las listas llamada tags

In [47]:
# todos las listas de palabras en una sola gran lista llamada tags
modelo_lite['tags'] = modelo_lite['belongs_to_collection'] + modelo_lite['genres'] + modelo_lite['overview'] + modelo_lite['production_companies'] + modelo_lite['tagline'] + modelo_lite['cast'] + modelo_lite['crew'] + modelo_lite['model_title']

modelo_lite.columns

Index(['belongs_to_collection', 'genres', 'id', 'overview',
       'production_companies', 'tagline', 'title', 'cast', 'crew',
       'model_title', 'tags'],
      dtype='object')

* Creamos un nuevo data set con title y tags

In [48]:
dataset_modelo = modelo_lite[['title','tags']].copy()
dataset_modelo.head(3)

Unnamed: 0,title,tags
0,Toy Story,"[Toy, Story, Collection, Animation, Comedy, Fa..."
1,Grumpier Old Men,"[Grumpy, Old, Men, Collection, Romance, Comedy..."
2,Father of the Bride Part II,"[Father, of, the, Bride, Collection, Comedy, J..."


* Se convirtió la lista en tags como un solo texto con todas las palabras

In [49]:
def str_convert(lista):
    return ' '.join(lista)

dataset_modelo['tags'] = dataset_modelo['tags'].apply(str_convert)

dataset_modelo.head(3)

Unnamed: 0,title,tags
0,Toy Story,Toy Story Collection Animation Comedy Family L...
1,Grumpier Old Men,Grumpy Old Men Collection Romance Comedy A fam...
2,Father of the Bride Part II,Father of the Bride Collection Comedy Just whe...


# Modelo de recomendacion

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1250, stop_words='english')

# max-features para limitar la cantidad de palabras, y stop-words para eliminar palabras irrelevantes
# el por que del valor 1250 se explica mas adelante

In [51]:
# la transformacion hace que tengamos tantas columnas como palabras diferentes y ver cuales aparecen
# aqui la cantidad de palabras diferentes (columnas) son 1250 pq fue el limite puesto al vectorizador

vector = cv.fit_transform(dataset_modelo['tags']).toarray()
vector.shape

(3064, 1250)

### Render nos da un maximo de 512 Mb de memoria, por lo que se hizo ensayo y error al correr el cosine_similarity cambiando el max_features en el vectorizador.
* El maximo valor de max_features que no generaba uso de memoria mayor a 512 Mb fue 1250

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
import tracemalloc

tracemalloc.start()
similitud = cosine_similarity(vector)

tracemalloc.get_traced_memory()
print(f'{round((tracemalloc.get_traced_memory()[1]-tracemalloc.get_traced_memory()[0])/125000, ndigits=2)} Mb')
tracemalloc.stop()

490.25 Mb


In [53]:
similitud.shape

(3064, 3064)

In [54]:
def recomendacion(titulo):

    if titulo not in dataset_modelo['title'].tolist():
        print('Hay algun error en el nombre introducido (considere mayúsculas) o la película no se encuentra en la base de datos')
    
    # se busca el indice correspondiente a la pelicula que se ingreso
    indice = dataset_modelo[dataset_modelo['title']== titulo].index[0]

    # con el indice se extrae la fila de la pelicula y toda su semejanza con el resto de peliculas, ordenando de mayor a menor
    distancia = sorted(list(enumerate(similitud[indice])), reverse=True, key=lambda x: x[1])

    # se extrae del 2do al 6 valor (el primero es la pelicula ingresada)
    for i in distancia[1:6]:
        print(dataset_modelo.iloc[i[0]].title)

recomendacion('Toy Story')

Toy Story 2
Toy Story 3
Thumbtanic
The Rutles: All You Need Is Cash
A Cinderella Story: Once Upon a Song


### Algunos ejemplos de recomendaciones

In [55]:
recomendacion('The Land Before Time')

The Land Before Time X: The Great Longneck Migration
The Land Before Time IV: Journey Through the Mists
The Land Before Time VIII: The Big Freeze
The Land Before Time II: The Great Valley Adventure
The Land Before Time III: The Time of the Great Giving


In [56]:
recomendacion('Stuart Little 3: Call of the Wild')

Stuart Little
Stuart Little 2
The Brave Little Toaster to the Rescue
The Fox and the Hound
The Brave Little Toaster Goes to Mars


In [57]:
recomendacion('Batman vs. Robin')

Batman Unlimited: Monster Mayhem
Batman: Bad Blood
Son of Batman
Batman Beyond: Return of the Joker
Batman Beyond: The Movie


In [58]:
recomendacion('Pirates of the Caribbean: The Curse of the Black Pearl')

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: On Stranger Tides
Pirates of the Caribbean: Dead Men Tell No Tales
Pirates of the Caribbean: At World's End
Transporter 3


In [59]:
recomendacion('Star Trek Into Darkness')

Star Trek Beyond
Star Trek: Generations
Star Trek: First Contact
Star Trek III: The Search for Spock
Star Trek: Insurrection


In [60]:
recomendacion('Fast Five')

Fast & Furious 6
Fast & Furious
2 Fast 2 Furious
The Fast and the Furious
The Fate of the Furious


In [61]:
recomendacion('Superman III')

Superman
Superman II
Superman IV: The Quest for Peace
Superman Returns
The Batman Superman Movie: World's Finest


In [62]:
dataset_modelo.sample(5)

Unnamed: 0,title,tags
353,Rocky III,"Rocky Collection Drama Now the world champion,..."
1822,Hoodwinked Too! Hood VS. Evil,Hoodwinked! Collection Comedy Animation Family...
1797,The Little Mermaid: Ariel's Beginning,The Little Mermaid Collection Family Animation...
2949,The Escape,The Hire Thriller Action After the controversi...
1809,Resident Evil: Retribution,Resident Evil Collection Action Horror Science...


In [63]:
# codigo para guardar el dataset final para el modelo
# dataset_modelo.to_csv('modelo_database.csv', index=False)