## Recomendador: Basado en Contenido Ejercicio

Utilizado el dataset **`movies.csv`**, realiza un algoritmo de recomendación basado en contenido para la información dada por **`usuario_input`**.

Has preprocesamiento para el nombre de las peliculas y utiliza la columna **`genres`** para descomponer las peliculas en sus categorias.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("movies.csv")

df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
34203,151697,Grand Slam (1967),Thriller
34204,151701,Bloodmoney (2010),(no genres listed)
34205,151703,The Butterfly Circus (2009),Drama
34206,151709,Zero (2015),Drama|Sci-Fi


In [3]:
df = df[df["genres"] != "(no genres listed)"][["title", "genres"]].reset_index(drop = True).copy()  
df

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy
...,...,...
33058,Hollywood High (1976),Comedy
33059,The Survivalist (2015),Drama|Sci-Fi|Thriller
33060,Grand Slam (1967),Thriller
33061,The Butterfly Circus (2009),Drama


In [4]:
df["title"] = df["title"].apply(lambda x: x[0:-7])

In [5]:
df.head(5)

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [6]:
usuario_input = [{"title" : "Breakfast Club, The", "rating" : 5  },
                 {"title" : "Toy Story"          , "rating" : 3.5},
                 {"title" : "Jumanji"            , "rating" : 2  },
                 {"title" : "Pulp Fiction"       , "rating" : 5  },
                 {"title" : "Akira"              , "rating" : 4.5}]    

usuario_input

[{'title': 'Breakfast Club, The', 'rating': 5},
 {'title': 'Toy Story', 'rating': 3.5},
 {'title': 'Jumanji', 'rating': 2},
 {'title': 'Pulp Fiction', 'rating': 5},
 {'title': 'Akira', 'rating': 4.5}]

In [7]:
generos = list(set(df["genres"].apply(lambda x: x.split("|")).sum()))

In [8]:
generos

['Animation',
 'Children',
 'Comedy',
 'IMAX',
 'Western',
 'Adventure',
 'Musical',
 'Fantasy',
 'War',
 'Crime',
 'Sci-Fi',
 'Thriller',
 'Romance',
 'Film-Noir',
 'Mystery',
 'Horror',
 'Documentary',
 'Drama',
 'Action']

In [24]:
df_generos = pd.DataFrame(columns = sorted(generos))
df_generos

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western


In [25]:
%%time

lista_dfs = list()

for genero, titulo in zip(df["genres"], df["title"]):
    
    lista_01 = list()
    
    genero = genero.split("|")
    
    for gen in df_generos.columns:
        if gen in genero:
            lista_01.append(1)
        else :
            lista_01.append(0)
        
    lista_dfs.append(pd.DataFrame(data = [lista_01], columns = df_generos.columns, index = [titulo]))
    
df_generos = pd.concat(lista_dfs)          

Wall time: 51.9 s


In [26]:
df_generos.head(3)

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Toy Story,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Grumpier Old Men,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [30]:
df_generos = df_generos.sort_index(axis = 0)
df_generos.head(3)

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
"""Great Performances"" Cats",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
#1 Cheerleader Camp,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [31]:
usuario_input

[{'title': 'Breakfast Club, The', 'rating': 5},
 {'title': 'Toy Story', 'rating': 3.5},
 {'title': 'Jumanji', 'rating': 2},
 {'title': 'Pulp Fiction', 'rating': 5},
 {'title': 'Akira', 'rating': 4.5}]

In [34]:
df_generos.loc["Breakfast Club, The", :]

Action         0
Adventure      0
Animation      0
Children       0
Comedy         1
Crime          0
Documentary    0
Drama          1
Fantasy        0
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: Breakfast Club, The, dtype: int64

In [41]:
peliculas_usuario = [elemento["title"] for elemento in usuario_input] 
peliculas_usuario

['Breakfast Club, The', 'Toy Story', 'Jumanji', 'Pulp Fiction', 'Akira']

In [42]:
puntuacion_usuario = [elemento["rating"] for elemento in usuario_input] 
puntuacion_usuario

[5, 3.5, 2, 5, 4.5]

In [81]:
df_generos.loc[peliculas_usuario, :]

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
"Breakfast Club, The",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Toy Story,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Pulp Fiction,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
Akira,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [45]:
puntuacion_generos = np.dot(puntuacion_usuario, df_generos.loc[peliculas_usuario, :])
puntuacion_generos

array([ 4.5, 10. ,  8. ,  5.5, 13.5,  5. ,  0. , 10. ,  5.5,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  4.5,  5. ,  0. ,  0. ])

In [46]:
puntuacion_generos = puntuacion_generos / puntuacion_generos.sum()
puntuacion_generos

array([0.06293706, 0.13986014, 0.11188811, 0.07692308, 0.18881119,
       0.06993007, 0.        , 0.13986014, 0.07692308, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.06293706, 0.06993007, 0.        , 0.        ])

In [82]:
resultados = np.dot(pd.Series(puntuacion_generos), df_generos.T)
resultados

array([0.21678322, 0.        , 0.32867133, ..., 0.18881119, 0.31468531,
       0.        ])

In [88]:
df_recomendacion = pd.DataFrame(data = [[a, b] for a, b in zip(df_generos.index, resultados)],
                                columns = ["Pelicula", "Recomendacion"]).sort_values("Recomendacion", ascending = False)

df_recomendacion

Unnamed: 0,Pelicula,Recomendacion
19192,Motorama,0.748252
32526,"Wonderful World of the Brothers Grimm, The",0.734266
23417,Revolutionary Girl Utena: Adolescence of Utena...,0.720280
32438,Wizards of Waverly Place: The Movie,0.685315
8598,Dragonheart 2: A New Beginning,0.678322
...,...,...
10869,Fright Night Part II,0.000000
10868,Fright Night 2: New Blood,0.000000
29667,Thunder Soul,0.000000
10859,Friends of God: A Road Trip with Alexandra Pelosi,0.000000


In [90]:
df_recomendacion.head(20)

Unnamed: 0,Pelicula,Recomendacion
19192,Motorama,0.748252
32526,"Wonderful World of the Brothers Grimm, The",0.734266
23417,Revolutionary Girl Utena: Adolescence of Utena...,0.72028
32438,Wizards of Waverly Place: The Movie,0.685315
8598,Dragonheart 2: A New Beginning,0.678322
14448,Interstate 60,0.678322
23861,Rubber,0.671329
27700,The 39 Steps,0.671329
29313,The Wrecking Crew,0.671329
32095,Who Framed Roger Rabbit?,0.664336


In [86]:
df_generos.loc["Motorama", :]

Action         0
Adventure      1
Animation      0
Children       0
Comedy         1
Crime          1
Documentary    0
Drama          1
Fantasy        1
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        1
Romance        0
Sci-Fi         1
Thriller       1
War            0
Western        0
Name: Motorama, dtype: int64

In [101]:
############################################################################################################################

df_prueba =  df_generos.sample(100)

peliculas_prueba = df_prueba.index.to_list()

puntuacion_prueba = [np.random.randint(1, 6) for pelicula in peliculas_prueba]

df_prueba.shape, len(peliculas_prueba), len(puntuacion_prueba)

((100, 19), 100, 100)

In [102]:
puntuacion_generos = np.dot(puntuacion_prueba, df_prueba)
puntuacion_generos

array([ 27,  29,   9,  18,  80,  34,  32, 128,  27,   9,  39,   7,  12,
        22,  32,  11,  48,  18,   6], dtype=int64)

In [103]:
puntuacion_generos = puntuacion_generos / puntuacion_generos.sum()
puntuacion_generos

array([0.04591837, 0.04931973, 0.01530612, 0.03061224, 0.13605442,
       0.05782313, 0.05442177, 0.21768707, 0.04591837, 0.01530612,
       0.06632653, 0.01190476, 0.02040816, 0.03741497, 0.05442177,
       0.01870748, 0.08163265, 0.03061224, 0.01020408])

In [104]:
resultados = np.dot(pd.Series(puntuacion_generos), df_generos.T)
resultados

array([0.0952381 , 0.02040816, 0.3537415 , ..., 0.17346939, 0.11054422,
       0.06632653])

In [105]:
df_recomendacion = pd.DataFrame(data = [[a, b] for a, b in zip(df_generos.index, resultados)],
                                columns = ["Pelicula", "Recomendacion"]).sort_values("Recomendacion", ascending = False)

df_recomendacion[~df_recomendacion["Pelicula"].isin(peliculas_prueba)]

Unnamed: 0,Pelicula,Recomendacion
23861,Rubber,0.717687
27046,Svidd Neger,0.651361
17516,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,0.646259
17515,Lupin III: First Contact (Rupan Sansei: Faasut...,0.646259
19192,Motorama,0.644558
...,...,...
16833,Little Big Man,0.010204
11626,God's Gun,0.010204
28030,The Daughter of Dawn,0.010204
1926,"Appaloosa, The",0.010204
