## Recomendador: Filtrado Colaborativo Ejercicio

Utilizado los datasets **`movies.csv`** y **`ratings`**, realiza un algoritmo de recomendación de filtrado colaborativo para la información dada por **`usuario_activo`**.

Has preprocesamiento para el nombre de las peliculas y utiliza el dataset de **`ratings.csv`** para encontrar los usuarios más correlacionados con **`usuario_activo`**.

In [1]:
import numpy as np
import pandas as pd

# Correlación de Pearson
from scipy.stats  import pearsonr

In [2]:
df_movies = pd.read_csv("movies.csv")

df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
df_ratings = pd.read_csv("ratings.csv")

df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [4]:
usuario_activo = [{"title" : "Breakfast Club, The", "rating" : 5  },
                  {"title" : "Toy Story"          , "rating" : 3.5},
                  {"title" : "Jumanji"            , "rating" : 2  },
                  {"title" : "Pulp Fiction"       , "rating" : 5  },
                  {"title" : "Akira"              , "rating" : 4.5}]    

usuario_activo

[{'title': 'Breakfast Club, The', 'rating': 5},
 {'title': 'Toy Story', 'rating': 3.5},
 {'title': 'Jumanji', 'rating': 2},
 {'title': 'Pulp Fiction', 'rating': 5},
 {'title': 'Akira', 'rating': 4.5}]

In [5]:
usuario = pd.DataFrame(usuario_activo)

In [6]:
usuario

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [7]:
df_movies.drop("genres", axis = 1, inplace = True)
df_ratings.drop("timestamp", axis = 1, inplace = True)

In [8]:
df_movies["title"] = df_movies["title"].apply(lambda x: x[:-7])

In [9]:
indice_vista_usuario = df_movies[df_movies["title"].isin(usuario["title"])]["movieId"]
indice_vista_usuario

0          1
1          2
293      296
1246    1274
1885    1968
Name: movieId, dtype: int64

In [10]:
df_grupos_vistas = df_ratings[df_ratings["movieId"].isin(indice_vista_usuario)]
df_grupos_vistas

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0
...,...,...,...
22883679,247738,296,4.0
22884132,247751,1,4.0
22884142,247751,296,4.0
22884164,247751,1274,5.0


In [11]:
df_grupos_vistas_groupby = df_grupos_vistas.groupby("userId").count()

indices_usuarios_vistos = df_grupos_vistas_groupby[df_grupos_vistas_groupby["movieId"] == 5].index

indices_usuarios_vistos

Int64Index([    75,    106,    686,    815,   1040,   1130,   1502,   1599,
              1625,   1950,
            ...
            245719, 246207, 246393, 246481, 246487, 246506, 246533, 246959,
            246999, 247307],
           dtype='int64', name='userId', length=1417)

In [12]:
df_grupos_vistas = df_grupos_vistas[df_grupos_vistas["userId"].isin(indices_usuarios_vistos)]
df_grupos_vistas

Unnamed: 0,userId,movieId,rating
7507,75,1,5.0
7508,75,2,3.5
7540,75,296,5.0
7633,75,1274,4.5
7673,75,1968,5.0
...,...,...,...
22842095,247307,1,4.0
22842096,247307,2,4.5
22842125,247307,296,2.0
22842207,247307,1274,3.0


In [13]:
usuario["rating"]

0    5.0
1    3.5
2    2.0
3    5.0
4    4.5
Name: rating, dtype: float64

In [14]:
df_grupos_vistas[df_grupos_vistas["userId"] == 75]["rating"]

7507    5.0
7508    3.5
7540    5.0
7633    4.5
7673    5.0
Name: rating, dtype: float64

In [15]:
pearsonr(usuario["rating"], df_grupos_vistas[df_grupos_vistas["userId"] == 75]["rating"])[0]

0.07520710469952335

In [16]:
lista_valores_corr = list()

for usu in indices_usuarios_vistos:
    
    correlacion = pearsonr(usuario["rating"], df_grupos_vistas[df_grupos_vistas["userId"] == usu]["rating"])[0]
    lista_valores_corr.append(correlacion)
    
lista_valores_corr



[0.07520710469952335,
 -0.5860090386731193,
 0.2773500981126146,
 -0.3668996928526715,
 0.052414241836095915,
 -0.3180732125814322,
 -0.36544084137792887,
 -0.8770580193070293,
 -0.6266005147845038,
 -0.3580574370197165,
 -0.8770580193070293,
 -0.11720180773462387,
 -0.8320502943378437,
 0.2192645048267573,
 -0.11720180773462385,
 -0.35160542320387156,
 -0.827278151694757,
 -0.17902871850985827,
 -0.6784622064861938,
 -0.7017294652672371,
 0.716114874039433,
 0.9776923610938038,
 0.7032108464077431,
 0.7844645405527362,
 -0.21926450482675733,
 -0.05967623950328606,
 -0.5871513575713986,
 0.15041420939904673,
 -0.08006407690254358,
 -0.4478803601143137,
 -0.35630482034348065,
 0.0,
 -0.4300130725961134,
 -0.2364389962054795,
 -0.038461538461538464,
 -0.3837128834468253,
 0.1172018077346238,
 -0.8204126541423671,
 -0.6316511701147547,
 -0.35160542320387156,
 0.26854307776478736,
 0.35805743701971643,
 -0.5264497328966635,
 0.41602514716892186,
 -0.716114874039433,
 -0.5160156871153361,
 

In [17]:
dicc_valores_corr = dict()

for usu in indices_usuarios_vistos:
    
    correlacion = pearsonr(usuario["rating"], df_grupos_vistas[df_grupos_vistas["userId"] == usu]["rating"])[0]
    dicc_valores_corr[usu] = correlacion
    
dicc_valores_corr

{75: 0.07520710469952335,
 106: -0.5860090386731193,
 686: 0.2773500981126146,
 815: -0.3668996928526715,
 1040: 0.052414241836095915,
 1130: -0.3180732125814322,
 1502: -0.36544084137792887,
 1599: -0.8770580193070293,
 1625: -0.6266005147845038,
 1950: -0.3580574370197165,
 2065: -0.8770580193070293,
 2128: -0.11720180773462387,
 2432: -0.8320502943378437,
 2791: 0.2192645048267573,
 2839: -0.11720180773462385,
 2948: -0.35160542320387156,
 3025: -0.827278151694757,
 3040: -0.17902871850985827,
 3186: -0.6784622064861938,
 3271: -0.7017294652672371,
 3429: 0.716114874039433,
 3734: 0.9776923610938038,
 4099: 0.7032108464077431,
 4208: 0.7844645405527362,
 4282: -0.21926450482675733,
 4292: -0.05967623950328606,
 4415: -0.5871513575713986,
 4586: 0.15041420939904673,
 4725: -0.08006407690254358,
 4818: -0.4478803601143137,
 5104: -0.35630482034348065,
 5165: 0.0,
 5547: -0.4300130725961134,
 6082: -0.2364389962054795,
 6207: -0.038461538461538464,
 6366: -0.3837128834468253,
 6482: 0.

In [18]:
# Mejores 50

n = 100

df_aux = pd.DataFrame(dicc_valores_corr.items(), columns = ["userId", "Corr"]).sort_values("Corr", ascending = False).iloc[:n, :]

dicc_valores_corr = {indice : corr for indice, corr in zip(df_aux["userId"], df_aux["Corr"])}

dicc_valores_corr

{93491: 0.986690271720408,
 214568: 0.986690271720408,
 62531: 0.9777204440006757,
 3734: 0.9776923610938038,
 51672: 0.9766817311218655,
 193582: 0.9376144618769908,
 61725: 0.9176462238110024,
 72256: 0.9024852563942805,
 21362: 0.9024852563942803,
 144825: 0.8984681855808222,
 245274: 0.8976095575314934,
 194974: 0.8951435925492912,
 172253: 0.8951435925492912,
 73416: 0.8951435925492912,
 169107: 0.8951435925492912,
 131052: 0.8951435925492912,
 142310: 0.8951435925492911,
 56705: 0.8875203139603667,
 221409: 0.8825226081218283,
 37148: 0.8770580193070292,
 210892: 0.87131213982461,
 62494: 0.8666253521790723,
 216906: 0.8648817040445187,
 137143: 0.8600261451922268,
 221499: 0.8596888138808808,
 244212: 0.8594395636904107,
 120791: 0.8320502943378437,
 169553: 0.8320502943378437,
 210835: 0.827278151694757,
 48993: 0.8204126541423671,
 160457: 0.8204126541423671,
 169491: 0.820412654142367,
 173864: 0.820412654142367,
 138136: 0.8096878445391197,
 166521: 0.8092803518889831,
 4208

In [19]:
df_pelis_no_vistas = df_ratings[df_ratings["userId"].isin(dicc_valores_corr.keys()) & ~df_ratings["movieId"].isin(indice_vista_usuario)]
df_pelis_no_vistas

Unnamed: 0,userId,movieId,rating
316847,3429,6,5.0
316848,3429,9,3.0
316849,3429,10,3.0
316850,3429,15,3.0
316851,3429,20,3.0
...,...,...,...
22677977,245568,81847,4.5
22677978,245568,82169,4.0
22677979,245568,82459,4.0
22677980,245568,82461,4.5


In [20]:
df_pelis_no_vistas["userId"] = df_pelis_no_vistas["userId"].replace(dicc_valores_corr).copy()
df_pelis_no_vistas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pelis_no_vistas["userId"] = df_pelis_no_vistas["userId"].replace(dicc_valores_corr).copy()


Unnamed: 0,userId,movieId,rating
316847,0.716115,6,5.0
316848,0.716115,9,3.0
316849,0.716115,10,3.0
316850,0.716115,15,3.0
316851,0.716115,20,3.0
...,...,...,...
22677977,0.661107,81847,4.5
22677978,0.661107,82169,4.0
22677979,0.661107,82459,4.0
22677980,0.661107,82461,4.5


In [21]:
df_pelis_no_vistas["Pesos"] = df_pelis_no_vistas["userId"] * df_pelis_no_vistas["rating"]
df_pelis_no_vistas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pelis_no_vistas["Pesos"] = df_pelis_no_vistas["userId"] * df_pelis_no_vistas["rating"]


Unnamed: 0,userId,movieId,rating,Pesos
316847,0.716115,6,5.0,3.580574
316848,0.716115,9,3.0,2.148345
316849,0.716115,10,3.0,2.148345
316850,0.716115,15,3.0,2.148345
316851,0.716115,20,3.0,2.148345
...,...,...,...,...
22677977,0.661107,81847,4.5,2.974983
22677978,0.661107,82169,4.0,2.644429
22677979,0.661107,82459,4.0,2.644429
22677980,0.661107,82461,4.5,2.974983


In [22]:
df_1 = df_pelis_no_vistas.groupby("movieId", as_index = False).agg({"Pesos" : "sum"})
df_2 = df_pelis_no_vistas.groupby("movieId", as_index = False).agg({"userId" : "sum"})

df_3 = pd.merge(df_1, df_2, on = "movieId")

df_3["Resultado"] = df_3["Pesos"]/df_3["userId"]

df_3 = pd.merge(df_3, df_movies, on = "movieId")

df_3.sort_values("Resultado", ascending = False)

Unnamed: 0,movieId,Pesos,userId,Resultado,title
9714,104069,6.636150,1.327230,5.0,Louis C.K.: Oh My God
3734,4763,4.324409,0.864882,5.0,"Iron Ladies, The (Satree lek)"
9761,105497,3.788314,0.757663,5.0,Kumail Nanjiani: Beta Male
10379,136447,2.847836,0.569567,5.0,George Carlin: You Are All Diseased
6224,26147,2.827014,0.565403,5.0,"Thousand Clowns, A"
...,...,...,...,...,...
6796,33410,0.429844,0.859689,0.5,Crackerjack
1136,1490,0.583964,1.167927,0.5,B*A*P*S
4797,6355,0.493345,0.986690,0.5,"Girls, Les"
9164,90397,0.328897,0.657794,0.5,Return of Django (Son of Django) (Figlio di Dj...


In [23]:
############################################################################################################################