ML

In [2]:
import pandas as pd
from ydata_profiling import ProfileReport
from surprise import Dataset , Reader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(r"ranks.csv")

In [3]:
# eliminamos la columna timestamp, ya que no es relevante en el modelo de recomendación
df = df.drop("timestamp" , axis=1)

In [4]:
# el modelo de ML funciona con strings, en todo caso empleo esta conversion de letras a números tratando de optimizar
import re
def convertir_letras_a_numeros(cadena):
    
    letras = re.findall('[a-zA-Z]', cadena)
    
    # convertir las letras en números utilizando la posición en el alfabeto
    numeros = [str(ord(letra.lower()) - 96) for letra in letras]
    
    # reemplazar las letras por los números en la cadena original
    for letra, numero in zip(letras, numeros):
        cadena = cadena.replace(letra, numero)
    
    return cadena

In [5]:
df['movieId'] = df['movieId'].apply(convertir_letras_a_numeros)

In [6]:
df["movieId"] = df["movieId"].astype(int)

In [7]:
# organizar el df
df = df.reindex(["userId" , "movieId", "rating"] , axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11024289 entries, 0 to 11024288
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   rating   float64
 2   movieId  int32  
dtypes: float64(1), int32(1), int64(1)
memory usage: 210.3 MB


In [8]:
# reader indicando el valor minimo y maximo de la puntuación
reader = Reader(rating_scale=(df["rating"].min(),df["rating"].max()))

In [8]:
# Dataset es el formato para que surprise lea el df. Pasamos el df con el reader
dataset = Dataset.load_from_df(df[["userId","movieId","rating"]],reader)

In [13]:
dataset

<surprise.dataset.DatasetAutoFolds at 0x20baa046560>

In [9]:
from surprise import SVD

In [None]:
# los mejores parametros son: n_epochs = 15 , lr_all = 0.005 , reg_all = 0.4
# esto de hacer multiples pruebas
from surprise.model_selection import GridSearchCV
param_grid = {"n_epochs":[10 , 15], "lr_all":[0.002 , 0.005], "reg_all": [0.4 , 0.6]}
gs = GridSearchCV(SVD , param_grid , measures=["rmse" , "fcp"] , cv=5 , refit=True)
gs.fit(dataset)

print(gs.best_score["rmse"])
print(gs.best_score["fcp"])
print(gs.best_params["rmse"])

# En todo este codigo lo que buscamos es hacer un cross validation para encontrar los mejores parametros

In [None]:
# Despues de correr este bloque, el modelo quedará listo
modelo = gs.best_estimator["rmse"]
modelo.fit(dataset.build_full_trainset())

De aqui para abajo son pruebas con datasets mas pequeños, subconjuntos del original

In [86]:
dfsub = df.loc[0:100000 ,"userId" : "rating"]

In [88]:
datasetsub = Dataset.load_from_df(dfsub , reader)

In [89]:
dfsub.shape

(100001, 3)

In [96]:
from surprise.model_selection import cross_validate

cross_validate(SVD() , datasetsub , measures=["RMSE" , "MAE"],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9659  0.9608  0.9661  0.9574  0.9578  0.9616  0.0038  
MAE (testset)     0.7535  0.7453  0.7509  0.7474  0.7474  0.7489  0.0029  
Fit time          1.14    1.12    1.14    1.12    1.13    1.13    0.01    
Test time         0.10    0.10    0.36    0.10    0.10    0.15    0.11    


{'test_rmse': array([0.96591763, 0.96076454, 0.96606243, 0.95740206, 0.95784531]),
 'test_mae': array([0.75348939, 0.74527194, 0.75094945, 0.74737894, 0.74735664]),
 'fit_time': (1.1437618732452393,
  1.1212449073791504,
  1.1358680725097656,
  1.116645097732544,
  1.1317555904388428),
 'test_time': (0.09899544715881348,
  0.0975487232208252,
  0.362884521484375,
  0.09833192825317383,
  0.09702038764953613)}

In [None]:
dfsub

Unnamed: 0,userId,movieId,rating
0,1,119680,1.0
1,1,14192186,4.5
2,1,8192381,5.0
3,1,14193663,5.0
4,1,1199500,5.0
...,...,...,...
9996,120,14198204,4.0
9997,120,8191897,4.5
9998,120,1419122,3.5
9999,120,14193221,3.0


In [103]:
from surprise.model_selection import GridSearchCV
param_grid = {"n_epochs":[15], "lr_all":[0.005], "reg_all": [0.4 , 0.6]}
gs = GridSearchCV(SVD , param_grid , measures=["rmse" , "fcp"] , cv=5 , refit=True)
gs.fit(datasetsub)

print(gs.best_score["rmse"])
print(gs.best_score["fcp"])
print(gs.best_params["rmse"])

0.9518687703914489
0.5014030188724812
{'n_epochs': 15, 'lr_all': 0.005, 'reg_all': 0.4}


In [104]:
model = gs.best_estimator["rmse"]
model.fit(datasetsub.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a7a9bfed70>

In [106]:
u = 120
mid=1419828		
modelo.predict(u , mid ,r_ui=3.5 ,  verbose=True)

user: 120        item: 1419828    r_ui = 3.50   est = 3.54   {'was_impossible': False}


Prediction(uid=120, iid=1419828, r_ui=3.5, est=3.540752950051133, details={'was_impossible': False})