# 1. PREPARACIÓN NOTEBOOK

## 1.1 Librerías

In [2]:
!pip install lightfm
!pip install --quiet optuna

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831163 sha256=590929b72b7d99a3635df4f66329d3df2cc34dafa056de8bbac42672b47915c5
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import sys
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score

import optuna

import sqlite3 as sql
import joblib

from ipywidgets import interact

import time

from google.colab import drive
import sys
import os

## 1.2 Conexión y carga de datos

In [4]:
# Montar Google Drive
drive.mount('/content/drive')

# Definir rutas
base_path = '/content/drive/My Drive/cod/A3_marketing'
data_path = os.path.join(base_path, 'data')
output_path = os.path.join(base_path, 'salidas')

# Asegurarse de que las carpetas existen
os.makedirs(data_path, exist_ok=True)
os.makedirs(output_path, exist_ok=True)

# Cargar el DataFrame escalado
df_scaled = joblib.load(os.path.join(output_path, 'df_scaled.joblib'))

# Conectarse a la base de datos SQLite
db_path = os.path.join(data_path, 'db_movies_c')  # Asegúrate de que este archivo exista y sea una base SQLite válida
con = sql.connect(db_path)

# Cargar tabla desde la base de datos
movies = pd.read_sql('SELECT * FROM tabla_final', con)

# Mostrar una vista previa
print(movies.head())

Mounted at /content/drive
   movieId                        title    year  Action  Adventure  Animation  \
0        1                    Toy Story  1995.0       0          1          1   
1        2                      Jumanji  1995.0       0          1          0   
2        3             Grumpier Old Men  1995.0       0          0          0   
3        5  Father of the Bride Part II  1995.0       0          0          0   
4        6                         Heat  1995.0       1          0          0   

   Children  Comedy  Crime  Documentary  ...  Musical  Mystery  Romance  \
0         1       1      0            0  ...        0        0        0   
1         1       0      0            0  ...        0        0        0   
2         0       1      0            0  ...        0        0        1   
3         0       1      0            0  ...        0        0        0   
4         0       0      1            0  ...        0        0        0   

   Sci-Fi  Thriller  War  Western  d

# 2. MODELOS

## 2.1 Recomendación basada en contenido

In [5]:
#Sistema KNN con base en todo lo visto por el usuario recomienda algo no visto

# Seleccionar usuario para recomendaciones
usuarios = pd.read_sql('SELECT DISTINCT userId as user_id FROM ratings', con)
user_id = 1  # Ejemplo manual

def recomendar(user_id=1):

    # Seleccionar solo los ratings del usuario seleccionado
    ratings = pd.read_sql('SELECT * FROM ratings WHERE userId = :user', con, params={'user': user_id})
    l_movies_r = ratings['movieId'].to_numpy()

    # Agregar columnas necesarias para mostrar luego
    df_scaled[['movieId', 'title']] = movies[['movieId', 'title']]

    # Filtrar películas calificadas por el usuario
    movies_r = df_scaled[df_scaled['movieId'].isin(l_movies_r)]

    # Eliminar columnas no numéricas
    movies_r = movies_r.drop(columns=['movieId', 'title'])
    movies_r["indice"] = 1
    centroide = movies_r.groupby("indice").mean()

    # Filtrar películas no vistas
    movies_nr = df_scaled[~df_scaled['movieId'].isin(l_movies_r)]
    movies_nr_simple = movies_nr.drop(columns=['movieId', 'title'])

    # Modelo KNN
    model = neighbors.NearestNeighbors(n_neighbors=11, metric='cosine')
    model.fit(movies_nr_simple)
    dist, idlist = model.kneighbors(centroide)

    ids = idlist[0]
    recomend_m = movies_nr.iloc[ids][['title', 'movieId']]
    leidos = movies[movies['movieId'].isin(l_movies_r)][['title', 'movieId']]

    return recomend_m

# Ejemplo de recomendación para el usuario con ID = 1
recomendar(1)
interact(recomendar)

interactive(children=(IntSlider(value=1, description='user_id', max=3, min=-1), Output()), _dom_classes=('widg…

## 2.2 Recomendación LightFM

In [6]:
#Recomendación con LightFM según lo que ha visto

# Cargar data
ratings = pd.read_sql('SELECT * FROM ratings', con)

# Crear dataset
dataset_train = Dataset()
dataset_test = Dataset()

all_unique_users = ratings['userId'].unique()
all_unique_items = ratings['movieId'].unique()

dataset_train.fit(users=all_unique_users, items=all_unique_items)
dataset_test.fit(users=all_unique_users, items=all_unique_items)

# Separar en train y test
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Crear interacciones
train_interactions_list = [(row['userId'], row['movieId'], row['rating']) for index, row in train_df.iterrows()]
train_interactions, train_weights = dataset_train.build_interactions(train_interactions_list)

test_interactions_list = [(row['userId'], row['movieId'], row['rating']) for index, row in test_df.iterrows()]
test_interactions, test_weights = dataset_test.build_interactions(test_interactions_list)

# Entrenamiento del modelo LightFM
model = LightFM(loss='logistic', random_state=42)
model.fit(train_interactions, epochs=20, verbose=True, sample_weight=train_weights)

# Evaluación AUC
train_auc = auc_score(model, train_interactions).mean()
test_auc = auc_score(model, test_interactions).mean()
print(f'AUC: train {train_auc:.2f}, test {test_auc:.2f}')

Epoch: 100%|██████████| 20/20 [00:02<00:00,  8.14it/s]


AUC: train 0.92, test 0.89


### 2.2.1 Ajuste de hiperparámetros con Optuna

In [7]:
def objective(trial):
    n_components = trial.suggest_int('no_components', 10, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1, log=True)
    loss = trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
    epochs = trial.suggest_int('epochs', 10, 50)

    model = LightFM(
        no_components=n_components,
        learning_rate=learning_rate,
        loss=loss,
        random_state=42
    )

    model.fit(train_interactions, epochs=epochs, verbose=False, sample_weight=train_weights)
    test_auc = auc_score(model, test_interactions).mean()
    return test_auc

# Realizar búsqueda de hiperparámetros con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print(f"Best trial AUC: {study.best_value:.4f}")
print(f"Best Params: {study.best_params}")

[I 2025-05-13 00:18:26,806] A new study created in memory with name: no-name-0d4966dd-ffca-45c4-9031-d7b8f6477d59
[I 2025-05-13 00:18:40,850] Trial 0 finished with value: 0.9118791222572327 and parameters: {'no_components': 31, 'learning_rate': 0.0729796518565974, 'loss': 'warp', 'epochs': 37}. Best is trial 0 with value: 0.9118791222572327.
[I 2025-05-13 00:18:50,647] Trial 1 finished with value: 0.890369176864624 and parameters: {'no_components': 70, 'learning_rate': 0.0014863056350879348, 'loss': 'logistic', 'epochs': 33}. Best is trial 0 with value: 0.9118791222572327.
[I 2025-05-13 00:19:01,063] Trial 2 finished with value: 0.8925673961639404 and parameters: {'no_components': 70, 'learning_rate': 0.04864734303156925, 'loss': 'logistic', 'epochs': 43}. Best is trial 0 with value: 0.9118791222572327.
[I 2025-05-13 00:19:10,577] Trial 3 finished with value: 0.8903974890708923 and parameters: {'no_components': 72, 'learning_rate': 0.00214770632342982, 'loss': 'logistic', 'epochs': 36}

Best trial AUC: 0.9119
Best Params: {'no_components': 31, 'learning_rate': 0.0729796518565974, 'loss': 'warp', 'epochs': 37}


### 2.2.2 Generar recomendaciones con modelo LightFM

In [9]:
def recommendation(model, data, original_user_id, conn, k):
    df_all = pd.read_sql(f"SELECT * FROM ratings", con)
    movie_ids_all = df_all['movieId'].unique()

    # Usuario en ID interno
    uid_index = data.mapping()[0][original_user_id]
    item_ids = [value for key, value in data.mapping()[2].items()]
    scores = model.predict(uid_index, item_ids)

    sorted_indices = np.argsort(-scores)
    top_items = [key for key, value in data.mapping()[2].items() if value in sorted_indices[:k]]

    full_movies = pd.read_sql("SELECT DISTINCT movieId, title FROM tabla_final", con)
    recommended = full_movies[full_movies['movieId'].isin(top_items)]

    return recommended

# Obtener recomendaciones para el usuario con ID = 1
recommendation(model, dataset_train, 1, con, 10)

Unnamed: 0,movieId,title
33,110,Braveheart
38,150,Apollo 13
68,260,Star Wars: Episode IV - A New Hope
79,296,Pulp Fiction
87,318,"Shawshank Redemption, The"
100,356,Forrest Gump
131,527,Schindler's List
144,589,Terminator 2: Judgment Day
147,593,"Silence of the Lambs, The"
522,2571,"Matrix, The"
