In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.models import Model

# --- Cargar dataset ---
df = pd.read_csv("dataset_final.csv")

# --- Mapear userId y movieId a índices ---
unique_users = df['userId'].unique()
unique_movies = df['movieId'].unique()

user_id_map = {uid: idx for idx, uid in enumerate(unique_users)}
movie_id_map = {mid: idx for idx, mid in enumerate(unique_movies)}

df['userId'] = df['userId'].map(user_id_map)
df['movieId'] = df['movieId'].map(movie_id_map)

# --- Guardar mapeos para luego predecir ---
joblib.dump(user_id_map, 'user_id_map.pkl')
joblib.dump(movie_id_map, 'movie_id_map.pkl')

# --- Separar features y target ---
feature_cols = [
    'temporal_1', 'temporal_2', 'temporal_3',
    'rating_previous', 'is_weekend', 'season_encoded', 'is_holiday',
    'year', 'month', 'weekday',
    'consumo_semanal_usuario', 'antiguedad_rating', 'diferencia_rating_anterior'
]

X_user = df['userId'].values
X_movie = df['movieId'].values
X_others = df[feature_cols].values.reshape((df.shape[0], 1, len(feature_cols)))
y = df['rating'].values

# --- Separar train/test ---
X_train_user, X_test_user, X_train_movie, X_test_movie, X_train_others, X_test_others, y_train, y_test = train_test_split(
    X_user, X_movie, X_others, y, test_size=0.2, random_state=42
)

# --- Definir modelo ---
n_users = len(user_id_map)
n_movies = len(movie_id_map)

user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')
other_input = Input(shape=(1, len(feature_cols)), name='other_features_input')

user_emb = Embedding(input_dim=n_users+1, output_dim=16, name='user_embedding')(user_input)
movie_emb = Embedding(input_dim=n_movies+1, output_dim=16, name='movie_embedding')(movie_input)

user_vec = Flatten()(user_emb)
movie_vec = Flatten()(movie_emb)

concat_emb = Concatenate()([user_vec, movie_vec])
concat_emb = tf.expand_dims(concat_emb, axis=1)  # Lo mismo que reshape para concatenar

x = Concatenate()([concat_emb, other_input])

x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='linear')(x)

model = Model(inputs=[user_input, movie_input, other_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

# --- Entrenar ---
model.fit(
    [X_train_user, X_train_movie, X_train_others],
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=([X_test_user, X_test_movie, X_test_others], y_test)
)

2025-05-14 19:22:59.956327: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-14 19:22:59.956419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 19:23:00.061873: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-14 19:23:00.280632: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-14 19:23:12.712067: I external/local_xla/xla/

Epoch 1/10


2025-05-14 19:23:15.948580: I external/local_xla/xla/service/service.cc:168] XLA service 0x770529e72780 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-05-14 19:23:15.948616: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050, Compute Capability 8.6
2025-05-14 19:23:15.972861: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-05-14 19:23:16.034721: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1747268596.147892    7203 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7706cf5887f0>

In [2]:
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# 🔹 12. Evaluación del modelo en test (loss es MSE, rmse lo calculamos manualmente)
loss = model.evaluate([X_test_user, X_test_movie, X_test_others], y_test, verbose=1)
print(f"\n✅ Loss (MSE) final de la RNN: {loss:.4f}")

# Predicciones continuas
y_pred_continuous = model.predict([X_test_user, X_test_movie, X_test_others]).flatten()

# Calcular RMSE manual
rmse_manual = np.sqrt(mean_squared_error(y_test, y_pred_continuous))
print(f"✅ RMSE (cálculo manual): {rmse_manual:.4f}")

# Redondear para clasificación aproximada
y_pred_class = np.round(y_pred_continuous)
y_true_class = np.round(y_test)

# Métricas de clasificación (solo si quieres evaluar como clasificación aproximada)
accuracy = accuracy_score(y_true_class, y_pred_class)
precision = precision_score(y_true_class, y_pred_class, average='weighted', zero_division=0)
recall = recall_score(y_true_class, y_pred_class, average='weighted')
f1 = f1_score(y_true_class, y_pred_class, average='weighted')

print(f"✅ Accuracy (clasificación redondeada): {accuracy:.4f}")
print(f"✅ Precision (clasificación redondeada): {precision:.4f}")
print(f"✅ Recall (clasificación redondeada): {recall:.4f}")
print(f"✅ F1-Score (clasificación redondeada): {f1:.4f}")

# 🔹 14. Guardar modelo
model.save("edu.h5")
print("✅ Modelo y mapas guardados exitosamente.")



✅ Loss (MSE) final de la RNN: 0.0000
✅ RMSE (cálculo manual): 0.0034
✅ Accuracy (clasificación redondeada): 0.8375
✅ Precision (clasificación redondeada): 0.8916
✅ Recall (clasificación redondeada): 0.8375
✅ F1-Score (clasificación redondeada): 0.8342
✅ Modelo y mapas guardados exitosamente.


  saving_api.save_model(


In [6]:
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 movie_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 16)                3206160   ['user_input[0][0]']          
                                                                                                  
 movie_embedding (Embedding  (None, 1, 16)                814080    ['movie_input[0][0]']         
 )                                                                                            

In [8]:
# 📚 Predecir en el test set
y_pred_continuous = model.predict([X_test_user, X_test_movie, X_test_others]).flatten()

# 🔹 Seleccionamos 5 ejemplos aleatorios para mostrar
import random

# Escogemos 5 índices aleatorios
random_indices = random.sample(range(len(y_test)), 5)

print("\n Ejemplos de predicción:\n")
for idx in random_indices:
    print(f"Usuario ID: {X_test_user[idx]}")
    print(f"Película ID: {X_test_movie[idx]}")
    print(f"Rating real: {y_test.iloc[idx]:.1f}")
    print(f"Rating predicho: {y_pred_continuous[idx]:.2f}")
    print("-" * 40)



 Ejemplos de predicción:

Usuario ID: 57248
Película ID: 1704


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [9]:
# 📚 Análisis estadístico de las predicciones
import numpy as np

min_rating = np.min(y_pred_continuous)
max_rating = np.max(y_pred_continuous)
mean_rating = np.mean(y_pred_continuous)
std_rating = np.std(y_pred_continuous)

print("\n Análisis de las predicciones:\n")
print(f" Mínimo rating predicho: {min_rating:.2f}")
print(f" Máximo rating predicho: {max_rating:.2f}")
print(f" Promedio de ratings predichos: {mean_rating:.2f}")
print(f" Desviación estándar de ratings predichos: {std_rating:.2f}")



 Análisis de las predicciones:

 Mínimo rating predicho: 0.87
 Máximo rating predicho: 4.99
 Promedio de ratings predichos: 3.56
 Desviación estándar de ratings predichos: 1.05


In [29]:
import pandas as pd
import numpy as np
import joblib
import holidays

# 1. Cargar dataset histórico (con ratings)
df_train = pd.read_csv("dataset_final.csv", sep=',')  # ajusta sep según tu archivo
df_train = df_train.sort_values(['userId', 'timestamp'])

# 2. Cargar dataset para predecir (sin ratings)
df_pred = pd.read_csv("test.csv", sep=';')  # ajusta sep

# 3. Combinar histórico y predicción para calcular variables históricas
df_train_sub = df_train[['userId', 'movieId', 'timestamp', 'rating']]
df_pred_sub = df_pred[['userId', 'movieId', 'timestamp']]

df_combined = pd.concat([df_train_sub, df_pred_sub], ignore_index=True, sort=False)
df_combined = df_combined.sort_values(['userId', 'timestamp'])

# 4. Calcular rating_previous y diferencia_rating_anterior
df_combined['rating_previous'] = df_combined.groupby('userId')['rating'].shift().fillna(0)
df_combined['diferencia_rating_anterior'] = df_combined['rating'] - df_combined['rating_previous']

# 5. Extraer solo filas correspondientes a datos de predicción
df_pred_updated = df_combined.loc[df_combined.index >= len(df_train)].copy()

# 6. Rellenar NaN si es necesario
df_pred_updated['rating_previous'] = df_pred_updated['rating_previous'].fillna(0)
df_pred_updated['diferencia_rating_anterior'] = df_pred_updated['diferencia_rating_anterior'].fillna(0)

# 7. Añadir esas columnas a df_pred original (asegúrate que los índices coincidan)
df_pred['rating_previous'] = df_pred_updated['rating_previous'].values
df_pred['diferencia_rating_anterior'] = df_pred_updated['diferencia_rating_anterior'].values

# 8. Crear variables temporales y demás características
df_pred['date'] = pd.to_datetime(df_pred['timestamp'], unit='s')
df_pred['year'] = df_pred['date'].dt.year
df_pred['month'] = df_pred['date'].dt.month
df_pred['weekday'] = df_pred['date'].dt.weekday

def get_season(month):
    if month in [12, 1, 2]: return 0
    elif month in [3, 4, 5]: return 1
    elif month in [6, 7, 8]: return 2
    else: return 3

df_pred['season_encoded'] = df_pred['month'].apply(get_season)
df_pred['is_weekend'] = df_pred['weekday'].isin([5, 6]).astype(int)

ecu_holidays = holidays.Ecuador(years=range(df_pred['year'].min(), df_pred['year'].max() + 1))
df_pred['is_holiday'] = df_pred['date'].isin(ecu_holidays).astype(int)

df_pred['week_of_year'] = df_pred['date'].dt.isocalendar().week.astype(int)
df_pred['consumo_semanal_usuario'] = 0
df_pred['antiguedad_rating'] = 2025 - df_pred['year']

# 9. Cargar scaler, UMAP y diccionarios
scaler = joblib.load('scaler_temporal.pkl')
umap_model = joblib.load('umap_model.pkl')
user_id_map = joblib.load('user_id_map.pkl')
movie_id_map = joblib.load('movie_id_map.pkl')

# 10. Mapear userId y movieId
df_pred['userId'] = df_pred['userId'].map(user_id_map).fillna(-1).astype(int)
df_pred['movieId'] = df_pred['movieId'].map(movie_id_map).fillna(-1).astype(int)

# 11. Escalar y aplicar UMAP
temporal_features = ['year', 'month', 'weekday', 'season_encoded', 'is_weekend', 'is_holiday', 'rating_previous']
X_temp = scaler.transform(df_pred[temporal_features])
X_umap = umap_model.transform(X_temp)
df_pred[['temporal_1', 'temporal_2', 'temporal_3']] = X_umap

# 12. Preparar inputs para modelo
feature_cols = [
    'temporal_1', 'temporal_2', 'temporal_3',
    'rating_previous', 'is_weekend', 'season_encoded', 'is_holiday',
    'year', 'month', 'weekday',
    'consumo_semanal_usuario', 'antiguedad_rating', 'diferencia_rating_anterior'
]

X_pred_user = df_pred['userId'].values
X_pred_movie = df_pred['movieId'].values
X_pred_others = df_pred[feature_cols].values.reshape((df_pred.shape[0], 1, len(feature_cols)))

# 13. Cargar modelo y predecir
from tensorflow.keras.models import load_model
model = load_model("edu.h5")

predicted_ratings = model.predict([X_pred_user, X_pred_movie, X_pred_others])
# Redondear a múltiplos de 0.5
df_pred['predicted_rating'] = np.round(predicted_ratings.flatten() * 2) / 2

df_predicted_only = df_pred[['predicted_rating']]

# Guardar solo esa columna en CSV
df_predicted_only.to_csv("predicciones_arce_cordova.csv", index=False)


  df_pred['is_holiday'] = df_pred['date'].isin(ecu_holidays).astype(int)




In [17]:
predictions = model.predict([X_pred_user, X_pred_movie, X_pred_others])

df_pred['predicted_rating'] = predictions.flatten()  # asegúrate que es 1D

df_pred.to_csv('edu_pred2.csv', index=False)




In [23]:
df_train = pd.read_csv("dataset_final.csv", sep=';')
print(df_train.columns)


Index(['userId,movieId,temporal_1,temporal_2,temporal_3,rating,timestamp,rating_previous,is_weekend,season_encoded,is_holiday,year,month,weekday,consumo_semanal_usuario,antiguedad_rating,diferencia_rating_anterior'], dtype='object')


In [19]:
import numpy as np

# Ejemplo de 10 registros para predecir ratings
# user_id y movie_id son enteros indexados (ejemplo arbitrario)
user_ids = np.array([0, 0, 1, 2, 1, 3, 2, 4, 3, 0])
movie_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# 13 características para cada registro (arbitrarias y con algo de variación)
other_features = np.array([
    [5.1, 10.4, 6.2, 0, 1, 3, 0, 1999, 11, 6, 3, 26, 5],
    [5.0, 10.5, 6.0, 5, 1, 3, 0, 1999, 11, 6, 3, 26, 0],
    [-10.2, 5.3, 1.2, 0, 1, 3, 0, 1999, 11, 6, 3, 26, -4],
    [15.5, 17.6, -1.2, 1, 0, 3, 0, 1999, 11, 0, 3, 26, 1],
    [-5.0, 2.6, 3.8, 2, 0, 3, 0, 1999, 11, 0, 3, 26, 3],
    [15.0, 7.8, -8.8, 5, 0, 3, 0, 1999, 11, 0, 3, 26, 0],
    [1.0, 1.0, 1.0, 3, 0, 2, 1, 2000, 1, 1, 2, 10, -1],
    [3.0, 3.0, 3.0, 0, 1, 1, 0, 2001, 6, 3, 0, 12, 4],
    [7.5, 8.0, 7.0, 4, 1, 2, 0, 2002, 12, 5, 1, 14, 2],
    [0.0, 0.0, 0.0, 0, 0, 0, 0, 1998, 5, 4, 2, 20, 0],
])

# Reshape para que tenga (batch_size, 1, 13)
other_features = other_features.reshape((10, 1, 13))

# Suponiendo que tienes el modelo cargado como 'model'
predictions = model.predict([user_ids, movie_ids, other_features])

for i, pred in enumerate(predictions.flatten()):
    print(f"Ejemplo {i+1}: Predicción rating = {pred:.3f}")


Ejemplo 1: Predicción rating = 5.003
Ejemplo 2: Predicción rating = 5.002
Ejemplo 3: Predicción rating = -0.074
Ejemplo 4: Predicción rating = 2.001
Ejemplo 5: Predicción rating = 5.002
Ejemplo 6: Predicción rating = 5.002
Ejemplo 7: Predicción rating = 2.015
Ejemplo 8: Predicción rating = 4.004
Ejemplo 9: Predicción rating = 5.255
Ejemplo 10: Predicción rating = 0.263
