
import pandas as pd
import numpy as np
from fastFM import sgd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy import sparse

df = pd.read_csv("dataset_final.csv")

X = df[['temporal_1', 'temporal_2', 'temporal_3']]
y = df['rating']

# 🔹 3. Normalizar features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convertir a sparse matrix (porque fastFM necesita sparse)
X_sparse = sparse.csr_matrix(X_scaled)

# 🔹 4. Separar en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y, test_size=0.2, random_state=42)

# 🔹 5. Crear el modelo Factorization Machine
fm = sgd.FMRegression(n_iter=1000, init_stdev=0.1, l2_reg_w=0.1, l2_reg_V=0.5, rank=8, random_state=42)

# 🔹 6. Entrenar el modelo
fm.fit(X_train, y_train)

# 🔹 7. Predecir
y_pred = fm.predict(X_test)

# 🔹 8. Evaluar con RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"\n🔍 RMSE obtenido en test: {rmse:.4f}")



### Entrenamiento

In [30]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Masking, TimeDistributed, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

# === 1. Cargar dataset ===
df = pd.read_csv("dataset_final_3.csv")

# === 1A. Oversampling de ratings ≤ 2.0 ===
df_low = df[df['rating'] <= 2.0]
factor = int(np.ceil(df['rating'].value_counts().max() / len(df_low)))

# Concatenar varias veces y barajar
df = pd.concat([df] + [df_low] * factor, ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# === 2. Features usadas ===
temporal_features = [
    'temporal_1', 'temporal_2', 'temporal_3',
    'is_weekend', 'season_encoded',  # estos no se escalan
    'year', 'month', 'weekday',
    'consumo_semanal_usuario',
    'interactions_user', 'interactions_movie',
    'days_since_last_user_interaction'
]


# === 3. Codificar userId y movieId ===
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

df['user_id_enc'] = user_encoder.fit_transform(df['userId'])
df['movie_id_enc'] = movie_encoder.fit_transform(df['movieId'])

n_users = df['user_id_enc'].nunique()
n_movies = df['movie_id_enc'].nunique()

# === 4. Agrupar en secuencias ===
user_seqs = defaultdict(list)
movie_seqs = defaultdict(list)
rating_seqs = defaultdict(list)

for _, row in df.iterrows():
    uid = row['user_id_enc']
    mid = row['movie_id_enc']
    x = row[temporal_features].values.astype('float32')
    r = row['rating']
    user_seqs[uid].append(x)
    movie_seqs[uid].append(mid)
    rating_seqs[uid].append(r)

# === 5. Convertir a listas ===
X_seq = list(user_seqs.values())
movie_seq = list(movie_seqs.values())
y_seq = list(rating_seqs.values())

# === 6. Padding ===
maxlen = 50
X_padded = pad_sequences(X_seq, maxlen=maxlen, padding='pre', dtype='float32')
movie_padded = pad_sequences(movie_seq, maxlen=maxlen, padding='pre')
y_padded = pad_sequences(y_seq, maxlen=maxlen, padding='pre', dtype='float32')
user_ids = np.array(list(user_seqs.keys()))

# === 7. Separar en train/test ===
X_tr, X_te, M_tr, M_te, U_tr, U_te, Y_tr, Y_te = train_test_split(
    X_padded, movie_padded, user_ids, y_padded, test_size=0.2, random_state=42
)

# === 8. Modelo con embeddings ===
input_seq = Input(shape=(maxlen, X_tr.shape[2]), name='features_input')
input_uid = Input(shape=(1,), name='user_input')
input_mid = Input(shape=(maxlen,), name='movie_input')



In [31]:
# Embedding usuario (igual en todos los pasos → expandimos)
import tensorflow as tf
user_emb = Embedding(input_dim=n_users, output_dim=16, name='user_embedding')(input_uid)
user_emb = Dropout(0.2)(user_emb)
from tensorflow.keras.layers import Reshape

user_emb = Reshape((16,))(user_emb)  # convierte (None, 1, 16) → (None, 16)
user_emb = tf.keras.layers.RepeatVector(maxlen)(user_emb)  # ahora sí funciona

# Embedding por película por paso
movie_emb = Embedding(input_dim=n_movies, output_dim=16, name='movie_embedding')(input_mid)  # shape: (batch, maxlen, 16)

# Concatenar embeddings + features
x_concat = Concatenate(axis=-1)([input_seq, user_emb, movie_emb])

# Red LSTM
x = Masking(mask_value=0.0)(x_concat)
x = LSTM(64, return_sequences=True)(x)
x = Dropout(0.4)(x)
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda

output = TimeDistributed(Dense(1, activation='sigmoid'))(x)
output = Lambda(lambda y: y * 4.5 + 0.5)(output)  # mapea [0,1] -> [0.5, 5.0]
# Modelo final
model = Model(inputs=[input_seq, input_uid, input_mid], outputs=output)

model.compile(optimizer='adam', loss='huber', metrics=['mae', 'mse'])
model.summary()

# === 9. Entrenamiento ===
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    [X_tr, U_tr, M_tr],
    Y_tr[..., np.newaxis],  # expandir para (batch, steps, 1)
    validation_data=([X_te, U_te, M_te], Y_te[..., np.newaxis]),
    epochs=25,
    batch_size=128,
    callbacks=[early_stop],
    verbose=1
)
import joblib

# Luego de hacer fit
user_encoder = LabelEncoder()
user_encoder.fit(df['userId'])

movie_encoder = LabelEncoder()
movie_encoder.fit(df['movieId'])

# Guardar para usar en predicción después
joblib.dump(user_encoder, "user_encoder.pkl")
joblib.dump(movie_encoder, "movie_encoder.pkl")

# === 10. Evaluación ===
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = model.predict([X_te, U_te, M_te]).squeeze()

rmse = np.sqrt(mean_squared_error(Y_te.flatten(), y_pred.flatten()))
mae = mean_absolute_error(Y_te.flatten(), y_pred.flatten())

print(f"\n✅ RMSE: {rmse:.4f}")
print(f"✅ MAE: {mae:.4f}")

# === 11. Guardar modelo ===
model.save("rnn_model_with_embeddings4.h5")


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 16)                3206144   ['user_input[0][0]']          
                                                                                                  
 dropout_7 (Dropout)         (None, 1, 16)                0         ['user_embedding[0][0]']      
                                                                                                  
 reshape_4 (Reshape)         (None, 16)                   0         ['dropout_7[0][0]']           
                                                                                            

  saving_api.save_model(


## Predicciones

In [25]:
def procesar_test(df_new_raw, user_encoder, movie_encoder, scaler, reducer):
    import holidays
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    df = df_new_raw.copy()

    # --- Codificación segura para userId y movieId ---
    known_users = set(user_encoder.classes_)
    known_movies = set(movie_encoder.classes_)

    df['userId'] = df['userId'].apply(lambda x: x if x in known_users else -1)
    df['movieId'] = df['movieId'].apply(lambda x: x if x in known_movies else -1)

    if -1 not in user_encoder.classes_:
        user_encoder.classes_ = np.append(user_encoder.classes_, -1)
    if -1 not in movie_encoder.classes_:
        movie_encoder.classes_ = np.append(movie_encoder.classes_, -1)

    df['user_id_enc'] = user_encoder.transform(df['userId'])
    df['movie_id_enc'] = movie_encoder.transform(df['movieId'])

    # --- Temporal features ---
    df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['weekday'] = df['date'].dt.weekday
    df['is_weekend'] = df['weekday'].isin([5,6]).astype(int)

    def get_season(month):
        if month in [12, 1, 2]: return 'invierno'
        elif month in [3, 4, 5]: return 'primavera'
        elif month in [6, 7, 8]: return 'verano'
        else: return 'otoño'

    df['season'] = df['month'].apply(get_season)
    df['season_encoded'] = df['season'].map({'invierno':0, 'primavera':1, 'verano':2, 'otoño':3})
    df['is_holiday'] = df['date'].isin(holidays.Ecuador(years=range(df['year'].min(), df['year'].max()+1))).astype(int)

    # --- Consumo semanal ---
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
    consumo = df.groupby(['userId', 'week_of_year']).size().reset_index(name='consumo_semanal_usuario')
    df = pd.merge(df, consumo, on=['userId', 'week_of_year'], how='left')

    # --- UMAP features ---
    umap_input = scaler.transform(df[['year', 'month', 'weekday', 'season_encoded', 'is_weekend', 'is_holiday']])
    umap_result = reducer.transform(umap_input)
    df[['temporal_1', 'temporal_2', 'temporal_3']] = umap_result

    return df


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import joblib
import holidays

# === Cargar modelos previos ===
user_encoder = joblib.load("user_encoder.pkl")
movie_encoder = joblib.load("movie_encoder.pkl")
scaler_umap = joblib.load("scaler_temporal2.pkl")
reducer = joblib.load("umap_model2.pkl")
scaler_feats = joblib.load("scaler_features_rnn.pkl")

# === Cargar test ===
df_new = pd.read_csv("testfinal.csv")

# === Codificación segura de IDs ===
known_users = set(user_encoder.classes_)
known_movies = set(movie_encoder.classes_)

df_new['userId'] = df_new['userId'].apply(lambda x: x if x in known_users else -1)
df_new['movieId'] = df_new['movieId'].apply(lambda x: x if x in known_movies else -1)

if -1 not in user_encoder.classes_:
    user_encoder.classes_ = np.append(user_encoder.classes_, -1)
if -1 not in movie_encoder.classes_:
    movie_encoder.classes_ = np.append(movie_encoder.classes_, -1)

df_new['user_id_enc'] = user_encoder.transform(df_new['userId'])
df_new['movie_id_enc'] = movie_encoder.transform(df_new['movieId'])

# === Features temporales ===
df_new['date'] = pd.to_datetime(df_new['timestamp'], unit='s')
df_new['year'] = df_new['date'].dt.year
df_new['month'] = df_new['date'].dt.month
df_new['weekday'] = df_new['date'].dt.weekday
df_new['is_weekend'] = df_new['weekday'].isin([5, 6]).astype(int)
df_new['week_of_year'] = df_new['date'].dt.isocalendar().week.astype(int)

df_new['season'] = df_new['month'].apply(
    lambda m: 'invierno' if m in [12, 1, 2] else
              'primavera' if m in [3, 4, 5] else
              'verano' if m in [6, 7, 8] else 'otoño'
)
df_new['season_encoded'] = df_new['season'].map({'invierno': 0, 'primavera': 1, 'verano': 2, 'otoño': 3})
df_new['is_holiday'] = df_new['date'].isin(holidays.Ecuador(
    years=range(df_new['year'].min(), df_new['year'].max() + 1)
)).astype(int)

# === Consumo semanal ===
consumo = df_new.groupby(['userId', 'week_of_year']).size().reset_index(name='consumo_semanal_usuario')
df_new = pd.merge(df_new, consumo, on=['userId', 'week_of_year'], how='left')

# === UMAP features ===
umap_input = scaler_umap.transform(df_new[['year', 'month', 'weekday', 'season_encoded', 'is_weekend', 'is_holiday']])
umap_result = reducer.transform(umap_input)
df_new[['temporal_1', 'temporal_2', 'temporal_3']] = umap_result

# === Interacciones y tiempo desde última interacción ===
df_new['interactions_user'] = df_new.groupby('userId').cumcount() + 1
df_new['interactions_movie'] = df_new.groupby('movieId').cumcount() + 1

df_new['last_timestamp'] = df_new.groupby('userId')['timestamp'].shift(1)
df_new['days_since_last_user_interaction'] = (df_new['timestamp'] - df_new['last_timestamp']) / (60 * 60 * 24)
df_new['days_since_last_user_interaction'] = df_new['days_since_last_user_interaction'].fillna(0)

# === Estandarizar variables ===
features_to_scale = [
    'year', 'month', 'weekday',
    'interactions_user', 'interactions_movie',
    'days_since_last_user_interaction',
    'consumo_semanal_usuario'
]
df_new[features_to_scale] = scaler_feats.transform(df_new[features_to_scale])



  df_new['is_holiday'] = df_new['date'].isin(holidays.Ecuador(


In [38]:
# --- Features finales para input ---
temporal_features = [
    'temporal_1', 'temporal_2', 'temporal_3',
    'is_weekend', 'season_encoded',
    'year', 'month', 'weekday',
    'consumo_semanal_usuario',
    'interactions_user', 'interactions_movie', 'days_since_last_user_interaction'
]

# --- Agrupar en secuencias ---
user_sequences = defaultdict(list)
movie_sequences = defaultdict(list)

for _, row in df_new.iterrows():
    uid = row['user_id_enc']
    mid = row['movie_id_enc']
    x = row[temporal_features].values.astype('float32')
    user_sequences[uid].append(x)
    movie_sequences[uid].append(mid)

# --- Padding ---
X_new = list(user_sequences.values())
M_new = list(movie_sequences.values())
U_new = np.array(list(user_sequences.keys()))

maxlen = 50
X_new_padded = pad_sequences(X_new, maxlen=maxlen, padding='pre', dtype='float32')
M_new_padded = pad_sequences(M_new, maxlen=maxlen, padding='pre')
U_new_padded = U_new.reshape(-1, 1)



In [39]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np

# === Cargar modelo ===
model = load_model("rnn_model_with_embeddings4.h5")

# === Predecir secuencia ===
y_pred_seq = model.predict([X_new_padded, U_new_padded, M_new_padded]).squeeze()

# === Obtener última predicción válida por usuario ===
y_pred_last = []
for seq_pred, seq_input in zip(y_pred_seq, X_new_padded):
    mask = np.any(seq_input != 0, axis=1)
    last_valid_idx = np.where(mask)[0][-1]
    y_pred_last.append(seq_pred[last_valid_idx])

# === Reconstruir userId de forma segura ===
# Validar que todos los valores de U_new estén dentro del rango válido del encoder
valid_range = len(user_encoder.classes_)
user_ids_decoded = []

for u in U_new.flatten():
    if u < valid_range:
        user_ids_decoded.append(user_encoder.inverse_transform([u])[0])
    else:
        user_ids_decoded.append("unknown")

# === Resultados finales ===
results = pd.DataFrame({
    'userId': user_ids_decoded,
    'predicted_last_rating': np.round(y_pred_last, 1)
})

# === Exportar o mostrar ===
print("✅ Predicciones realizadas:")
print(results.head(20))
results.to_csv("predicciones_rnn_embeddings.csv", index=False)


✅ Predicciones realizadas:
    userId  predicted_last_rating
0    50403                    3.7
1    32443                    4.6
2   188231                    3.3
3    54812                    4.1
4    32676                    3.8
5    89328                    1.2
6    19325                    1.1
7   150689                    1.7
8   115166                    1.8
9    99558                    3.6
10  105746                    1.8
11  173703                    4.5
12   28872                    3.6
13  179715                    2.5
14  177642                    4.5
15  179218                    1.3
16  151556                    4.1
17   31053                    4.9
18  120287                    4.7
19   49000                    4.4


In [40]:
results['predicted_last_rating'] = np.round(results['predicted_last_rating'] * 2) / 2

# Mostrar resultados
print("✅ Predicciones redondeadas:")
print(results.head(20))


✅ Predicciones redondeadas:
    userId  predicted_last_rating
0    50403                    3.5
1    32443                    4.5
2   188231                    3.5
3    54812                    4.0
4    32676                    4.0
5    89328                    1.0
6    19325                    1.0
7   150689                    1.5
8   115166                    2.0
9    99558                    3.5
10  105746                    2.0
11  173703                    4.5
12   28872                    3.5
13  179715                    2.5
14  177642                    4.5
15  179218                    1.5
16  151556                    4.0
17   31053                    5.0
18  120287                    4.5
19   49000                    4.5
