In [3]:
import urllib.request
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split


movielens_data = pd.read_csv('/kaggle/input/dataset/Dataset.csv')

user_item_matrix = movielens_data.pivot(index='user_id', columns='movieId', values='rating').fillna(0)


user_item_matrix_bin = (user_item_matrix > 3).astype(float)

latent_dim = 128  

generator = keras.Sequential([
    keras.Input(shape=(latent_dim,)),
    layers.Dense(128, activation="relu"),
    layers.Dense(128),
    layers.LeakyReLU(),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(user_item_matrix.shape[1], activation="sigmoid")  
], name="generator")

discriminator = keras.Sequential([
    keras.Input(shape=(user_item_matrix.shape[1],)),
    layers.Dense(128, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")  
], name="discriminator")

class GAN(keras.Model):
    def __init__(self, discriminator, generator, latent_dim):
        super().__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.d_loss_metric = keras.metrics.Mean(name="d_loss")
        self.g_loss_metric = keras.metrics.Mean(name="g_loss")
    
    def compile(self, d_optimizer, g_optimizer, loss_fn):
        super(GAN, self).compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn
    
    @property
    def metrics(self):
        return [self.d_loss_metric, self.g_loss_metric]

    def train_step(self, real_matrix):
        batch_size = tf.shape(real_matrix)[0]
        
        random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
        generated_matrix = self.generator(random_latent_vectors)

        combined_matrix = tf.concat([generated_matrix, real_matrix], axis=0)
        labels = tf.concat([tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0)  
        labels += 0.05 * tf.random.uniform(tf.shape(labels))

        with tf.GradientTape() as tape:
            predictions = self.discriminator(combined_matrix)
            d_loss = self.loss_fn(labels, predictions)
        
        grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
        self.d_optimizer.apply_gradients(zip(grads, self.discriminator.trainable_weights))
        
        misleading_labels = tf.zeros((batch_size, 1))
        with tf.GradientTape() as tape:
            predictions = self.discriminator(self.generator(random_latent_vectors))
            g_loss = self.loss_fn(misleading_labels, predictions)
        
        grads = tape.gradient(g_loss, self.generator.trainable_weights)
        self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights))

        self.d_loss_metric.update_state(d_loss)
        self.g_loss_metric.update_state(g_loss)
        return {"d_loss": self.d_loss_metric.result(), "g_loss": self.g_loss_metric.result()}

gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    g_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss_fn=keras.losses.BinaryCrossentropy()
)

train_data, test_data = train_test_split(user_item_matrix_bin, test_size=0.2, random_state=42)


epochs = 100  
batch_size = 32

num_samples = train_data.shape[0]
for epoch in range(epochs):
    d_loss_epoch = 0
    g_loss_epoch = 0
    batch_count = 0
    
    for i in range(0, num_samples, batch_size):
        real_matrix_batch = train_data[i:i + batch_size]
        metrics = gan.train_step(real_matrix_batch)
        d_loss_epoch += metrics['d_loss']
        g_loss_epoch += metrics['g_loss']
        batch_count += 1

    d_loss_avg = d_loss_epoch / batch_count
    g_loss_avg = g_loss_epoch / batch_count
    
    print(f"Epoch {epoch+1}/{epochs} completed. Discriminator Loss: {d_loss_avg:.4f}, Generator Loss: {g_loss_avg:.4f}")

def evaluate_gan(generator, real_data, latent_dim):
 
    num_samples = real_data.shape[0]
    latent_vectors = tf.random.normal(shape=(num_samples, latent_dim))
    generated_ratings = generator(latent_vectors).numpy()
    real_ratings_flat = real_data.values.flatten()
    generated_ratings_flat = generated_ratings.flatten()
    mse = mean_squared_error(real_ratings_flat, generated_ratings_flat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(real_ratings_flat, generated_ratings_flat)

    return {"MSE": mse, "RMSE": rmse, "MAE": mae}

evaluation_results = evaluate_gan(generator, test_data, latent_dim)
print(f"Evaluation Results: MSE = {evaluation_results['MSE']:.4f}, RMSE = {evaluation_results['RMSE']:.4f}, MAE = {evaluation_results['MAE']:.4f}")


Epoch 1/100 completed. Discriminator Loss: 0.4342, Generator Loss: 2.4106
Epoch 2/100 completed. Discriminator Loss: 0.3077, Generator Loss: 3.8353
Epoch 3/100 completed. Discriminator Loss: 0.2431, Generator Loss: 4.4933
Epoch 4/100 completed. Discriminator Loss: 0.2057, Generator Loss: 4.6861
Epoch 5/100 completed. Discriminator Loss: 0.1822, Generator Loss: 4.6642
Epoch 6/100 completed. Discriminator Loss: 0.1611, Generator Loss: 4.7831
Epoch 7/100 completed. Discriminator Loss: 0.1411, Generator Loss: 5.0418
Epoch 8/100 completed. Discriminator Loss: 0.1241, Generator Loss: 5.3426
Epoch 9/100 completed. Discriminator Loss: 0.1089, Generator Loss: 5.6584
Epoch 10/100 completed. Discriminator Loss: 0.0950, Generator Loss: 6.0107
Epoch 11/100 completed. Discriminator Loss: 0.0822, Generator Loss: 6.4063
Epoch 12/100 completed. Discriminator Loss: 0.0696, Generator Loss: 6.8486
Epoch 13/100 completed. Discriminator Loss: 0.0574, Generator Loss: 7.3217
Epoch 14/100 completed. Discrimina

In [7]:
import os
import pickle

save_dir = '/kaggle/working/'  
os.makedirs(save_dir, exist_ok=True)

generator_model_path = os.path.join(save_dir, 'gen_model.pkl')
discriminator_model_path = os.path.join(save_dir, 'disc_model.pkl')
gan_model_path = os.path.join(save_dir, 'gan_model.pkl')

with open(generator_model_path, 'wb') as f:
    pickle.dump(generator, f)
    
with open(discriminator_model_path, 'wb') as f:
    pickle.dump(discriminator, f)

with open(gan_model_path, 'wb') as f:
    pickle.dump(gan, f)

print(f"Models saved at {save_dir}")

Models saved at /kaggle/working/


In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


num_users, num_items = user_item_matrix.shape

latent_dim = 128

encoder_input = keras.Input(shape=(num_items,))
encoded = layers.Dense(256, activation="relu")(encoder_input)
encoded = layers.Dense(128, activation="relu")(encoded)
encoded = layers.Dense(latent_dim, activation="relu")(encoded)

encoder = keras.Model(encoder_input, encoded, name="encoder")

decoder_input = keras.Input(shape=(latent_dim,))
decoded = layers.Dense(128, activation="relu")(decoder_input)
decoded = layers.Dense(256, activation="relu")(decoded)
decoded = layers.Dense(num_items, activation="sigmoid")(decoded)

decoder = keras.Model(decoder_input, decoded, name="decoder")

autoencoder_input = keras.Input(shape=(num_items,))
encoded_output = encoder(autoencoder_input)
decoded_output = decoder(encoded_output)

autoencoder = keras.Model(autoencoder_input, decoded_output, name="autoencoder")
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(train_data, train_data, epochs=50, batch_size=32, validation_data=(test_data, test_data))

latent_vectors = encoder.predict(user_item_matrix_bin)

np.save('latent_vectors.npy', latent_vectors)



Epoch 1/50


I0000 00:00:1728854649.331936     179 service.cc:145] XLA service 0x7a442000e020 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728854649.331979     179 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1728854649.331983     179 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 1/24[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:06[0m 3s/step - loss: 0.2500

I0000 00:00:1728854650.712876     179 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - loss: 0.1841 - val_loss: 0.0326
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0345 - val_loss: 0.0330
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0371 - val_loss: 0.0330
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0347 - val_loss: 0.0331
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0352 - val_loss: 0.0330
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0348 - val_loss: 0.0331
Epoch 7/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0350 - val_loss: 0.0330
Epoch 8/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0354 - val_loss: 0.0331
Epoch 9/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import pickle

latent_vectors = np.load('latent_vectors.npy')

movies_data = pd.read_csv('/kaggle/input/dataset/Movie_Id_Titles.csv')

movie_id_to_name = pd.Series(movies_data['title'].values, index=movies_data['item_id']).to_dict()

latent_dim = latent_vectors.shape[1]  
num_movies = len(movie_id_to_name)

with open("/kaggle/working/gen_model.pkl", "rb") as file:
    generation = pickle.load(file)
    
def generate_movie_recommendations(user_id, latent_vectors, top_n=10):
  
    user_latent_vector = latent_vectors[user_id - 1]  
    generated_ratings = generation(np.expand_dims(user_latent_vector, axis=0)).numpy().flatten()
    top_movie_indices = np.argsort(generated_ratings)[::-1][:top_n]
    recommended_movie_ids = movie_id_to_name.keys()
    recommended_movie_names = [movie_id_to_name[list(recommended_movie_ids)[i]] for i in top_movie_indices]
    
    return recommended_movie_names

user_id = 5  
recommended_movies = generate_movie_recommendations(user_id, latent_vectors, top_n=10)

print(f"Recommended Movies for User {user_id}:")
for i, movie in enumerate(recommended_movies, 1):
    print(f"{i}. {movie}")


Recommended Movies for User 5:
1. E.T. the Extra-Terrestrial (1982)
2. Return of the Jedi (1983)
3. Fargo (1996)
4. Shawshank Redemption, The (1994)
5. My Left Foot (1989)
6. Star Wars (1977)
7. Hudsucker Proxy, The (1994)
8. Psycho (1960)
9. Toy Story (1995)
10. Ransom (1996)


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import pickle

num_cols = ['rating']
categ_cols = ['item_id']  


categ_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))
])

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

all_pipeline = ColumnTransformer(transformers=[
    ('numerical', num_pipeline, num_cols),
    ('categorical', categ_pipeline, categ_cols)
])

movielens_data = pd.read_csv('/kaggle/input/dataset/Dataset.csv')

train_data, test_data = train_test_split(movielens_data, test_size=0.2, random_state=42)

X_train_final = all_pipeline.fit_transform(train_data)
X_test_final = all_pipeline.transform(test_data)

output_cols = num_cols + all_pipeline.named_transformers_['categorical'].named_steps['ohe'].get_feature_names_out(categ_cols).tolist()
pd.DataFrame(X_train_final, columns=output_cols)

save_dir = os.getcwd()
pipeline_path = os.path.join(save_dir, 'all_pipeline.pkl')
with open(pipeline_path, 'wb') as f:
    pickle.dump(all_pipeline, f)

print(f"Pipeline saved successfully at {pipeline_path}")




Pipeline saved successfully at /kaggle/working/all_pipeline.pkl
