In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from codecarbon import EmissionsTracker

In [3]:
# Load MovieLens dataset
movies_df = pd.read_csv('C://Users//xpati//Documents//TFG//ml-latest-small//movies.csv')
ratings_df = pd.read_csv('C://Users//xpati//Documents//TFG//ml-latest-small//ratings.csv')

# Display first few rows
print(movies_df.head())
print(ratings_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [4]:
# Label encoding of user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_df['userId'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movieId'] = movie_encoder.fit_transform(ratings_df['movieId'])

# Get number of unique users and movies
n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)

print(f'Number of users: {n_users}')
print(f'Number of movies: {n_movies}')

Number of users: 610
Number of movies: 9724


In [5]:
class MovieLensDataset(Dataset):
    def __init__(self, ratings_df, n_users, n_movies):
        self.users = ratings_df['userId'].values
        self.movies = ratings_df['movieId'].values
        self.ratings = ratings_df['rating'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return torch.tensor(self.users[idx], dtype=torch.long), \
               torch.tensor(self.movies[idx], dtype=torch.long), \
               torch.tensor(self.ratings[idx], dtype=torch.float32)

In [6]:
class NeuralCollaborativeFiltering(nn.Module):
    def __init__(self, n_users, n_movies, embedding_dim=50, hidden_dim=128):
        super(NeuralCollaborativeFiltering, self).__init__()
        
        # Embedding layers for users and movies
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)
        
        # MLP layers for neural network
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, user, movie):
        # Get embeddings
        user_embedded = self.user_embedding(user)
        movie_embedded = self.movie_embedding(movie)
        
        # Concatenate embeddings
        x = torch.cat([user_embedded, movie_embedded], dim=1)
        
        # Pass through MLP
        return self.mlp(x).squeeze()

In [7]:
# Split data into training and validation sets
train_df, val_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Create DataLoader for train and validation
train_dataset = MovieLensDataset(train_df, n_users, n_movies)
val_dataset = MovieLensDataset(val_df, n_users, n_movies)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [8]:
# Initialize the model
model = NeuralCollaborativeFiltering(n_users, n_movies, embedding_dim=50, hidden_dim=128)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression task
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Initialize the CodeCarbon tracker
tracker = EmissionsTracker()

# Training loop
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    
    # Start tracking emissions for this epoch
    tracker.start()
    
    for user, movie, rating in train_loader:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        output = model(user, movie)
        
        # Compute loss
        loss = criterion(output, rating)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Stop tracking emissions for this epoch
    emissions = tracker.stop()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{n_epochs}, Training Loss: {avg_train_loss:.4f}')
    print(f'Emissions for Epoch {epoch + 1}: {emissions:.4f} kg CO2')

# Final emissions report
total_emissions = tracker.final_emissions
print(f'Total emissions during training: {total_emissions:.4f} kg CO2')

[codecarbon INFO @ 17:13:05] [setup] RAM Tracking...
[codecarbon INFO @ 17:13:05] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU

[codecarbon INFO @ 17:13:07] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
[codecarbon INFO @ 17:13:07] [setup] GPU Tracking...
[codecarbon INFO @ 17:13:07] No GPU found.
[codecarbon INFO @ 17:13:07] >>> Tracker's metadata:
[codecarbon INFO @ 17:13:07]   Platform system: Windows-10-10.0.26100-SP0
[codecarbon INFO @ 17:13:07]   Python version: 3.8.8
[codecarbon INFO @ 17:13:07]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 17:13:07]   Available RAM : 15.747 GB
[codecarbon INFO @ 17:13:07]   CPU count: 8
[codecarbon INFO @ 17:13:07]   CPU model: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
[codecarbon INFO @ 17:13:07]   GPU count: None
[codecarbon INFO @ 17:13:07]   GPU model: None
[codecarbon INFO @ 17:13:09] Saving emissions data to file C:\Users\xpati\Documents\TFG\TFG_GCED\Mo

Epoch 1/10, Training Loss: 1.2742
Emissions for Epoch 1: 0.0000 kg CO2


[codecarbon INFO @ 17:13:25] Energy consumed for RAM : 0.000027 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:13:25] Energy consumed for all CPUs : 0.000057 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:13:25] 0.000083 kWh of electricity used since the beginning.


Epoch 2/10, Training Loss: 0.8231
Emissions for Epoch 2: 0.0000 kg CO2


[codecarbon INFO @ 17:13:33] Energy consumed for RAM : 0.000040 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:13:33] Energy consumed for all CPUs : 0.000085 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:13:33] 0.000125 kWh of electricity used since the beginning.


Epoch 3/10, Training Loss: 0.7546
Emissions for Epoch 3: 0.0000 kg CO2


[codecarbon INFO @ 17:13:41] Energy consumed for RAM : 0.000053 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:13:41] Energy consumed for all CPUs : 0.000112 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:13:41] 0.000165 kWh of electricity used since the beginning.


Epoch 4/10, Training Loss: 0.7067
Emissions for Epoch 4: 0.0000 kg CO2


[codecarbon INFO @ 17:13:49] Energy consumed for RAM : 0.000065 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:13:49] Energy consumed for all CPUs : 0.000138 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:13:49] 0.000204 kWh of electricity used since the beginning.


Epoch 5/10, Training Loss: 0.6697
Emissions for Epoch 5: 0.0000 kg CO2


[codecarbon INFO @ 17:13:56] Energy consumed for RAM : 0.000077 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:13:56] Energy consumed for all CPUs : 0.000164 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:13:56] 0.000241 kWh of electricity used since the beginning.


Epoch 6/10, Training Loss: 0.6391
Emissions for Epoch 6: 0.0000 kg CO2


[codecarbon INFO @ 17:14:03] Energy consumed for RAM : 0.000090 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:14:03] Energy consumed for all CPUs : 0.000190 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:14:03] 0.000279 kWh of electricity used since the beginning.


Epoch 7/10, Training Loss: 0.6128
Emissions for Epoch 7: 0.0000 kg CO2


[codecarbon INFO @ 17:14:11] Energy consumed for RAM : 0.000101 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:14:11] Energy consumed for all CPUs : 0.000215 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:14:11] 0.000316 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:14:11] 0.000890 g.CO2eq/s mean an estimation of 28.05540132477771 kg.CO2eq/year


Epoch 8/10, Training Loss: 0.5880
Emissions for Epoch 8: 0.0001 kg CO2


[codecarbon INFO @ 17:14:18] Energy consumed for RAM : 0.000113 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:14:18] Energy consumed for all CPUs : 0.000240 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:14:18] 0.000353 kWh of electricity used since the beginning.


Epoch 9/10, Training Loss: 0.5650
Emissions for Epoch 9: 0.0001 kg CO2


[codecarbon INFO @ 17:14:25] Energy consumed for RAM : 0.000125 kWh. RAM Power : 5.905032634735107 W
[codecarbon INFO @ 17:14:25] Energy consumed for all CPUs : 0.000264 kWh. Total CPU Power : 12.5 W
[codecarbon INFO @ 17:14:25] 0.000389 kWh of electricity used since the beginning.


Epoch 10/10, Training Loss: 0.5432
Emissions for Epoch 10: 0.0001 kg CO2
Total emissions during training: 0.0001 kg CO2


In [9]:
# Evaluate the model on the validation set
model.eval()
total_val_loss = 0

with torch.no_grad():
    for user, movie, rating in val_loader:
        output = model(user, movie)
        loss = criterion(output, rating)
        total_val_loss += loss.item()

avg_val_loss = total_val_loss / len(val_loader)
print(f'Validation Loss: {avg_val_loss:.4f}')

Validation Loss: 0.8480


In [10]:
import numpy as np

# Function to calculate RMSE and MAE
def evaluate_metrics(model, val_loader):
    model.eval()
    total_squared_error = 0
    total_absolute_error = 0
    total_samples = 0
    
    with torch.no_grad():
        for user, movie, rating in val_loader:
            # Forward pass
            output = model(user, movie)
            
            # Calculate squared error and absolute error
            squared_error = (output - rating) ** 2
            absolute_error = torch.abs(output - rating)
            
            # Accumulate total errors
            total_squared_error += squared_error.sum().item()
            total_absolute_error += absolute_error.sum().item()
            total_samples += len(rating)
    
    # Compute RMSE and MAE
    rmse = np.sqrt(total_squared_error / total_samples)
    mae = total_absolute_error / total_samples
    
    return rmse, mae

# Evaluate RMSE and MAE on validation data
rmse, mae = evaluate_metrics(model, val_loader)
print(f'Validation RMSE: {rmse:.4f}')
print(f'Validation MAE: {mae:.4f}')

Validation RMSE: 0.9216
Validation MAE: 0.7060
