In [1]:
import os
import math
import copy
from itertools import zip_longest

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim

In [2]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)

In [3]:
RANDOM_STATE = 42
set_random_seed(RANDOM_STATE)

In [4]:
DATASET_LINK='https://files.grouplens.org/datasets/movielens/ml-32m.zip'

In [5]:
import urllib.request
import zipfile

zip_filename = DATASET_LINK.split('/')[-1]
'''This script downloads the MovieLens 32M dataset if it is not already present, and extracts it
from the zip file. The dataset is used for building recommendation systems.

# Download the dataset if not already present
if not os.path.exists(zip_filename):
	urllib.request.urlretrieve(DATASET_LINK, zip_filename)

# Unzip the file if not already extracted
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
	zip_ref.extractall()'''

"This script downloads the MovieLens 32M dataset if it is not already present, and extracts it\nfrom the zip file. The dataset is used for building recommendation systems.\n\n# Download the dataset if not already present\nif not os.path.exists(zip_filename):\n\turllib.request.urlretrieve(DATASET_LINK, zip_filename)\n\n# Unzip the file if not already extracted\nwith zipfile.ZipFile(zip_filename, 'r') as zip_ref:\n\tzip_ref.extractall()"

In [6]:
def read_data(path):
    files = {}
    for filename in os.listdir(path):
        stem, suffix =  os.path.splitext(filename)
        file_path = os.path.join(path,filename)
        print(filename)
        if suffix == '.csv':
            files[stem] = pd.read_csv(file_path)
        elif suffix == '.dat':
            if stem == 'ratings':
                columns = ['userId', 'movieId', 'rating', 'timestamp']
            else:
                columns = ['movieId', 'title', 'genres']
            data = pd.read_csv(file_path, sep='::', names=columns, engine='python')
            files[stem] = data
    return files['ratings'], files['movies']

In [7]:
ratings, movies = read_data('ml-32m')

checksums.txt
links.csv
movies.csv
ratings.csv
README.txt
tags.csv


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [9]:
minmax = ratings.rating.min(), ratings.rating.max()
print(f"Rating range: {minmax}")

Rating range: (0.5, 5.0)


In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
movies.drop_duplicates().shape

(87585, 3)

In [12]:
ratings = ratings.merge(movies[["movieId", "title"]], on="movieId")

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,17,4.0,944249077,Sense and Sensibility (1995)
1,3,17,5.0,1084485217,Sense and Sensibility (1995)
2,15,17,4.5,1289858271,Sense and Sensibility (1995)
3,28,17,4.0,961513829,Sense and Sensibility (1995)
4,29,17,4.0,845056111,Sense and Sensibility (1995)


In [14]:
def tabular_preview(ratings, n=15):
    """Creates a cross-tabular view of users vs movies."""
    
    user_groups = ratings.groupby('userId')['rating'].count()
    top_users = user_groups.sort_values(ascending=False)[:n]

    movie_groups = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_groups.sort_values(ascending=False)[:n]

    top = (
        ratings.
        join(top_users, rsuffix='_r', how='inner', on='userId').
        join(top_movies, rsuffix='_r', how='inner', on='movieId'))

    return pd.crosstab(top.userId, top.movieId, top.rating, aggfunc=np.sum)

In [15]:
tabular_preview(ratings, n=15)

movieId,1,50,110,260,296,318,356,480,527,589,593,1196,2571,2959,4993
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7858,5.0,5.0,3.5,4.5,5.0,5.0,2.0,3.0,5.0,3.5,2.5,5.0,5.0,4.0,5.0
10202,5.0,5.0,1.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0
14674,3.0,4.5,5.0,5.0,2.0,2.5,5.0,2.0,3.5,5.0,3.0,5.0,4.0,3.5,5.0
17035,3.5,5.0,4.0,0.5,5.0,5.0,3.5,3.0,4.5,3.5,5.0,0.5,1.5,4.5,3.0
22744,5.0,4.0,4.0,5.0,5.0,3.0,4.0,5.0,0.5,5.0,5.0,5.0,5.0,5.0,5.0
49305,5.0,4.5,4.0,5.0,5.0,4.5,4.5,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.5
53192,,4.0,3.5,5.0,3.5,4.5,4.0,3.0,4.0,3.0,4.0,4.5,2.5,3.0,3.0
55653,4.0,4.5,5.0,4.0,4.5,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0
57304,4.0,3.0,,2.5,4.0,2.5,,2.0,5.0,3.0,4.0,3.0,4.0,3.5,3.0
123465,5.0,2.5,2.0,3.5,5.0,5.0,4.0,2.0,4.5,3.0,4.0,4.5,2.0,4.5,5.0


In [16]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('userId')['rating'].count()
    
    unique_users = ratings.userId.unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = ratings.userId.map(user_to_index)
    
    unique_movies = ratings.movieId.unique()
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    new_movies = ratings.movieId.map(movie_to_index)
        
    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]
    
    X = pd.DataFrame({'user_id': new_users, 'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [17]:
(n, m), (X, y), (user_to_index, movie_to_index) = create_dataset(ratings)
print(f'Embeddings: {n} users, {m} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 200948 users, 84432 movies
Dataset shape: (32000204, 2)
Target shape: (32000204,)


# Creating the Dataloader

In [18]:
class ReviewsIterator:
    
    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
            
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_batches = int(math.ceil(X.shape[0] // batch_size))
        self._current = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        return self.next()
    
    def next(self):
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [19]:
def batches(X, y, bs=32, shuffle=True):
    for xb, yb in ReviewsIterator(X, y, bs, shuffle):
        xb = torch.LongTensor(xb)
        yb = torch.FloatTensor(yb)
        yield xb, yb.view(-1, 1) 

In [None]:
for x_batch, y_batch in batches(X, y, bs=4):
    print(x_batch)
    print(y_batch)
    break

tensor([[42214,   556],
        [30845,    65],
        [60687,   292],
        [ 2000,  1206]])
tensor([[3.5000],
        [4.0000],
        [5.0000],
        [2.0000]])


In [21]:
# Time-based split: 20% newest data for ETL pipeline testing
timestamp_threshold = ratings['timestamp'].quantile(0.8)
print(f"Timestamp threshold (80th percentile): {timestamp_threshold}")

# Split by timestamp
older_data_mask = ratings['timestamp'] <= timestamp_threshold
newer_data_mask = ratings['timestamp'] > timestamp_threshold

# Get indices for X and y
older_indices = X.index[older_data_mask]
newer_indices = X.index[newer_data_mask]

# Create newer dataset (20% most recent)
X_newer = X.loc[newer_indices].reset_index(drop=True)
y_newer = y.loc[newer_indices].reset_index(drop=True)

# Create older dataset (80% older data)
X_older = X.loc[older_indices].reset_index(drop=True)
y_older = y.loc[older_indices].reset_index(drop=True)

print(f"Newer data (for ETL pipeline): {len(X_newer):,} samples")
print(f"Older data (for train/val split): {len(X_older):,} samples")

# Now do normal train/test split on the older data
X_train, X_valid, y_train, y_valid = train_test_split(
   X_older, y_older, test_size=0.2, random_state=RANDOM_STATE
)

# Organize datasets
datasets = {
   'train': (X_train, y_train),
   'val': (X_valid, y_valid),
   'newer': (X_newer, y_newer)  # For ETL pipeline testing
}

dataset_sizes = {
'train': len(X_train),
   'val': len(X_valid),
   'newer': len(X_newer)
}

print(f"\nFinal split sizes:")
print(f"Training: {dataset_sizes['train']:,}")
print(f"Validation: {dataset_sizes['val']:,}")
print(f"Newer (ETL): {dataset_sizes['newer']:,}")
print(f"Total: {sum(dataset_sizes.values()):,}")

Timestamp threshold (80th percentile): 1538551302.0
Newer data (for ETL pipeline): 6,400,041 samples
Older data (for train/val split): 25,600,163 samples

Final split sizes:
Training: 20,480,130
Validation: 5,120,033
Newer (ETL): 6,400,041
Total: 32,000,204


# Export Training Data

In [22]:
# Export ONLY the training datasets to CSV files for cloud training       
# This ensures NO data leakage from the newer (ETL test) dataset

print("Exporting training datasets for cloud deployment...")

# Export training set
train_df = pd.DataFrame({
    'user_id': X_train.iloc[:, 0],
    'movie_id': X_train.iloc[:, 1],
    'rating': y_train
})

# Export validation set
val_df = pd.DataFrame({
    'user_id': X_valid.iloc[:, 0],
    'movie_id': X_valid.iloc[:, 1],
    'rating': y_valid
})

# Export to CSV files
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)

print(f"✅ Training data exported: {len(train_df):,} samples")
print(f"✅ Validation data exported: {len(val_df):,} samples")
print(f"🚨 Newer data (ETL test) kept separate: {len(X_newer):,} samples")

# Also save the mappings for later use
import pickle

mappings = {
   'user_to_index': user_to_index,
   'movie_to_index': movie_to_index,
   'n_users': n,
   'n_movies': m,
   'minmax': minmax
}

with open('data_mappings.pkl', 'wb') as f:
    pickle.dump(mappings, f)

print("✅ Data mappings saved for inference")
print("\n📁 Files ready for upload to cloud:")
print("- train_data.csv")
print("- val_data.csv")
print("- data_mappings.pkl")
print("- cloud_training.py")
print("- requirements_cloud.txt")
print("- setup_runpod.sh")

Exporting training datasets for cloud deployment...
✅ Training data exported: 20,480,130 samples
✅ Validation data exported: 5,120,033 samples
🚨 Newer data (ETL test) kept separate: 6,400,041 samples
✅ Data mappings saved for inference

📁 Files ready for upload to cloud:
- train_data.csv
- val_data.csv
- data_mappings.pkl
- cloud_training.py
- requirements_cloud.txt
- setup_runpod.sh
