In [1]:
import os
import math
import copy
from itertools import zip_longest

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim

In [3]:
def set_random_seed(state=1):
    """Seed every random number generator this notebook can draw from.

    The same seed is handed to Python's ``random`` module, NumPy's global
    generator, PyTorch's default generator and every CUDA device, so that
    random draws are repeatable across runs.

    Args:
        state: Integer seed passed unchanged to each generator.
    """
    # Local import: the notebook's import cell does not bring in ``random``.
    import random

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        # Seeds all GPUs rather than only the current one; it is silently
        # ignored when CUDA is not available.
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(state)

In [4]:
# Single seed shared by the RNG seeding here and by the train/validation
# splits later in the notebook (passed there as ``random_state``).
RANDOM_STATE = 42
set_random_seed(RANDOM_STATE)

In [5]:
# URL of the MovieLens 32M zip archive; the archive's file name is derived
# from the last path segment of this link when downloading.
DATASET_LINK='https://files.grouplens.org/datasets/movielens/ml-32m.zip'

In [11]:
import urllib.request
import zipfile


# Download the MovieLens 32M archive if it is not already present, then
# extract whichever archive members are still missing from the working
# directory. The dataset is used for building recommendation systems.
zip_filename = DATASET_LINK.split('/')[-1]

# Download the dataset if not already present
if not os.path.exists(zip_filename):
    # No custom SSL context is needed: urlretrieve does not accept one and
    # uses urllib's default HTTPS handling.
    urllib.request.urlretrieve(DATASET_LINK, zip_filename)

# Unzip only the members that are not already extracted, so re-running the
# cell does not rewrite the whole dataset.
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    missing_members = [
        member for member in zip_ref.namelist()
        if not os.path.exists(member)
    ]
    if missing_members:
        zip_ref.extractall(members=missing_members)

In [14]:
def read_data(path):
    """Load the MovieLens ratings and movies tables from a directory.

    Only files whose stem is ``ratings`` or ``movies`` are parsed: ``.csv``
    files are read with their own header, ``.dat`` files are read as
    ``::``-separated text with the column names supplied here. Every other
    file is listed but skipped, because only these two tables are returned.

    Args:
        path: Directory containing the dataset files.

    Returns:
        Tuple ``(ratings, movies)`` of DataFrames.

    Raises:
        KeyError: If no ratings file or no movies file was found in ``path``.
    """
    dat_columns = {
        'ratings': ['userId', 'movieId', 'rating', 'timestamp'],
        'movies': ['movieId', 'title', 'genres'],
    }
    files = {}
    for filename in os.listdir(path):
        stem, suffix = os.path.splitext(filename)
        print(filename)
        if stem not in dat_columns:
            # Not one of the returned tables: do not spend time/memory on it.
            continue
        file_path = os.path.join(path, filename)
        if suffix == '.csv':
            files[stem] = pd.read_csv(file_path)
        elif suffix == '.dat':
            # A multi-character separator requires the python parsing engine.
            files[stem] = pd.read_csv(
                file_path, sep='::', names=dat_columns[stem], engine='python')

    missing = [name for name in dat_columns if name not in files]
    if missing:
        raise KeyError(
            f"No .csv or .dat file found in {path!r} for: {', '.join(missing)}")
    return files['ratings'], files['movies']

In [15]:
# Use existing data in data/raw directory (skip download/extraction)
extracted_dir = '/Users/nolanrobbins/Desktop/VS Code Projects/MovieLens-RecSys/data/raw'
print(f"Using existing data from: {extracted_dir}")

# Read data from the raw directory
ratings, movies = read_data(extracted_dir)

Using existing data from: /Users/nolanrobbins/Desktop/VS Code Projects/MovieLens-RecSys/data/raw


FileNotFoundError: [WinError 3] The system cannot find the path specified: '/Users/nolanrobbins/Desktop/VS Code Projects/MovieLens-RecSys/data/raw'

In [42]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [43]:
# (min, max) of the observed rating values; the tuple is stored later in the
# exported data mappings under the 'minmax' key.
minmax = ratings.rating.min(), ratings.rating.max()
print(f"Rating range: {minmax}")

Rating range: (np.float64(0.5), np.float64(5.0))


In [44]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
movies.drop_duplicates().shape

(87585, 3)

In [46]:
# Attach each movie's title to its ratings. Any existing ``title`` column is
# dropped first so that re-running this cell does not create
# ``title_x`` / ``title_y`` duplicates.
ratings = ratings.drop(columns="title", errors="ignore").merge(
    movies[["movieId", "title"]], on="movieId"
)

In [47]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,17,4.0,944249077,Sense and Sensibility (1995)
1,1,25,1.0,944250228,Leaving Las Vegas (1995)
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p..."
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)


In [48]:
def tabular_preview(ratings, n=15):
    """Creates a cross-tabular view of users vs movies.

    Keeps only the ``n`` users with the most ratings and the ``n`` movies with
    the most ratings, then pivots those rows into a user-by-movie table.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        n: Number of top users and of top movies to keep.

    Returns:
        DataFrame indexed by ``userId`` with one column per ``movieId``; each
        cell holds the sum of that user's ratings for that movie.
    """
    user_counts = ratings.groupby('userId')['rating'].count()
    top_users = user_counts.sort_values(ascending=False)[:n].index

    movie_counts = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_counts.sort_values(ascending=False)[:n].index

    # A boolean filter selects the same rows as inner-joining on both count
    # tables, without adding (duplicate) count columns to the result.
    top = ratings[
        ratings['userId'].isin(top_users) & ratings['movieId'].isin(top_movies)
    ]

    # The string 'sum' avoids the pandas FutureWarning raised when a NumPy
    # callable such as np.sum is passed as ``aggfunc``.
    return pd.crosstab(top.userId, top.movieId, top.rating, aggfunc='sum')

In [49]:
tabular_preview(ratings, n=15)

  return pd.crosstab(top.userId, top.movieId, top.rating, aggfunc=np.sum)


movieId,1,50,110,260,296,318,356,480,527,589,593,1196,2571,2959,4993
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7858,5.0,5.0,3.5,4.5,5.0,5.0,2.0,3.0,5.0,3.5,2.5,5.0,5.0,4.0,5.0
10202,5.0,5.0,1.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0
14674,3.0,4.5,5.0,5.0,2.0,2.5,5.0,2.0,3.5,5.0,3.0,5.0,4.0,3.5,5.0
17035,3.5,5.0,4.0,0.5,5.0,5.0,3.5,3.0,4.5,3.5,5.0,0.5,1.5,4.5,3.0
22744,5.0,4.0,4.0,5.0,5.0,3.0,4.0,5.0,0.5,5.0,5.0,5.0,5.0,5.0,5.0
49305,5.0,4.5,4.0,5.0,5.0,4.5,4.5,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.5
53192,,4.0,3.5,5.0,3.5,4.5,4.0,3.0,4.0,3.0,4.0,4.5,2.5,3.0,3.0
55653,4.0,4.5,5.0,4.0,4.5,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0
57304,4.0,3.0,,2.5,4.0,2.5,,2.0,5.0,3.0,4.0,3.0,4.0,3.5,3.0
123465,5.0,2.5,2.0,3.5,5.0,5.0,4.0,2.0,4.5,3.0,4.0,4.5,2.0,4.5,5.0


In [50]:
def create_dataset(ratings, top=None):
    """Re-index users and movies to contiguous integer ids.

    Ids are assigned in order of first appearance in ``ratings``, starting
    from 0, separately for users and for movies.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        top: Accepted for backward compatibility; it has no effect on the
            result (it never had one: the value it guarded was discarded).

    Returns:
        A 3-tuple:
        ``(n_users, n_movies)`` -- number of distinct users and movies;
        ``(X, y)`` -- ``X`` is a DataFrame with ``user_id`` / ``movie_id``
        index columns, ``y`` the ratings as float32;
        ``(user_to_index, movie_to_index)`` -- dicts from original id to index.
    """
    def reindex(column):
        # unique() keeps first-appearance order, which defines the new ids.
        unique_values = column.unique()
        value_to_index = {old: new for new, old in enumerate(unique_values)}
        return unique_values.shape[0], value_to_index, column.map(value_to_index)

    n_users, user_to_index, new_users = reindex(ratings.userId)
    n_movies, movie_to_index, new_movies = reindex(ratings.movieId)

    X = pd.DataFrame({'user_id': new_users, 'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [51]:
# Build the re-indexed dataset: n / m are the user and movie counts, X holds
# the (user_id, movie_id) index pairs, y the float32 ratings, and the two
# dicts map original ids to those indices.
(n, m), (X, y), (user_to_index, movie_to_index) = create_dataset(ratings)
print(f'Embeddings: {n} users, {m} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 200948 users, 84432 movies
Dataset shape: (32000204, 2)
Target shape: (32000204,)


# Creating the Dataloader

In [52]:
class ReviewsIterator:
    """Single-pass iterator over ``(X, y)`` mini-batches as NumPy arrays.

    The inputs are converted to arrays once at construction and, when
    ``shuffle`` is true, permuted together with ``np.random.permutation``.
    Every row is yielded exactly once; the last batch is shorter when the row
    count is not a multiple of ``batch_size``.

    Args:
        X: Array-like of features; rows are samples.
        y: Array-like of targets, aligned with ``X`` row by row.
        batch_size: Maximum number of rows per batch.
        shuffle: Whether to shuffle the rows before batching.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)

        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division before the ceil: floor division here would discard
        # the trailing partial batch.
        self.n_batches = int(math.ceil(X.shape[0] / batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next ``(X_batch, y_batch)`` pair or raise StopIteration."""
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [53]:
def batches(X, y, bs=32, shuffle=True):
    """Yield mini-batches of ``(X, y)`` as PyTorch tensors.

    Batches are produced by ``ReviewsIterator``; each feature batch becomes a
    ``LongTensor`` and each target batch a ``FloatTensor`` reshaped into a
    single column.

    Args:
        X: Features, forwarded to ``ReviewsIterator``.
        y: Targets, forwarded to ``ReviewsIterator``.
        bs: Batch size.
        shuffle: Whether the iterator shuffles the rows first.
    """
    iterator = ReviewsIterator(X, y, bs, shuffle)
    for features, targets in iterator:
        features = torch.LongTensor(features)
        targets = torch.FloatTensor(targets)
        yield features, targets.view(-1, 1)

In [54]:
# Smoke test: draw one batch of 4 rows, print its feature and target tensors,
# and stop after the first batch.
for x_batch, y_batch in batches(X, y, bs=4):
    print(x_batch)
    print(y_batch)
    break

tensor([[ 66953,   5323],
        [  9876,   1582],
        [ 38347,    968],
        [101951,    517]])
tensor([[5.],
        [4.],
        [2.],
        [1.]])


In [None]:
# Load data from movielens_past.inter for training (80% chronological past data)
print("Loading training data from movielens_past.inter...")


def _load_inter_split(path):
    """Read a tab-separated .inter file and return ``(frame, X, y)``.

    Two header layouts are written elsewhere in this notebook and both are
    accepted here:

    * ``userId / movieId / rating / timestamp`` with raw MovieLens ids, which
      are translated through ``user_to_index`` / ``movie_to_index``;
    * ``user_id:token / item_id:token / rating:float / timestamp:float`` with
      ids that were already re-indexed when the file was written.

    Rows whose ids cannot be mapped are dropped (and reported) so the index
    columns stay integer-typed instead of silently becoming floats with NaN.
    """
    frame = pd.read_csv(path, sep='\t')
    # Drop the ":type" suffixes so both layouts can be inspected by name.
    frame.columns = [str(col).split(':')[0] for col in frame.columns]

    if 'userId' in frame.columns:
        user_ids = frame['userId'].map(user_to_index)
        movie_ids = frame['movieId'].map(movie_to_index)
    else:
        # NOTE(review): assumes these indices were produced with the same
        # mappings as the ones currently in scope -- confirm.
        user_ids, movie_ids = frame['user_id'], frame['item_id']

    known = user_ids.notna() & movie_ids.notna()
    n_dropped = int((~known).sum())
    if n_dropped:
        print(f"Dropped {n_dropped:,} interactions with unmapped user/movie ids")

    features = pd.DataFrame({
        'user_id': user_ids[known].astype(np.int64),
        'movie_id': movie_ids[known].astype(np.int64),
    }).reset_index(drop=True)
    target = frame.loc[known, 'rating'].astype(np.float32).reset_index(drop=True)
    return frame, features, target


# Check if the movielens_past.inter file exists, if not use original approach
past_file_path = 'data/processed/movielens_past.inter'
if os.path.exists(past_file_path):
    # NOTE(review): this branch does not define older_indices / newer_indices,
    # which the export cell further down in the notebook reads.
    # Load from the chronologically split past data
    past_ratings_df, X_past, y_past = _load_inter_split(past_file_path)
    print(f"Loaded {len(past_ratings_df):,} past interactions from {past_file_path}")

    # Split past data into train/validation (80/20 of the past data)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_past, y_past, test_size=0.2, random_state=RANDOM_STATE
    )

    # Load future data for ETL pipeline testing
    future_file_path = 'data/processed/movielens_future.inter'
    if os.path.exists(future_file_path):
        future_ratings_df, X_future, y_future = _load_inter_split(future_file_path)
        print(f"Loaded {len(future_ratings_df):,} future interactions for ETL pipeline")
    else:
        print("Warning: movielens_future.inter not found, using empty future dataset")
        X_future, y_future = pd.DataFrame(), pd.Series(dtype=np.float32)

else:
    # Fallback to original timestamp-based split approach
    print("movielens_past.inter not found, using original timestamp-based split...")

    # Time-based split: 20% newest data for ETL pipeline testing
    timestamp_threshold = ratings['timestamp'].quantile(0.8)
    print(f"Timestamp threshold (80th percentile): {timestamp_threshold}")

    # Split by timestamp
    older_data_mask = ratings['timestamp'] <= timestamp_threshold
    newer_data_mask = ratings['timestamp'] > timestamp_threshold

    # Get indices for X and y (X shares its index with ``ratings``)
    older_indices = X.index[older_data_mask]
    newer_indices = X.index[newer_data_mask]

    # Create newer dataset (20% most recent)
    X_future = X.loc[newer_indices].reset_index(drop=True)
    y_future = y.loc[newer_indices].reset_index(drop=True)

    # Create older dataset (80% older data)
    X_older = X.loc[older_indices].reset_index(drop=True)
    y_older = y.loc[older_indices].reset_index(drop=True)

    print(f"Future data (for ETL pipeline): {len(X_future):,} samples")
    print(f"Older data (for train/val split): {len(X_older):,} samples")

    # Now do normal train/test split on the older data
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_older, y_older, test_size=0.2, random_state=RANDOM_STATE
    )

# Organize datasets
datasets = {
    'train': (X_train, y_train),
    'val': (X_valid, y_valid),
    'future': (X_future, y_future)  # For ETL pipeline testing
}

dataset_sizes = {
    'train': len(X_train),
    'val': len(X_valid),
    'future': len(X_future)
}

print(f"\nFinal split sizes:")
print(f"Training: {dataset_sizes['train']:,}")
print(f"Validation: {dataset_sizes['val']:,}")
print(f"Future (ETL): {dataset_sizes['future']:,}")
print(f"Total: {sum(dataset_sizes.values()):,}")

# Verify data loading
print(f"\nData verification:")
print(f"✅ Training data ready from chronologically split past data")
print(f"✅ Validation data split from training data")
print(f"✅ Future data ready for ETL pipeline testing")
print(f"✅ Using movielens.inter format: userId, movieId, rating, timestamp")

# Export Training Data

In [None]:
# STEP 1: Generate missing data files for SS4Rec training
# First check what data we actually have and create the missing files
import pickle

print("🚀 STEP 1: Generating missing data files for SS4Rec training...")

# Header line written at the top of every .inter file produced below.
RECBOLE_INTER_HEADER = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']


def _write_recbole_inter(frame, path):
    """Write ``frame`` as a tab-separated .inter file and return the table.

    Selects the re-indexed id columns plus rating and timestamp, renames them
    to ``user_id`` / ``item_id``, sorts by user and timestamp, and writes the
    rows under ``RECBOLE_INTER_HEADER``.
    """
    inter = frame[['user_idx', 'movie_idx', 'rating', 'timestamp']].copy()
    inter.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # Sort by user and timestamp (critical for RecBole)
    inter = inter.sort_values(['user_id', 'timestamp']).reset_index(drop=True)
    # Passing the path (rather than an already-open text handle) lets pandas
    # manage line endings itself; a handle opened without newline='' gets
    # doubled carriage returns on Windows.
    inter.to_csv(path, sep='\t', index=False, header=RECBOLE_INTER_HEADER)
    return inter


# Check what data files actually exist
print("🔍 Checking existing data files...")
data_files = {
    'raw_data': 'data/raw',
    'processed_train': 'data/processed/train_data.csv',
    'processed_val': 'data/processed/val_data.csv',
    'recbole_inter': 'data/recbole_format/movielens/movielens.inter'
}

for name, path in data_files.items():
    exists = os.path.exists(path)
    status = "✅" if exists else "❌"
    print(f"{status} {name}: {path}")

# Flipped to True only after every output file has been written, so the
# closing banner cannot claim success when nothing was generated.
step1_ok = False

# Check if we have raw MovieLens data
raw_data_dir = data_files['raw_data']
if os.path.exists(raw_data_dir):
    print(f"\n📁 Raw data directory exists: {raw_data_dir}")
    raw_files = os.listdir(raw_data_dir)
    print(f"📋 Raw files: {raw_files}")

    # Look for ratings file: first regular file whose name mentions "rating"
    ratings_file = next(
        (
            os.path.join(raw_data_dir, file)
            for file in raw_files
            if 'rating' in file.lower()
            and os.path.isfile(os.path.join(raw_data_dir, file))
        ),
        None,
    )

    if ratings_file:
        print(f"✅ Found ratings file: {ratings_file}")

        # Load and process the raw ratings data.
        # NOTE: this rebinds the notebook-level `ratings`, `user_to_index` and
        # `movie_to_index` created by earlier cells.
        print("📊 Loading raw ratings data...")
        if ratings_file.endswith('.dat'):
            # MovieLens .dat format
            ratings = pd.read_csv(ratings_file, sep='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')
        else:
            # CSV format
            ratings = pd.read_csv(ratings_file)

        print(f"✅ Loaded {len(ratings):,} raw interactions")
        print(f"📋 Columns: {list(ratings.columns)}")
        print("📋 Sample data:")
        print(ratings.head())

        # Create user and movie mappings (ids numbered by first appearance)
        unique_users = ratings['userId'].unique()
        unique_movies = ratings['movieId'].unique()
        user_to_index = {old: new for new, old in enumerate(unique_users)}
        movie_to_index = {old: new for new, old in enumerate(unique_movies)}

        print(f"📊 Users: {len(unique_users):,}, Movies: {len(unique_movies):,}")

        # Convert to indexed format
        ratings['user_idx'] = ratings['userId'].map(user_to_index)
        ratings['movie_idx'] = ratings['movieId'].map(movie_to_index)

        # Time-based split: 80% for training, 20% for future ETL
        timestamp_threshold = ratings['timestamp'].quantile(0.8)
        print(f"📅 Split timestamp (80th percentile): {timestamp_threshold}")

        # Split chronologically
        past_mask = ratings['timestamp'] <= timestamp_threshold
        future_mask = ratings['timestamp'] > timestamp_threshold

        past_data = ratings[past_mask].copy()
        future_data = ratings[future_mask].copy()

        print(f"📊 Past data (training): {len(past_data):,} interactions ({len(past_data)/len(ratings)*100:.1f}%)")
        print(f"📊 Future data (ETL pipeline): {len(future_data):,} interactions ({len(future_data)/len(ratings)*100:.1f}%)")

        # Create processed directory
        os.makedirs('data/processed', exist_ok=True)

        # Save processed CSV files (val_data.csv receives the future split)
        past_data.to_csv('data/processed/train_data.csv', index=False)
        future_data.to_csv('data/processed/val_data.csv', index=False)

        print("✅ Created train_data.csv and val_data.csv")

        # Now create RecBole format files
        print("\n🔄 Creating RecBole format files...")

        # Create RecBole directory structure
        recbole_dir = 'data/recbole_format/movielens'
        os.makedirs(recbole_dir, exist_ok=True)

        # Create the main .inter file (all data for RecBole)
        inter_file = os.path.join(recbole_dir, 'movielens.inter')
        recbole_data = _write_recbole_inter(ratings, inter_file)

        print(f"✅ Created {inter_file}: {len(recbole_data):,} interactions")

        # Create chronologically split .inter files for SS4Rec
        past_file = 'data/processed/movielens_past.inter'
        future_file = 'data/processed/movielens_future.inter'
        past_recbole = _write_recbole_inter(past_data, past_file)
        future_recbole = _write_recbole_inter(future_data, future_file)

        print(f"✅ Created {past_file}: {len(past_recbole):,} interactions")
        print(f"✅ Created {future_file}: {len(future_recbole):,} interactions")

        # Save mappings
        mappings = {
            'user_to_index': user_to_index,
            'movie_to_index': movie_to_index,
            'n_users': len(unique_users),
            'n_movies': len(unique_movies)
        }

        with open('data/processed/data_mappings.pkl', 'wb') as f:
            pickle.dump(mappings, f)

        print("✅ Saved data mappings")
        step1_ok = True

    else:
        print("❌ No ratings file found in raw data")
        print("💡 Please ensure MovieLens raw data is available in data/raw/")

else:
    print("❌ Raw data directory not found: data/raw")
    print("💡 Please download MovieLens data to data/raw/ directory")

if step1_ok:
    print("\n🎯 STEP 1 COMPLETE: Data files generated!")
    print("✅ train_data.csv - Processed training data")
    print("✅ val_data.csv - Processed validation data")
    print("✅ movielens.inter - RecBole format (complete dataset)")
    print("✅ movielens_past.inter - RecBole format (training data)")
    print("✅ movielens_future.inter - RecBole format (ETL pipeline data)")
else:
    print("\n⚠️ STEP 1 INCOMPLETE: no data files were generated (see messages above)")


In [None]:
# STEP 2: Verify data schema is correct for RecBole
# The data should already be in the correct format, but let's verify

print("🔧 STEP 2: Verifying data schema for RecBole compatibility...")

# Check the schema of the generated files
past_file = 'data/processed/movielens_past.inter'
future_file = 'data/processed/movielens_future.inter'

# Header the .inter files are expected to carry, and the renames that turn
# the plain userId/movieId header into it.
expected_columns = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']
column_mapping = {
    'userId': 'user_id:token',
    'movieId': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float'
}

# Set only when the schema was confirmed (or repaired and re-checked), so the
# closing banner reflects what actually happened.
step2_ok = False

if os.path.exists(past_file):
    print(f"📋 Checking schema of {past_file}:")
    past_sample = pd.read_csv(past_file, sep='\t', nrows=5)
    actual_columns = list(past_sample.columns)
    print(f"Columns: {actual_columns}")
    print("Sample data:")
    print(past_sample)

    # Verify the schema matches RecBole requirements
    if actual_columns == expected_columns:
        print("✅ Schema is correct for RecBole!")
        print("✅ Field names and types match RecBole requirements")
        step2_ok = True
    else:
        print("❌ Schema mismatch detected!")
        print(f"Expected: {expected_columns}")
        print(f"Actual: {actual_columns}")

        # Fix the schema if needed. rename() ignores mapping keys that are
        # not present, so each file is renamed on its own columns.
        print("🔧 Fixing schema...")
        past_data = pd.read_csv(past_file, sep='\t').rename(columns=column_mapping)
        past_data.to_csv(past_file, sep='\t', index=False)
        schema_fixed = list(past_data.columns) == expected_columns

        if os.path.exists(future_file):
            future_data = pd.read_csv(future_file, sep='\t').rename(columns=column_mapping)
            future_data.to_csv(future_file, sep='\t', index=False)
            schema_fixed = schema_fixed and list(future_data.columns) == expected_columns
        else:
            print(f"❌ {future_file} not found. Please run Step 1 first.")
            schema_fixed = False

        if schema_fixed:
            print("✅ Schema corrected and files updated!")
            step2_ok = True
        else:
            print("❌ Schema could not be fully corrected; inspect the files manually.")

else:
    print(f"❌ {past_file} not found. Please run Step 1 first.")

if step2_ok:
    print("🎯 STEP 2 COMPLETE: Data schema verified and corrected if needed!")
else:
    print("⚠️ STEP 2 INCOMPLETE: data schema could not be verified (see messages above)")


In [None]:
# STEP 3: Download official SS4Rec implementation
# This step prepares the official SS4Rec code for integration

print("📥 STEP 3: Downloading official SS4Rec implementation...")

import os
import shutil
import subprocess
import tempfile

SS4REC_REPO_URL = 'https://github.com/XiaoWei-i/SS4Rec.git'

# Set only once sequential_dataset.py has been copied into the project, so
# the closing banner cannot report a download that did not happen.
step3_ok = False

# Check if we're in the right directory
if not os.path.exists('runpod_entrypoint.sh'):
    print("❌ Not in MovieLens-RecSys directory. Please run this from the project root.")
else:
    print("✅ In correct directory")

    try:
        # A fresh, OS-appropriate temporary directory replaces a fixed /tmp
        # path; it is removed when the ``with`` block exits, even on errors.
        with tempfile.TemporaryDirectory(prefix='ss4rec_official_') as temp_dir:
            # Clone the official SS4Rec repository. Only the working tree is
            # needed, so a shallow clone is enough.
            print("📥 Cloning official SS4Rec repository...")
            result = subprocess.run(
                ['git', 'clone', '--depth', '1', SS4REC_REPO_URL, temp_dir],
                capture_output=True, text=True, timeout=300)

            if result.returncode == 0:
                print("✅ Successfully cloned official SS4Rec repository")

                # Check what files we got
                official_dataset = os.path.join(temp_dir, 'sequential_dataset.py')
                if os.path.exists(official_dataset):
                    print("✅ Found sequential_dataset.py in official implementation")

                    # Copy the official sequential_dataset.py to our project
                    target_dir = 'models/official_ss4rec'
                    os.makedirs(target_dir, exist_ok=True)

                    shutil.copy2(official_dataset, f"{target_dir}/sequential_dataset_official.py")
                    print(f"✅ Copied official sequential_dataset.py to {target_dir}/")

                    # Also copy other important files if they exist
                    important_files = ['SS4Rec.py', 'SS4Rec_sequential.py', 'README.md']
                    for file in important_files:
                        candidate = os.path.join(temp_dir, file)
                        if os.path.exists(candidate):
                            shutil.copy2(candidate, f"{target_dir}/{file}")
                            print(f"✅ Copied {file} to {target_dir}/")

                    print("🎯 Official SS4Rec files ready for integration!")
                    step3_ok = True

                else:
                    print("❌ sequential_dataset.py not found in official repository")
                    print("Available files:")
                    for item in os.listdir(temp_dir):
                        print(f"  - {item}")

            else:
                print(f"❌ Failed to clone repository: {result.stderr}")

        print("🗑️ Cleaned up temporary directory")

    except subprocess.TimeoutExpired:
        print("❌ Git clone timed out. Please check your internet connection.")
    except Exception as e:
        print(f"❌ Error downloading official SS4Rec: {e}")

if step3_ok:
    print("🎯 STEP 3 COMPLETE: Official SS4Rec implementation downloaded!")
    print("📁 Check models/official_ss4rec/ for the official files")
    print("💡 Next: Integrate official sequential_dataset.py with RecBole")
else:
    print("⚠️ STEP 3 INCOMPLETE: official SS4Rec files were not downloaded (see messages above)")


In [None]:
# STEP 4: Test training setup and verify success criteria
# This step verifies that everything is ready for SS4Rec training

print("🧪 STEP 4: Testing training setup and verifying success criteria...")

# Check all success criteria from NEXT_STEPS.md
success_criteria = {
    "movielens_past.inter exists": False,
    "movielens_future.inter exists": False,
    "RecBole dataset loads without errors": False,
    "Official SS4Rec files available": False,
    "Training config ready": False
}

# Check 1: movielens_past.inter file exists with correct schema
past_file = 'data/processed/movielens_past.inter'
future_file = 'data/processed/movielens_future.inter'

if os.path.exists(past_file):
    success_criteria["movielens_past.inter exists"] = True
    print("✅ movielens_past.inter file exists")

    # Check file size
    file_size = os.path.getsize(past_file) / (1024*1024)  # MB
    print(f"   📊 File size: {file_size:.1f} MB")

    # Check schema (reported only; it does not affect the criterion)
    sample = pd.read_csv(past_file, sep='\t', nrows=1)
    expected_columns = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']
    if list(sample.columns) == expected_columns:
        print("   ✅ Schema is correct")
    else:
        print(f"   ❌ Schema mismatch: {list(sample.columns)}")
else:
    print("❌ movielens_past.inter file missing")

# Check 2: movielens_future.inter file exists
if os.path.exists(future_file):
    success_criteria["movielens_future.inter exists"] = True
    print("✅ movielens_future.inter file exists")

    file_size = os.path.getsize(future_file) / (1024*1024)  # MB
    print(f"   📊 File size: {file_size:.1f} MB")
else:
    print("❌ movielens_future.inter file missing")

# Check 3a: training config exists. This is checked on its own so a missing
# RecBole install does not hide an existing config file.
config_file = 'configs/official/ss4rec_official.yaml'
if os.path.exists(config_file):
    success_criteria["Training config ready"] = True
    print("✅ Training config file exists")
else:
    print("❌ Training config file missing")

# Check 3b: RecBole dataset loads without errors
try:
    import recbole
    # Imported under an alias: a plain ``create_dataset`` import would replace
    # the notebook's own create_dataset() helper defined earlier.
    from recbole.config import Config
    from recbole.data import create_dataset as recbole_create_dataset
    recbole_available = True
    print("✅ RecBole is available")
except ImportError:
    recbole_available = False
    print("❌ RecBole not available (will be installed on RunPod)")

if recbole_available:
    if success_criteria["Training config ready"]:
        try:
            # Try to load the dataset.
            # NOTE(review): assumes the YAML config names both the model and
            # the dataset, and that loading the full dataset here is
            # acceptable in time and memory -- confirm.
            recbole_config = Config(config_file_list=[config_file])
            recbole_create_dataset(recbole_config)
            success_criteria["RecBole dataset loads without errors"] = True
            print("✅ RecBole dataset loads without errors")
        except Exception as e:
            print(f"❌ Error loading RecBole: {e}")
    else:
        print("❌ RecBole dataset not loaded: training config file missing")

# Check 4: Official SS4Rec files available
official_dir = 'models/official_ss4rec'
if os.path.exists(official_dir):
    files = os.listdir(official_dir)
    if any('sequential_dataset' in f for f in files):
        success_criteria["Official SS4Rec files available"] = True
        print("✅ Official SS4Rec files available")
        print(f"   📁 Files: {files}")
    else:
        print("❌ Official SS4Rec files missing")
else:
    print("❌ Official SS4Rec directory missing")

# Summary
print("\n" + "="*50)
print("🎯 SUCCESS CRITERIA SUMMARY")
print("="*50)

for criterion, passed in success_criteria.items():
    status = "✅" if passed else "❌"
    print(f"{status} {criterion}")

all_passed = all(success_criteria.values())

print("="*50)
if all_passed:
    print("🎉 ALL SUCCESS CRITERIA MET!")
    print("🚀 Ready for SS4Rec training on RunPod!")
    print("💡 Run: ./runpod_entrypoint.sh --model ss4rec-official")
else:
    print("⚠️  Some criteria not met. Please address the issues above.")
    print("💡 Check NEXT_STEPS.md for detailed instructions")

print("🎯 STEP 4 COMPLETE: Training setup verification done!")


In [None]:
# Export ALL datasets with temporal features for training
import pickle

print("Exporting datasets with temporal features...")

# NOTE: older_indices / newer_indices are only defined by the timestamp-based
# fallback branch of the split cell earlier in the notebook; this cell needs
# that branch to have run.

# X_train / X_valid carry positions within the "older" subset (their index was
# reset before splitting), so translate them back to index labels of `ratings`.
train_mask_indices = older_indices[X_train.index]
val_mask_indices = older_indices[X_valid.index]
test_mask_indices = newer_indices

# Create datasets with full temporal information using original ratings data.
# The index objects above hold *labels*, so select with .loc; .iloc would only
# coincide with it while `ratings` keeps a default 0..n-1 index.
train_ratings = ratings.loc[train_mask_indices].copy()
val_ratings = ratings.loc[val_mask_indices].copy()
test_ratings = ratings.loc[test_mask_indices].copy()

print(f"Train ratings shape: {train_ratings.shape}")
print(f"Val ratings shape: {val_ratings.shape}")
print(f"Test ratings shape: {test_ratings.shape}")


def _add_index_columns(frame):
    """Add ``user_idx`` / ``movie_idx`` columns to ``frame`` in place."""
    frame['user_idx'] = frame['userId'].map(user_to_index)
    frame['movie_idx'] = frame['movieId'].map(movie_to_index)


def _add_temporal_columns(frame):
    """Add date, year, month and weekday columns derived from ``timestamp``."""
    # Timestamps are interpreted as seconds since the Unix epoch.
    rating_date = pd.to_datetime(frame['timestamp'], unit='s')
    frame['rating_date'] = rating_date
    frame['rating_year'] = rating_date.dt.year
    frame['rating_month'] = rating_date.dt.month
    frame['rating_weekday'] = rating_date.dt.weekday
    # Convert to string for CSV compatibility
    frame['rating_date'] = rating_date.dt.strftime('%Y-%m-%d %H:%M:%S')


# Add user/movie indices to all datasets
for df in (train_ratings, val_ratings, test_ratings):
    _add_index_columns(df)

# Add temporal features to all datasets
for name, df in [("train", train_ratings), ("val", val_ratings), ("test", test_ratings)]:
    print(f"Adding temporal features to {name} data...")
    _add_temporal_columns(df)

# Define consistent column order
columns = ['userId', 'movieId', 'rating', 'timestamp', 'user_idx', 'movie_idx',
           'rating_date', 'rating_year', 'rating_month', 'rating_weekday']

# Prepare final datasets
train_df = train_ratings[columns].copy()
val_df = val_ratings[columns].copy()
test_df = test_ratings[columns].copy()

# Create output directory
os.makedirs('data/processed', exist_ok=True)

# Export to CSV files with temporal features
train_df.to_csv('data/processed/train_data.csv', index=False)
val_df.to_csv('data/processed/val_data.csv', index=False)
test_df.to_csv('data/processed/test_data.csv', index=False)

print(f"✅ Training data exported: {len(train_df):,} samples")
print(f"✅ Validation data exported: {len(val_df):,} samples")
print(f"✅ Test data exported: {len(test_df):,} samples")

# Save mappings to processed directory
mappings = {
    'user_to_index': user_to_index,
    'movie_to_index': movie_to_index,
    'n_users': n,
    'n_movies': m,
    'minmax': minmax
}

with open('data/processed/data_mappings.pkl', 'wb') as f:
    pickle.dump(mappings, f)

print("✅ Data mappings saved to data/processed/")
print(f"\nFinal column format: {list(train_df.columns)}")
print(f"All datasets now have consistent temporal features!")
print(f"Users: {n}, Movies: {m}")

# Create movielens.inter file with standard RecBole format
print("\nCreating movielens.inter file for RecBole...")

# Combine all data for complete dataset
full_dataset = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Keep the four interaction columns; the files below are written with this
# plain userId/movieId header and the original (not re-indexed) ids.
recbole_data = full_dataset[['userId', 'movieId', 'rating', 'timestamp']].copy()

# Sort by timestamp for proper chronological order
recbole_data = recbole_data.sort_values('timestamp').reset_index(drop=True)

# Export as movielens.inter (complete dataset)
recbole_data.to_csv('data/processed/movielens.inter', sep='\t', index=False)

print(f"✅ movielens.inter created with {len(recbole_data):,} interactions")
print(f"Format: userId\tmovieId\trating\ttimestamp")

# Now split the movielens.inter data chronologically: 80% past / 20% future
print("\nSplitting movielens.inter chronologically (80% past / 20% future)...")

# Calculate 80th percentile timestamp for chronological split
split_timestamp = recbole_data['timestamp'].quantile(0.8)
print(f"Split timestamp (80th percentile): {split_timestamp}")

# Split chronologically
past_data = recbole_data[recbole_data['timestamp'] <= split_timestamp].copy()
future_data = recbole_data[recbole_data['timestamp'] > split_timestamp].copy()

print(f"Past data (training): {len(past_data):,} interactions ({len(past_data)/len(recbole_data)*100:.1f}%)")
print(f"Future data (ETL pipeline): {len(future_data):,} interactions ({len(future_data)/len(recbole_data)*100:.1f}%)")

# Export chronologically split files
past_data.to_csv('data/processed/movielens_past.inter', sep='\t', index=False)
future_data.to_csv('data/processed/movielens_future.inter', sep='\t', index=False)

print(f"✅ movielens_past.inter created: {len(past_data):,} interactions (for SS4Rec training)")
print(f"✅ movielens_future.inter created: {len(future_data):,} interactions (for ETL pipeline)")

# Verify all files were created correctly
files_to_check = [
    'data/processed/movielens.inter',
    'data/processed/movielens_past.inter',
    'data/processed/movielens_future.inter'
]

print("\nFile verification:")
for filepath in files_to_check:
    if os.path.exists(filepath):
        file_size = os.path.getsize(filepath) / (1024*1024)  # MB
        print(f"✅ {filepath}: {file_size:.1f} MB")
    else:
        print(f"❌ Error: {filepath} was not created")

print(f"\nSample of past data (for training):")
print(past_data.head())
print(f"\nSample of future data (for ETL):")
print(future_data.head())