In [2]:
# Standard library imports
import os
import warnings

# Data processing and numerical libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import dask.dataframe as dd
from sklearn.model_selection import train_test_split

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and recommendation libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from transformers import AutoTokenizer, AutoModel

# MLflow for experiment tracking
import mlflow
import wandb

# IPython for displaying outputs
from IPython.display import display

# Suppress warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# # split data once - no temporal features
# train, temp = spark_df.randomSplit([0.75, 0.25], seed=42)
# val, test = temp.randomSplit([.15, .10], seed=42)
# # 

# # define file paths (relative to the current directory)
# model_data_path = "../../data/interim/"

# # save each DataFrame in parquet format
# train.write.parquet(os.path.join(model_data_path, f"train_set.parquet"), mode='overwrite')
# val.write.parquet(os.path.join(model_data_path, f"val_set.parquet"), mode='overwrite')
# test.write.parquet(os.path.join(model_data_path, f"test_set.parquet"), mode='overwrite')

# from pyspark.sql import SparkSession

# ========================================
# Open sessions for necessary packages
# ========================================
# spark = None

# def open_session(close=False):
#     global spark  # 
#     if not close:
#         if spark is None or spark.sparkContext is None:
#             spark = SparkSession.builder \
#                 .appName("ALS in Spark") \
#                 .getOrCreate()
#             # set up MLflow (only needs to be done once)
#     else:
#         if spark is not None:
#             spark.stop()
#             spark = None
            
# # open_session()
# open_session(close=True)

In [42]:
# READ DATA
# model_data_path = "../data/interim/"

# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# train.repartition(10)


In [31]:
# # define PySpark ALS model
# als = ALS(
#     userCol="user_index",
#     itemCol="bus_index",
#     ratingCol="rating",
#     coldStartStrategy="drop"
# )


# # # Grid search through hyperparameters
# # paramGrid = (ParamGridBuilder()
# #              .addGrid(als.rank, [5, 10, 15])
# #              .addGrid(als.maxIter, [5, 10, 20])
# #              .addGrid(als.regParam, [0.01, 0.1, 0.5])
# #              .build())

# # define criterion
# evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

# # define cross-validation w simple grid
# crossval = CrossValidator(
#     estimator=als,
#     evaluator=evaluator,
#     estimatorParamMaps=paramGrid,
#     numFolds=1
# )

In [38]:
# rdd = train.rdd
# partitions = rdd.glom().collect()
# for index, partition in enumerate(partitions):
#     print(f"Partition {index} contains {len(partition)} rows.")
#     if len(partition) > 0:
#         print(f"Sample data from partition {index}: {partition[:5]}")

In [39]:

# open log 
# mlflow.set_experiment("ALS_Hyperparameter_Tuning")

# log results
# with mlflow.start_run():
    
    # fit model using cross-validation
    # cv_model = crossval.fit(train)
    
    # Log the best model
    # best_model = cv_model.bestModel
    # mlflow.spark.log_model(best_model, "best_model")
    
    # # Log metrics and model parameters for each parameter combination
    # for param_map, metric in zip(crossval.getEstimatorParamMaps(), cv_model.avgMetrics):
    #     rank = param_map[als.rank]
    #     regParam = param_map[als.regParam]
    #     maxIter = param_map[als.maxIter]
        
    #     mlflow.log_param("rank", rank)
    #     mlflow.log_param("regParam", regParam)
    #     mlflow.log_param("maxIter", maxIter)
    #     mlflow.log_metric("validation_rmse", metric)

    # # Log validation scores
    # validation_predictions = best_model.transform(val)
    # validation_rmse = evaluator.evaluate(validation_predictions)
    # mlflow.log_metric("validation_rmse", validation_rmse)


# # Log test metrics (optional, after final model selection)
# test_predictions = best_model.transform(test)
# test_rmse = evaluator.evaluate(test_predictions)
# mlflow.log_metric("test_rmse", test_rmse)


In [7]:
final_df_path = "../data/processed/Final Dataframes/final_df.parquet"
tensor_path = "../data/processed/Final Tensors/"


# # Check if Mac GPU is available
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Check if nvidia gpu is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


print(f'Using device: {device}')



Using device: cpu


In [19]:
def split_and_save_data(dataframe, test_size=0.2, random_state=42):
    # split data
    train_df, temp_df = train_test_split(dataframe, test_size=test_size, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=.5, random_state=random_state)

    # create datasets
    train_dataset = CFDataset(train_df)
    val_dataset = CFDataset(val_df)
    test_dataset = CFDataset(test_df)

    # save datasets
    torch.save(train_dataset, os.path.join(tensor_path,'train_dataset.pt'))
    torch.save(val_dataset, os.path.join(tensor_path,'val_dataset.pt'))
    torch.save(test_dataset, os.path.join(tensor_path,'test_dataset.pt'))

    print("Datasets have been split and saved.")

def load_datasets(path):
    train_dataset = torch.load(os.path.join(path,'train_dataset.pt'))
    val_dataset = torch.load(os.path.join(path,'val_dataset.pt'))
    test_dataset = torch.load(os.path.join(path,'test_dataset.pt'))

    return train_dataset, val_dataset, test_dataset


# df = pd.read_parquet(final_df_path)

# split_and_save_data(df)

# Load the datasets
train_dataset, val_dataset, test_dataset = load_datasets(tensor_path)

# print(f"Train dataset size: {len(train_dataset)}")
# print(f"Validation dataset size: {len(val_dataset)}")
# print(f"Test dataset size: {len(test_dataset)}")

In [23]:
train_dataset.features['user_id'].shape[0]

3513735

In [24]:
train_dataset.features

{'user_id': tensor([135044, 251524, 160163,  ...,  84981,  69299, 141271]),
 'business_id': tensor([ 74868,  97998,  19175,  ...,  14868,  30678, 135453]),
 'city_id': tensor([62, 33, 81,  ...,  3,  3,  2]),
 'state_id': tensor([7, 5, 1,  ..., 3, 3, 2]),
 'region_id': tensor([2, 6, 4,  ..., 0, 0, 3]),
 'dotw': tensor([6, 1, 2,  ..., 0, 3, 5]),
 'doty': tensor([357, 136, 279,  ..., 261, 209, 115]),
 'pca_features': tensor([[ 2.7052, -0.8092, -0.8594, -0.7496,  0.4360],
         [ 0.7773,  1.0229, -1.2422,  0.2214,  1.0620],
         [-0.6204,  0.8865, -0.5544,  2.0033,  0.4713],
         ...,
         [ 1.2921,  0.1585, -0.0708,  0.7642,  0.6987],
         [ 2.4021,  0.7214, -0.7895,  0.6848,  0.4224],
         [ 0.8602,  0.6749,  0.1199,  1.0643, -0.9290]]),
 'tokens': tensor([[ 2035,  2115,  5088,  ..., 21849,  7600, 28663],
         [ 1045,  2562,  4531,  ...,  2000,  7370,  2011],
         [ 2057,  2245,  2057,  ...,  2183,  2220,  2000],
         ...,
         [ 6887,  2080,  3492,

In [29]:
# Concat user and business data into one tensor
total_user_ids = torch.concat((train_dataset.features['user_id'],
                               val_dataset.features['user_id'],
                               test_dataset.features['user_id']), dim=0)
total_business_ids = torch.concat((train_dataset.features['business_id'],
                                    val_dataset.features['business_id'],
                                    test_dataset.features['business_id']), dim=0)

# get number of unique entities
num_unique_users = torch.unique(total_user_ids).shape[0] 
num_unique_bus = torch.unique(total_business_ids).shape[0]

# output findings
print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique business's : {num_unique_bus}")

Number of unique users: 287116
Number of unique business's : 148523


In [13]:
class CFDataset(Dataset):
    def __init__(self, dataframe):
        self.features = {
            'user_id': torch.tensor(dataframe['user_num_id'].values, dtype=torch.long),
            'business_id': torch.tensor(dataframe['business_num_id'].values, dtype=torch.long),
            'city_id': torch.tensor(dataframe['city_code'].values, dtype=torch.long),
            'state_id': torch.tensor(dataframe['state_code'].values, dtype=torch.long),
            'region_id': torch.tensor(dataframe['region_code'].values, dtype=torch.long),
            'dotw': torch.tensor(dataframe['day_of_week'].values, dtype=torch.long),
            'doty': torch.tensor(dataframe['day_of_year'].values, dtype=torch.long),
            # 'numerical_features': torch.tensor(dataframe[['user_avg_rating_norm', 
            #                                               'bus_avg_rating_norm', 
            #                                               'log_business_review_count_norm', 
            #                                               'log_user_review_count_norm', 
            #                                               'years_yelp_member_norm', 
            #                                               'years_since_review_norm']]
            #                                    .values,
            #                                    dtype=torch.float),
            'pca_features': torch.tensor(dataframe[['pca_1', 'pca_2', 'pca_3', 
                                                    'pca_4', 'pca_5']].values, dtype=torch.float),
            'tokens': torch.tensor(dataframe['tokens'].tolist(), dtype=torch.long),
            'categories' : torch.tensor(dataframe['categories_enc'].tolist(), dtype=torch.long)
        }
        self.target = torch.tensor(dataframe['mean_centered_rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.features.items()}, self.target[idx]

In [10]:
class CFmodel(nn.Module):
    def __init__(self, vocab_size=30522, rank=32, num_users=287116, num_bus=148523, num_city=1273, 
                 num_regions=11, num_states=50, token_length=10, num_heads=4):
        super().__init__()
        self.rank = rank
        self.token_length = token_length

        # Embeddings
        self.user_emb = nn.Embedding(num_users, rank)
        self.bus_emb = nn.Embedding(num_bus, rank)
        self.city_emb = nn.Embedding(num_city, rank)
        self.state_emb = nn.Embedding(num_states, rank)
        self.region_emb = nn.Embedding(num_regions, rank)
        self.dotw_emb = nn.Embedding(7, rank)
        self.doty_emb = nn.Embedding(367, rank)
        self.token_emb = nn.Embedding(vocab_size, rank)

        # Layer for numerical features
        self.numerical_layer = nn.Sequential(
            nn.Linear(6, rank),
            nn.ReLU(),
            nn.BatchNorm1d(rank)
        )

        # Multi-head attention for combining embeddings
        self.multihead_attn = nn.MultiheadAttention(embed_dim=rank, num_heads=num_heads)
        
        # Final layers
        self.fc_layers = nn.Sequential(
            nn.Linear(rank, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 1)
        )

    def forward(self, **kwargs):
        # Process embeddings
        user_emb = self.user_emb(kwargs['user_id']).unsqueeze(0)
        bus_emb = self.bus_emb(kwargs['business_id']).unsqueeze(0)
        city_emb = self.city_emb(kwargs['city_id']).unsqueeze(0)
        state_emb = self.state_emb(kwargs['state_id']).unsqueeze(0)
        region_emb = self.region_emb(kwargs['region_id']).unsqueeze(0)
        dotw_emb = self.dotw_emb(kwargs['dotw']).unsqueeze(0)
        doty_emb = self.doty_emb(kwargs['doty']).unsqueeze(0)
        
        # Process numerical features
        numerical = self.numerical_layer(kwargs['numerical_features']).unsqueeze(0)
        
        # Process tokens (reviews)
        tokens_emb = self.token_emb(kwargs['tokens']).mean(dim=1).unsqueeze(0)  # Average pooling
        
        # Combine all features using multi-head attention
        combined_features = torch.cat([
            user_emb, bus_emb, city_emb, state_emb, region_emb, 
            dotw_emb, doty_emb, numerical, tokens_emb
        ], dim=0)
        
        attn_output, _ = self.multihead_attn(combined_features, combined_features, combined_features)
        
        # Average the attention output
        final_representation = attn_output.mean(dim=0)
        
        # Pass through final layers
        result = self.fc_layers(final_representation)
        
        return result.squeeze()

In [11]:
# # Hyperparameters
rank = 32


# Model / Optimizer Information 
batch_size = 64
lr = .001
dataset="yelp"
num_epochs = 1
betas = (0.9, 0.999)
eps=1e-08
weight_decay=.01 


# Create datasets
train_dataset = CFDataset(train)
val_dataset = CFDataset(val)
test_dataset = CFDataset(test)


# Load datasets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


# Initialize Model
model = CFmodel()
model.to(device)



CFmodel(
  (user_emb): Embedding(287116, 32)
  (bus_emb): Embedding(148523, 32)
  (city_emb): Embedding(1273, 32)
  (state_emb): Embedding(50, 32)
  (region_emb): Embedding(11, 32)
  (dotw_emb): Embedding(7, 32)
  (doty_emb): Embedding(367, 32)
  (token_emb): Embedding(30522, 32)
  (numerical_layer): Sequential(
    (0): Linear(in_features=6, out_features=32, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=32, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_f

In [13]:
# # initialize model and define optimization

criterion = nn.MSELoss()


optimizer = torch.optim.Adam(
    model.parameters(),    # Parameters of the model to optimize
    lr=lr,              # Learning rate (default is 0.001)
    betas=betas,    # Coefficients for computing running averages of gradient and its square
    eps=eps,             # Term added to the denominator to improve numerical stability
    weight_decay=weight_decay       # Weight decay (L2 penalty)
)

In [14]:
import wandb
# Initialize W&B
wandb.init(project="hyperparameter_tuning_project", config={
    # "hidden_layers": model.hidden_layers,
    "learning_rate": lr,
    "architecture": "CF",
    "dataset": "Yelp",
    "betas": betas,
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "rank": rank
    })


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mteddybytes[0m ([33mteddybytesorg[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [15]:

def to_device(data, device):
    if isinstance(data, dict):
        return {k: v.to(device) for k, v in data.items()}
    return data.to(device)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [10]:
import torch
import torch.profiler
import wandb
from torch.utils.tensorboard import SummaryWriter

# Initialize W&B
wandb.init(project="initial modeling")  # Specify your project name

# Initialize TensorBoard
tb_writer = SummaryWriter('runs/experiment_1')

# Initialize W&B to watch the model
wandb.watch(model, log="all")

# Profiling setup
profiler = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True
)

# Training loop with profiling
with profiler:
    for epoch in range(num_epochs):
        model.train()
        num_iter = 0
        for features, target in train_loader:
            features = to_device(features, device)
            target = to_device(target, device)

            optimizer.zero_grad()
            pred = model(**features)
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()

            # Log gradient norms for each layer
            total_norm = 0
            for p in model.parameters():
                if p.grad is not None:
                    param_norm = p.grad.data.norm(2)  # Compute L2 norm
                    total_norm += param_norm.item() ** 2

            total_norm = total_norm ** 0.5

            # Log to TensorBoard
            tb_writer.add_scalar('Loss/train', loss.item(), epoch * len(train_loader) + num_iter)
            tb_writer.add_scalar('Gradient Norms', total_norm, epoch * len(train_loader) + num_iter)

            # Log to W&B
            wandb.log({
                'epoch': epoch,
                'iteration': num_iter,
                'train_loss': loss.item(),
                'gradient_norms': total_norm
            })

            if num_iter % 250 == 0:
                # Validation loop
                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for val_features, val_target in val_loader:
                        val_features = to_device(val_features, device)
                        val_target = to_device(val_target, device)

                        val_pred = model(**val_features)
                        val_loss += criterion(val_pred, val_target).item()

                    val_loss /= len(val_loader)

                # Log metrics to TensorBoard
                tb_writer.add_scalar('Loss/validation', val_loss, epoch * len(train_loader) + num_iter)
                
                # Log metrics to W&B
                wandb.log({
                    'epoch': epoch,
                    'iteration': num_iter,
                    'train_loss': loss.item(),
                    'val_loss': val_loss
                })

                print(f"Epoch: {epoch}, Iter: {num_iter}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")

            num_iter += 1

# Finish profiling and print results
profiler.stop()
print(profiler.key_averages().table(sort_by="cuda_time_total"))

# Save the model and log as an artifact
model_path = "model.pth"
torch.save(model.state_dict(), model_path)

# Log the model as an artifact in W&B
artifact = wandb.Artifact('model', type='model')
artifact.add_file(model_path)
wandb.log_artifact(artifact)

# Finish the W&B run
wandb.finish()

# Close the TensorBoard writer
tb_writer.close()


Epoch: 0, Iter: 0, Train Loss: 1.9933, Val Loss: 1.4511
Epoch: 0, Iter: 250, Train Loss: 1.3015, Val Loss: 1.4440
Epoch: 0, Iter: 500, Train Loss: 1.4819, Val Loss: 1.4430


: 