In [1]:
# Standard library imports
import os
import warnings

# Data processing and numerical libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import dask.dataframe as dd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and recommendation libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from transformers import AutoTokenizer, AutoModel

# MLflow for experiment tracking
import mlflow

# IPython for displaying outputs
from IPython.display import display

# Suppress warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# # split data once - no temporal features
# train, temp = spark_df.randomSplit([0.75, 0.25], seed=42)
# val, test = temp.randomSplit([.15, .10], seed=42)
# # 

# # define file paths (relative to the current directory)
# model_data_path = "../../data/interim/"

# # save each DataFrame in parquet format
# train.write.parquet(os.path.join(model_data_path, f"train_set.parquet"), mode='overwrite')
# val.write.parquet(os.path.join(model_data_path, f"val_set.parquet"), mode='overwrite')
# test.write.parquet(os.path.join(model_data_path, f"test_set.parquet"), mode='overwrite')

# from pyspark.sql import SparkSession

# ========================================
# Open sessions for necessary packages
# ========================================
# spark = None

# def open_session(close=False):
#     global spark  # 
#     if not close:
#         if spark is None or spark.sparkContext is None:
#             spark = SparkSession.builder \
#                 .appName("ALS in Spark") \
#                 .getOrCreate()
#             # set up MLflow (only needs to be done once)
#     else:
#         if spark is not None:
#             spark.stop()
#             spark = None
            
# # open_session()
# open_session(close=True)

In [42]:
# READ DATA
# model_data_path = "../data/interim/"

# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# train.repartition(10)


In [31]:
# # define PySpark ALS model
# als = ALS(
#     userCol="user_index",
#     itemCol="bus_index",
#     ratingCol="rating",
#     coldStartStrategy="drop"
# )


# # # Grid search through hyperparameters
# # paramGrid = (ParamGridBuilder()
# #              .addGrid(als.rank, [5, 10, 15])
# #              .addGrid(als.maxIter, [5, 10, 20])
# #              .addGrid(als.regParam, [0.01, 0.1, 0.5])
# #              .build())

# # define criterion
# evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

# # define cross-validation w simple grid
# crossval = CrossValidator(
#     estimator=als,
#     evaluator=evaluator,
#     estimatorParamMaps=paramGrid,
#     numFolds=1
# )

In [38]:
# rdd = train.rdd
# partitions = rdd.glom().collect()
# for index, partition in enumerate(partitions):
#     print(f"Partition {index} contains {len(partition)} rows.")
#     if len(partition) > 0:
#         print(f"Sample data from partition {index}: {partition[:5]}")

In [39]:

# open log 
# mlflow.set_experiment("ALS_Hyperparameter_Tuning")

# log results
# with mlflow.start_run():
    
    # fit model using cross-validation
    # cv_model = crossval.fit(train)
    
    # Log the best model
    # best_model = cv_model.bestModel
    # mlflow.spark.log_model(best_model, "best_model")
    
    # # Log metrics and model parameters for each parameter combination
    # for param_map, metric in zip(crossval.getEstimatorParamMaps(), cv_model.avgMetrics):
    #     rank = param_map[als.rank]
    #     regParam = param_map[als.regParam]
    #     maxIter = param_map[als.maxIter]
        
    #     mlflow.log_param("rank", rank)
    #     mlflow.log_param("regParam", regParam)
    #     mlflow.log_param("maxIter", maxIter)
    #     mlflow.log_metric("validation_rmse", metric)

    # # Log validation scores
    # validation_predictions = best_model.transform(val)
    # validation_rmse = evaluator.evaluate(validation_predictions)
    # mlflow.log_metric("validation_rmse", validation_rmse)


# # Log test metrics (optional, after final model selection)
# test_predictions = best_model.transform(test)
# test_rmse = evaluator.evaluate(test_predictions)
# mlflow.log_metric("test_rmse", test_rmse)


In [48]:
# # Define paths
# model_data_path = "../data/interim/"

# # read data and convert to pandas DataFrames
# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# def convert_to_tensors(df):
#     ratings_tensor = torch.tensor(df['rating'].values, dtype=torch.float32)
#     user_id_tensor = torch.tensor(df['user_index'].values, dtype=torch.int64)
#     bus_id_tensor = torch.tensor(df['bus_index'].values, dtype=torch.int64)
#     return ratings_tensor, user_id_tensor, bus_id_tensor

# def save_tensors(prefix, **tensors):
#     for name, tensor in tensors.items():
#         file_path = os.path.join(model_data_path, f'{prefix}_{name}.pt')
#         torch.save(tensor, file_path)

# # Convert data to tensors
# train_tensors = convert_to_tensors(train)
# val_tensors = convert_to_tensors(val)
# test_tensors = convert_to_tensors(test)

# # Save tensors
# save_tensors('train', ratings=train_tensors[0], user_id=train_tensors[1], bus_id=train_tensors[2])
# save_tensors('val', ratings=val_tensors[0], user_id=val_tensors[1], bus_id=val_tensors[2])
# save_tensors('test', ratings=test_tensors[0], user_id=test_tensors[1], bus_id=test_tensors[2])

In [None]:
mac_data_path = "../data/interim/"

data_path = mac_data_path

# pd.read_parquet() requires a path; calling it with no arguments raises
# TypeError. Preview the training split instead.
# NOTE(review): assumes train_set.parquet (the name used by the commented-out
# Spark cells in this notebook) exists under data_path and that a parquet
# engine such as pyarrow is installed — confirm.
train_preview = pd.read_parquet(os.path.join(data_path, "train_set.parquet"))
train_preview.head()

In [2]:
# Directory holding the saved *.pt tensors, relative to the notebook.
mac_data_path = "../data/interim/"
data_path = mac_data_path

# Apple-silicon alternative (disabled):
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Use an NVIDIA GPU when CUDA reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'Using device: {device}')

def load_tensors(directory, prefix, device):
    """Load the ratings, user-id and business-id tensors for one data split.

    Reads ``{prefix}_ratings.pt``, ``{prefix}_user_id.pt`` and
    ``{prefix}_bus_id.pt`` from ``directory``.

    Args:
        directory: Folder containing the saved ``.pt`` files.
        prefix: Split name used as the file-name prefix (e.g. ``'train'``).
        device: ``torch.device`` (or device string) the tensors are placed on.

    Returns:
        Tuple ``(ratings, user_id, bus_id)`` of tensors on ``device``.
    """
    def _load(name):
        path = os.path.join(directory, f'{prefix}_{name}.pt')
        # map_location deserialises straight onto `device`, so a file saved
        # on a GPU machine still loads on a CPU-only host; the trailing
        # .to(device) is then a cheap safeguard.
        return torch.load(path, map_location=device).to(device)

    return _load('ratings'), _load('user_id'), _load('bus_id')

# Read every split from disk onto the selected device. Each line keeps both
# the (ratings, user_id, bus_id) tuple and the individually named tensors.
train_ratings, train_user_id, train_bus_id = train_tensors = load_tensors(data_path, 'train', device)
val_ratings, val_user_id, val_bus_id = val_tensors = load_tensors(data_path, 'val', device)
test_ratings, test_user_id, test_bus_id = test_tensors = load_tensors(data_path, 'test', device)




Using device: cpu


In [3]:
# Stack the id tensors of all three splits so the counts cover every split.
total_user_ids = torch.cat([train_user_id, val_user_id, test_user_id], dim=0)
total_business_ids = torch.cat([train_bus_id, val_bus_id, test_bus_id], dim=0)

# Number of distinct ids per entity type.
# NOTE(review): these counts are later used as embedding-table sizes, which
# presumably relies on the indices being contiguous 0..n-1 — confirm.
num_unique_users = total_user_ids.unique().size(0)
num_unique_bus = total_business_ids.unique().size(0)

# Report the counts.
print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique business's : {num_unique_bus}")

Number of unique users: 287116
Number of unique business's : 113502


In [4]:
class CustomDataset(Dataset):
    """Index-aligned dataset of (business, user, rating) triples.

    The three inputs are indexed with the same position; the dataset length
    is the length of ``ratings_data``.
    """

    def __init__(self, business_data, user_data, ratings_data, transform=None) -> None:
        """Store the three aligned sequences and an optional transform.

        Args:
            business_data: Indexable collection of business ids.
            user_data: Indexable collection of user ids.
            ratings_data: Indexable collection of ratings.
            transform: Optional callable ``(business, user, rating) ->
                (business, user, rating)`` applied to each sample.
        """
        self.business_data, self.user_data = business_data, user_data
        self.ratings_data = ratings_data
        self.transform = transform

    def __getitem__(self, index):
        """Return the ``(business, user, rating)`` triple at ``index``."""
        sample = (
            self.business_data[index],
            self.user_data[index],
            self.ratings_data[index],
        )
        if not self.transform:
            return sample

        # The transform must hand back exactly three values.
        business, user, rating = self.transform(*sample)
        return business, user, rating

    def __len__(self):
        """Return the number of samples (length of the ratings)."""
        return len(self.ratings_data)


In [5]:
# define model architecture

# initially starting with just a dot-product similarity (unnormalized, so not a true cosine similarity)
class MatrixFact(nn.Module):
    """Matrix-factorisation rating model with a learned affine output.

    Every user and every business gets a ``rank``-dimensional embedding. The
    score for a (user, business) pair is the dot product of the two
    embeddings (unnormalised, so it is not a cosine similarity) passed
    through a 1 -> 1 linear layer, i.e. a learned scale and bias.
    """

    def __init__ (self, rank, num_users, num_bus):
        """Create the embedding tables and the output layer.

        Args:
            rank: Embedding dimension.
            num_users: Number of rows in the user embedding table.
            num_bus: Number of rows in the business embedding table.
        """
        super().__init__()
        self.rank = rank
        self.user_emb = nn.Embedding(num_users, rank)
        self.bus_emb = nn.Embedding(num_bus, rank)
        self.fc1 = nn.Linear(1,1)

    def forward(self, user_id, business_id):
        """Predict one rating per (user, business) pair.

        Args:
            user_id: Integer tensor of user indices.
            business_id: Integer tensor of business indices, same shape as
                ``user_id``.

        Returns:
            Float tensor with the same shape as ``user_id``.
        """
        # look up the embedding vector of every id
        user_vec = self.user_emb(user_id)
        bus_vec = self.bus_emb(business_id)

        # calculate the dot product: element-wise product summed over the
        # embedding dimension (kept as a size-1 axis for the linear layer)
        dot = (user_vec * bus_vec).sum(dim=-1, keepdim=True)
        result = self.fc1(dot)

        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # collapse a batch of one to a 0-d tensor, which no longer matches a
        # (1,)-shaped target in the loss.
        return result.squeeze(-1)

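For the first iteration of the model, I will use only a dot-product similarity between the user and business embeddings (unnormalized, so not a true cosine similarity) and store the results in MLflow.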

In [6]:
# Hyperparameters
rank = 10            # embedding dimension handed to MatrixFact
batch_size = 1_000   # samples per batch for the DataLoader
num_epochs = 3_000   # upper bound on training-loop iterations

# Synthetic-data generator kept for reference (disabled):
# users = torch.randint(num_users, (data_size,), requires_grad=False)
# business = torch.randint(num_business, (data_size,), requires_grad=False)
# ratings = torch.randint(low=1, high=5, size=(data_size,), requires_grad=False).float()



In [8]:
# Build the model, the loss, the training data pipeline and the optimiser.

# nn.Module.to() moves the parameters in place and returns the module itself.
model = MatrixFact(rank, num_unique_users, num_unique_bus).to(device)

# Mean-squared error between predicted and observed ratings.
criterion = nn.MSELoss()

# Shuffled mini-batches of (business, user, rating) triples.
train_data = CustomDataset(
    business_data=train_bus_id,
    user_data=train_user_id,
    ratings_data=train_ratings,
)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Adam settings, spelled out explicitly.
adam_settings = dict(
    lr=0.001,              # learning rate
    betas=(0.9, 0.999),    # decay rates for the running gradient averages
    eps=1e-08,             # denominator term for numerical stability
    weight_decay=.01,      # L2 penalty
)
optimizer = torch.optim.Adam(model.parameters(), **adam_settings)

In [15]:
from mlflow.models.signature import infer_signature

# Select the MLflow experiment by name.
mlflow.set_experiment("CF Initial Experiment")

# One-row example of the model inputs: a user id and a business id.
example_columns = {
    'train_user_id': np.array([1]),
    'train_bus_id': np.array([1]),
}
input_data = pd.DataFrame(example_columns)

# Placeholder output: a single float standing in for a model prediction.
output_data = np.array([0.5])

# Derive the input/output schema from the two examples and display it.
signature = infer_signature(input_data, output_data)
signature

inputs: 
  ['train_user_id': long (required), 'train_bus_id': long (required)]
outputs: 
  [Tensor('float64', (-1,))]
params: 
  None

In [13]:
# Train inside a single MLflow run so the parameters, the loss curve and the
# final model are recorded together.
with mlflow.start_run():
    # log hyperparameters
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("learning_rate", optimizer.param_groups[0]['lr'])

    # log model architecture (the module's repr) as a text artifact
    model_architecture = str(model)
    with open("model_architecture.txt", "w") as f:
        f.write(model_architecture)
    mlflow.log_artifact("model_architecture.txt")

    # NOTE: every "epoch" below is one full-batch gradient step over the whole
    # training set; the mini-batch `train_loader` is not used by this loop.
    for epoch in range(num_epochs):
        optimizer.zero_grad()

        # forward pass
        pred = model(train_user_id, train_bus_id)

        # compute loss
        loss = criterion(pred, train_ratings)

        # backward pass and optimization
        loss.backward()
        optimizer.step()

        # read the scalar once and reuse it for logging, the stop check and
        # the progress print
        loss_value = loss.item()
        mlflow.log_metric("loss", loss_value, step=epoch)

        # stop early once the training MSE drops below 1
        if loss_value < 1:
            break

        if epoch % 100 == 0:
            print(f'Epoch {epoch}, MSE Loss: {loss_value}')

    # log the model together with the signature inferred in the previous
    # cell; the original passed `input_example`, a name that is never defined
    # in this notebook, which raised NameError after training finished
    mlflow.pytorch.log_model(
        model,
        "model",
        signature=signature
    )

Epoch 0, MSE Loss: 17.80499267578125
Epoch 100, MSE Loss: 16.903764724731445


KeyboardInterrupt: 