In [65]:
# Standard library imports
import os
import warnings

# Data processing and numerical libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import dask.dataframe as dd
from sklearn.model_selection import train_test_split

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and recommendation libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from transformers import AutoTokenizer, AutoModel

# MLflow for experiment tracking
import mlflow

# IPython for displaying outputs
from IPython.display import display

# Suppress warnings
warnings.filterwarnings('ignore')


In [3]:
# # split data once - no temporal features
# train, temp = spark_df.randomSplit([0.75, 0.25], seed=42)
# val, test = temp.randomSplit([.15, .10], seed=42)
# # 

# # define file paths (relative to the current directory)
# model_data_path = "../../data/interim/"

# # save each DataFrame in parquet format
# train.write.parquet(os.path.join(model_data_path, f"train_set.parquet"), mode='overwrite')
# val.write.parquet(os.path.join(model_data_path, f"val_set.parquet"), mode='overwrite')
# test.write.parquet(os.path.join(model_data_path, f"test_set.parquet"), mode='overwrite')

# from pyspark.sql import SparkSession

# ========================================
# Open sessions for necessary packages
# ========================================
# spark = None

# def open_session(close=False):
#     global spark  # 
#     if not close:
#         if spark is None or spark.sparkContext is None:
#             spark = SparkSession.builder \
#                 .appName("ALS in Spark") \
#                 .getOrCreate()
#             # set up MLflow (only needs to be done once)
#     else:
#         if spark is not None:
#             spark.stop()
#             spark = None
            
# # open_session()
# open_session(close=True)

In [42]:
# READ DATA
# model_data_path = "../data/interim/"

# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# train.repartition(10)


In [31]:
# # define PySpark ALS model
# als = ALS(
#     userCol="user_index",
#     itemCol="bus_index",
#     ratingCol="rating",
#     coldStartStrategy="drop"
# )


# # # Grid search through hyperparameters
# # paramGrid = (ParamGridBuilder()
# #              .addGrid(als.rank, [5, 10, 15])
# #              .addGrid(als.maxIter, [5, 10, 20])
# #              .addGrid(als.regParam, [0.01, 0.1, 0.5])
# #              .build())

# # define criterion
# evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

# # define cross-validation w simple grid
# crossval = CrossValidator(
#     estimator=als,
#     evaluator=evaluator,
#     estimatorParamMaps=paramGrid,
#     numFolds=1
# )

In [38]:
# rdd = train.rdd
# partitions = rdd.glom().collect()
# for index, partition in enumerate(partitions):
#     print(f"Partition {index} contains {len(partition)} rows.")
#     if len(partition) > 0:
#         print(f"Sample data from partition {index}: {partition[:5]}")

In [39]:

# open log 
# mlflow.set_experiment("ALS_Hyperparameter_Tuning")

# log results
# with mlflow.start_run():
    
    # fit model using cross-validation
    # cv_model = crossval.fit(train)
    
    # Log the best model
    # best_model = cv_model.bestModel
    # mlflow.spark.log_model(best_model, "best_model")
    
    # # Log metrics and model parameters for each parameter combination
    # for param_map, metric in zip(crossval.getEstimatorParamMaps(), cv_model.avgMetrics):
    #     rank = param_map[als.rank]
    #     regParam = param_map[als.regParam]
    #     maxIter = param_map[als.maxIter]
        
    #     mlflow.log_param("rank", rank)
    #     mlflow.log_param("regParam", regParam)
    #     mlflow.log_param("maxIter", maxIter)
    #     mlflow.log_metric("validation_rmse", metric)

    # # Log validation scores
    # validation_predictions = best_model.transform(val)
    # validation_rmse = evaluator.evaluate(validation_predictions)
    # mlflow.log_metric("validation_rmse", validation_rmse)


# # Log test metrics (optional, after final model selection)
# test_predictions = best_model.transform(test)
# test_rmse = evaluator.evaluate(test_predictions)
# mlflow.log_metric("test_rmse", test_rmse)


In [48]:
# # Define paths
# model_data_path = "../data/interim/"

# # read data and convert to pandas df
# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# def convert_to_tensors(df):
#     ratings_tensor = torch.tensor(df['rating'].values, dtype=torch.float32)
#     user_id_tensor = torch.tensor(df['user_index'].values, dtype=torch.int64)
#     bus_id_tensor = torch.tensor(df['bus_index'].values, dtype=torch.int64)
#     return ratings_tensor, user_id_tensor, bus_id_tensor

# def save_tensors(prefix, **tensors):
#     for name, tensor in tensors.items():
#         file_path = os.path.join(model_data_path, f'{prefix}_{name}.pt')
#         torch.save(tensor, file_path)

# # Convert data to tensors
# train_tensors = convert_to_tensors(train)
# val_tensors = convert_to_tensors(val)
# test_tensors = convert_to_tensors(test)

# # Save tensors
# save_tensors('train', ratings=train_tensors[0], user_id=train_tensors[1], bus_id=train_tensors[2])
# save_tensors('val', ratings=val_tensors[0], user_id=val_tensors[1], bus_id=val_tensors[2])
# save_tensors('test', ratings=test_tensors[0], user_id=test_tensors[1], bus_id=test_tensors[2])

In [None]:
# Location of the interim parquet files, relative to the notebook directory.
mac_data_path = "../data/interim/"

data_path = mac_data_path

# FIXME: this cell previously ended with a bare `pd.read_parquet()`. The `path`
# argument is required, so that call raised TypeError and stopped a
# "Restart & Run All". Name the interim file to load before re-enabling, e.g.:
# interim_df = pd.read_parquet(os.path.join(data_path, "<file>.parquet"))

In [102]:
# Directory holding the final, model-ready dataframe.
mac_data_path = "../data/processed/Final Dataframes"

data_path = mac_data_path

# Fixed seed so the train/val/test partition is identical on every run;
# without it each kernel restart produces a different split.
SPLIT_SEED = 42

# # Check if Mac GPU is available
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Check if nvidia gpu is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

df = pd.read_parquet(os.path.join(data_path, "final_df.parquet"))

# 80% train; the remaining 20% is halved into validation and test (10% each).
train, val_test = train_test_split(df, test_size=.20, shuffle=True, random_state=SPLIT_SEED)
val, test = train_test_split(val_test, test_size=.5, shuffle=True, random_state=SPLIT_SEED)


Using device: cpu


In [103]:
class CFDataset(Dataset):
    """In-memory dataset of review rows for the collaborative-filtering model.

    Every column the model consumes is converted to a tensor once, at
    construction time, so ``__getitem__`` is a plain index lookup.

    Args:
        dataframe: pandas DataFrame providing the columns read in ``__init__``:
            ``user_num_id``, ``bus_num_id``, ``day_of_week``, ``day_of_year``,
            ``region_code``, ``state_code``, ``city_code``, the six columns in
            ``NUMERICAL_COLUMNS``, ``tokens`` and ``mean_centered_rating``.
    """

    # Continuous feature columns, stacked in this order into `numerical_features`.
    NUMERICAL_COLUMNS = [
        'user_avg_rating_norm',
        'bus_avg_rating_norm',
        'log_business_review_count_norm',
        'log_user_review_count_norm',
        'years_yelp_member_norm',
        'years_since_review_norm',
    ]

    def __init__(self, dataframe):
        # UI embeddings
        self.users = torch.tensor(dataframe['user_num_id'].values, dtype=torch.long)
        self.businesses = torch.tensor(dataframe['bus_num_id'].values, dtype=torch.long)
        # dt embeddings
        self.dotw = torch.tensor(dataframe['day_of_week'].values, dtype=torch.long)
        self.doty = torch.tensor(dataframe['day_of_year'].values, dtype=torch.long)
        # geography embeddings
        self.region_code = torch.tensor(dataframe['region_code'].values, dtype=torch.long)
        self.state_code = torch.tensor(dataframe['state_code'].values, dtype=torch.long)
        self.city_code = torch.tensor(dataframe['city_code'].values, dtype=torch.long)

        # Numerical features: one row per sample, one column per entry of
        # NUMERICAL_COLUMNS.
        self.numerical_features = torch.tensor(
            dataframe[self.NUMERICAL_COLUMNS].values,
            dtype=torch.float,
        )

        # tokens: go through a single ndarray first. Handing torch.tensor a
        # Python list of per-row arrays is very slow and triggers a UserWarning.
        # NOTE(review): assumes every row's token sequence has the same length;
        # confirm against the tokenisation step.
        self.tokens = torch.tensor(np.array(dataframe['tokens'].tolist()), dtype=torch.long)

        # Target (ratings)
        self.ratings = torch.tensor(dataframe['mean_centered_rating'].values, dtype=torch.float)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx``.

        ``features`` is a dict of tensors keyed by feature name; ``target`` is
        the row's ``mean_centered_rating``.
        """
        features = {
            'users': self.users[idx],
            'businesses': self.businesses[idx],
            'dotw': self.dotw[idx],
            'doty': self.doty[idx],
            'region_code': self.region_code[idx],
            'state_code': self.state_code[idx],
            'city_code': self.city_code[idx],
            'numerical_features': self.numerical_features[idx],
            # Previously built in __init__ but never handed to the caller.
            'tokens': self.tokens[idx],
        }

        target = self.ratings[idx]

        return features, target


In [106]:
# Mini-batch size shared by all three loaders; adjust as needed.
batch_size = 64

# Wrap each dataframe split in a CFDataset (train, then val, then test).
train_dataset, val_dataset, test_dataset = map(CFDataset, (train, val, test))

# Only the training loader shuffles; val/test use DataLoader's default ordering.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [95]:
# Concat user and business data into one tensor.
# The id tensors live on the CFDataset objects (`.users` / `.businesses`).
# `train`, `val` and `test` are the pandas splits, so reading `.users` from
# them only worked with state left over from an earlier kernel session.
total_user_ids = torch.concat(
    (train_dataset.users, val_dataset.users, test_dataset.users), dim=0
)
total_business_ids = torch.concat(
    (train_dataset.businesses, val_dataset.businesses, test_dataset.businesses), dim=0
)

# get number of unique entities
# NOTE(review): if these counts are used to size embedding tables, that is only
# safe when the ids are contiguous in [0, n) - confirm, otherwise use max id + 1.
num_unique_users = torch.unique(total_user_ids).shape[0]
num_unique_bus = torch.unique(total_business_ids).shape[0]

# output findings
print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique business's : {num_unique_bus}")

Number of unique users: 287116
Number of unique business's : 148523


In [5]:
# define model architecture

# initially starting with just cosine similarity
class CFmodel(nn.Module):
    """Matrix-factorisation scorer: dot product of user and business embeddings.

    The score is the unnormalised dot product of the two embedding vectors (not
    a cosine similarity - no norm division is applied), passed through a
    1-in/1-out linear layer that supplies a learnable scale and bias.

    Args:
        rank: Dimension of each embedding vector.
        num_users: Number of rows in the user embedding table.
        num_bus: Number of rows in the business embedding table.
    """

    def __init__ (self, rank, num_users, num_bus):
        super().__init__()
        self.rank = rank
        self.user_emb = nn.Embedding(num_users, rank)
        self.bus_emb = nn.Embedding(num_bus, rank)
        self.fc1 = nn.Linear(1,1)

    def forward(self, user_id, business_id):
        """Score a batch of (user, business) pairs.

        Args:
            user_id: 1-D integer tensor of user indices, shape ``(batch,)``.
            business_id: 1-D integer tensor of business indices, same shape.

        Returns:
            Tensor of shape ``(batch,)`` with one predicted score per pair.
        """
        # look up the embedding vector for each id -> (batch, rank)
        user_vec = self.user_emb(user_id)
        bus_vec = self.bus_emb(business_id)

        # calculate similarities: element-wise product summed over the rank
        # axis is the per-pair dot product -> (batch, 1)
        dot = (user_vec * bus_vec).sum(dim=1, keepdim=True)
        result = self.fc1(dot)

        # Drop only the trailing feature axis. A bare .squeeze() would also
        # drop the batch axis when batch == 1 (e.g. the last, short batch of an
        # epoch), returning a 0-d tensor that no longer matches the target.
        return result.squeeze(-1)

For the first iteration of the model, I will use only cosine similarity and store the results in MLflow.

In [6]:
# Training hyperparameters, in order: value passed to the model constructor as
# its rank, mini-batch size for the training DataLoader, and epoch count.
rank, batch_size, num_epochs = 10, 1000, 3000
# # create data 
# users = torch.randint(num_users, (data_size,), requires_grad=False)
# business = torch.randint(num_business, (data_size,), requires_grad=False)
# ratings = torch.randint(low=1, high=5, size=(data_size,), requires_grad=False).float()



In [8]:
# initialize model and define optimization

# CFmodel is the model class defined in this notebook; its constructor takes
# (rank, num_users, num_bus). The previous name used here (`MatrixFact`) is not
# defined anywhere in the notebook, so the cell failed on a fresh kernel.
model = CFmodel(rank, num_unique_users, num_unique_bus)
model.to(device)

criterion = nn.MSELoss()

# Reuse the CFDataset built for the training split. The former `CustomDataset`
# and the `train_bus_id` / `train_user_id` / `train_ratings` tensors are not
# defined in the notebook. Batches are therefore `(features_dict, target)`
# pairs; use features['users'] and features['businesses'] as model inputs.
# NOTE(review): confirm the training loop unpacks batches in this form.
train_data = train_dataset
train_loader = DataLoader(train_data, batch_size, shuffle=True)


optimizer = torch.optim.Adam(
    model.parameters(),    # Parameters of the model to optimize
    lr=0.001,              # Learning rate (default is 0.001)
    betas=(0.9, 0.999),    # Coefficients for computing running averages of gradient and its square
    eps=1e-08,             # Term added to the denominator to improve numerical stability
    weight_decay=.01       # Weight decay (L2 penalty)
)