In [1]:
# Standard library imports
import os
import warnings
import gc

# Data processing and numerical libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import dask.dataframe as dd
from sklearn.model_selection import train_test_split

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow
from torch.utils.tensorboard import SummaryWriter
import time
import datetime
from tqdm import tqdm


# Machine learning and recommendation libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from transformers import AutoTokenizer, AutoModel

# MLflow for experiment tracking
import mlflow
import wandb

# IPython for displaying outputs
from IPython.display import display

# Suppress warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# # split data once - no temporal features
# train, temp = spark_df.randomSplit([0.75, 0.25], seed=42)
# val, test = temp.randomSplit([.15, .10], seed=42)
# # 

# # define file paths (relative to the current directory)
# model_data_path = "../../data/interim/"

# # save each DataFrame in parquet format
# train.write.parquet(os.path.join(model_data_path, f"train_set.parquet"), mode='overwrite')
# val.write.parquet(os.path.join(model_data_path, f"val_set.parquet"), mode='overwrite')
# test.write.parquet(os.path.join(model_data_path, f"test_set.parquet"), mode='overwrite')

# from pyspark.sql import SparkSession

# ========================================
# Open sessions for necessary packages
# ========================================
# spark = None

# def open_session(close=False):
#     global spark  # 
#     if not close:
#         if spark is None or spark.sparkContext is None:
#             spark = SparkSession.builder \
#                 .appName("ALS in Spark") \
#                 .getOrCreate()
#             # set up MLflow (only needs to be done once)
#     else:
#         if spark is not None:
#             spark.stop()
#             spark = None
            
# # open_session()
# open_session(close=True)

In [42]:
# READ DATA
# model_data_path = "../data/interim/"

# train = spark.read.parquet(os.path.join(model_data_path, "train_set.parquet")).toPandas()
# val = spark.read.parquet(os.path.join(model_data_path, "val_set.parquet")).toPandas()
# test = spark.read.parquet(os.path.join(model_data_path, "test_set.parquet")).toPandas()

# train.repartition(10)


In [31]:
# # define PySpark ALS model
# als = ALS(
#     userCol="user_index",
#     itemCol="bus_index",
#     ratingCol="rating",
#     coldStartStrategy="drop"
# )


# # # Grid search through hyperparameters
# # paramGrid = (ParamGridBuilder()
# #              .addGrid(als.rank, [5, 10, 15])
# #              .addGrid(als.maxIter, [5, 10, 20])
# #              .addGrid(als.regParam, [0.01, 0.1, 0.5])
# #              .build())

# # define criterion
# evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

# # define cross-validation w simple grid
# crossval = CrossValidator(
#     estimator=als,
#     evaluator=evaluator,
#     estimatorParamMaps=paramGrid,
#     numFolds=1
# )

In [38]:
# rdd = train.rdd
# partitions = rdd.glom().collect()
# for index, partition in enumerate(partitions):
#     print(f"Partition {index} contains {len(partition)} rows.")
#     if len(partition) > 0:
#         print(f"Sample data from partition {index}: {partition[:5]}")

In [39]:

# open log 
# mlflow.set_experiment("ALS_Hyperparameter_Tuning")

# log results
# with mlflow.start_run():
    
    # fit model using cross-validation
    # cv_model = crossval.fit(train)
    
    # Log the best model
    # best_model = cv_model.bestModel
    # mlflow.spark.log_model(best_model, "best_model")
    
    # # Log metrics and model parameters for each parameter combination
    # for param_map, metric in zip(crossval.getEstimatorParamMaps(), cv_model.avgMetrics):
    #     rank = param_map[als.rank]
    #     regParam = param_map[als.regParam]
    #     maxIter = param_map[als.maxIter]
        
    #     mlflow.log_param("rank", rank)
    #     mlflow.log_param("regParam", regParam)
    #     mlflow.log_param("maxIter", maxIter)
    #     mlflow.log_metric("validation_rmse", metric)

    # # Log validation scores
    # validation_predictions = best_model.transform(val)
    # validation_rmse = evaluator.evaluate(validation_predictions)
    # mlflow.log_metric("validation_rmse", validation_rmse)


# # Log test metrics (optional, after final model selection)
# test_predictions = best_model.transform(test)
# test_rmse = evaluator.evaluate(test_predictions)
# mlflow.log_metric("test_rmse", test_rmse)


In [4]:
class CFDataset(Dataset):
    def __init__(self, dataframe):
        self.features = {
            'user_id': torch.tensor(dataframe['user_num_id'].values, dtype=torch.long),
            'business_id': torch.tensor(dataframe['business_num_id'].values, dtype=torch.long),
            # 'city_id': torch.tensor(dataframe['city_code'].values, dtype=torch.long),
            # 'state_id': torch.tensor(dataframe['state_code'].values, dtype=torch.long),
            # 'region_id': torch.tensor(dataframe['region_code'].values, dtype=torch.long),
            # 'dotw': torch.tensor(dataframe['day_of_week'].values, dtype=torch.long),
            # 'doty': torch.tensor(dataframe['day_of_year'].values, dtype=torch.long),
            # 'numerical_features': torch.tensor(dataframe[['user_avg_rating_norm', 
            #                                               'bus_avg_rating_norm', 
            #                                               'log_business_review_count_norm', 
            #                                               'log_user_review_count_norm', 
            #                                               'years_yelp_member_norm', 
            #                                               'years_since_review_norm']]
            #                                    .values,
            #                                    dtype=torch.float),
            # 'pca_features': torch.tensor(dataframe[['pca_1', 'pca_2', 'pca_3', 
            #                                         'pca_4', 'pca_5']].values, dtype=torch.float32),
            # 'tokens': torch.tensor(dataframe['tokens'].tolist(), dtype=torch.long),
            # 'categories' : torch.tensor(dataframe['categories_enc'].tolist(), dtype=torch.long)
        }
        self.target = torch.tensor(dataframe['mean_centered_rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.features.items()}, self.target[idx]

In [5]:
def split_and_save_data(dataframe, test_size=0.2, random_state=42):
    # split data
    train_df, temp_df = train_test_split(dataframe, test_size=test_size, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=.5, random_state=random_state)

    # create datasets
    train_dataset = CFDataset(train_df)
    val_dataset = CFDataset(val_df)
    test_dataset = CFDataset(test_df)

    # import and save dataset as tensor
    torch.save(train_dataset, os.path.join(tensor_path, 'train_dataset.pt'))
    torch.save(val_dataset, os.path.join(tensor_path, 'val_dataset.pt'))
    torch.save(test_dataset, os.path.join(tensor_path, 'test_dataset.pt'))
    
    
    # save dataset as df
    train_df.to_parquet(os.path.join(df_path, 'train_df.parquet'))
    val_df.to_parquet(os.path.join(df_path, 'val_df.parquet'))
    test_df.to_parquet(os.path.join(df_path, 'test_df.parquet'))

    print("Datasets have been split and saved.")

def load_datasets(as_Tensor=True):
    
    if as_Tensor:
        train = torch.load(os.path.join(tensor_path, 'train_dataset.pt'))
        val= torch.load(os.path.join(tensor_path, 'val_dataset.pt'))
        test = torch.load(os.path.join(tensor_path, 'test_dataset.pt'))
    else:
        train = pd.read_parquet(os.path.join(df_path, 'train_df.parquet'))
        val = pd.read_parquet(os.path.join(df_path, 'val_df.parquet'))
        test = pd.read_parquet(os.path.join(df_path, 'test_df.parquet'))
        
    print(f"Data loaded as {'tensor' if as_Tensor else 'DataFrame'}.")

        
    return train, val, test

final_df_path = "../data/processed/Final Dataframes/final_df.parquet"
tensor_path = df_path = "../data/processed/Final Tensors/"
temp_tensor_path = "../data/processed/Temp Tensors/"

df = pd.read_parquet(final_df_path)

split_and_save_data(df)
# train_dataset, val_dataset, test_dataset = load_datasets(True)
train_df, val_df, test_df = load_datasets(False)

train_dataset = CFDataset(train_df)
val_dataset = CFDataset(val_df)


Datasets have been split and saved.
Data loaded as DataFrame.


In [6]:
train_dataset.features

{'user_id': tensor([135044, 251524, 160163,  ...,  84981,  69299, 141271]),
 'business_id': tensor([ 74868,  97998,  19175,  ...,  14868,  30678, 135453])}

In [8]:
# Concat user and business data into one tensor
total_user_ids = torch.concat((train_dataset.features['user_id'],
                               val_dataset.features['user_id'],
                               test_dataset.features['user_id']), dim=0)
total_business_ids = torch.concat((train_dataset.features['business_id'],
                                    val_dataset.features['business_id'],
                                    test_dataset.features['business_id']), dim=0)

# get number of unique entities
num_unique_users = torch.unique(total_user_ids).shape[0] 
num_unique_bus = torch.unique(total_business_ids).shape[0]

# output findings
print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique business's : {num_unique_bus}")

NameError: name 'test_dataset' is not defined

In [7]:
import torch
import torch.nn as nn

class CFmodel(nn.Module):
    def __init__(self, vocab_size=30522, rank=32, num_users=287116, num_bus=148523, num_city=1273, num_pca=5,
                 num_regions=11, num_states=50, token_length=10, num_heads=4):
        super().__init__()
        self.rank = rank
        self.token_length = token_length

        # Embeddings
        self.user_emb = nn.Embedding(num_users, rank)
        self.bus_emb = nn.Embedding(num_bus, rank)
        # self.city_emb = nn.Embedding(num_city, rank)
        # self.state_emb = nn.Embedding(num_states, rank)
        # self.region_emb = nn.Embedding(num_regions, rank)
        # self.dotw_emb = nn.Embedding(7, rank)
        # self.doty_emb = nn.Embedding(367, rank)
        # self.token_emb = nn.Embedding(vocab_size, rank)



        # # Layer for numerical features
        # self.numerical_layer = nn.Sequential(
        #     nn.Linear(5, rank),
        #     nn.ReLU(),
        #     nn.BatchNorm1d(rank)
        # )

        # # Multi-head attention for combining embeddings
        # self.multihead_attn = nn.MultiheadAttention(embed_dim=rank, num_heads=num_heads)
        
        # # Final layers
        # self.fc_layers = nn.Sequential(
        #     nn.Linear(rank, 256),
        #     nn.ReLU(),
        #     nn.BatchNorm1d(256),
        #     nn.Linear(256, 128),
        #     nn.ReLU(),
        #     nn.BatchNorm1d(128),
        #     nn.Linear(128, 64),
        #     nn.ReLU(),
        #     nn.BatchNorm1d(64),
        #     nn.Linear(64, 1)
        # )

        self.fc_layers = nn.Sequential(
            nn.Linear(self.rank*2, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout1d(.2),
            nn.Linear(128, 1)
        )

    def forward(self, **kwargs):
        # Process embeddings
        user_emb = self.user_emb(kwargs['user_id'])#.unsqueeze(0)
        bus_emb = self.bus_emb(kwargs['business_id'])#.unsqueeze(0)
        # city_emb = self.city_emb(kwargs['city_id']).unsqueeze(0)
        # state_emb = self.state_emb(kwargs['state_id']).unsqueeze(0)
        # region_emb = self.region_emb(kwargs['region_id']).unsqueeze(0)
        # dotw_emb = self.dotw_emb(kwargs['dotw']).unsqueeze(0)
        # doty_emb = self.doty_emb(kwargs['doty']).unsqueeze(0)
        
        # # Process numerical features
        # numerical = self.numerical_layer(kwargs['numerical_features']).unsqueeze(0)
        # Process PCA reduced Features
        # pca_layer = self.numerical_layer(kwargs['pca_features'])

        # # Process tokens (reviews)
        # tokens_emb = self.token_emb(kwargs['tokens']).mean(dim=1).unsqueeze(0)  # Average pooling
        
        # # Combine all features using multi-head attention
        # combined_features = torch.cat([
        #     user_emb, bus_emb, city_emb, state_emb, region_emb, 
        #     dotw_emb, doty_emb, numerical, tokens_emb
        # ], dim=0)

        combined_features = torch.cat([user_emb, bus_emb], dim=1)
        
        # attn_output, _ = self.multihead_attn(combined_features, combined_features, combined_features)
        
        # Average the attention output
        # final_representation = attn_output.mean(dim=0)
        
        # Pass through final layers
        result = self.fc_layers(combined_features)
        
        return result.squeeze()

In [8]:

class EarlyStopping:
    def __init__(self, patience: int) -> None:
        self.patience = patience
        self.strikes = 0
        self.best_model = None
        self.prev_loss = float('inf')
        self.min_loss = float('inf')
        self.early_stop = False
    
    def __call__(self, val_loss: float, model: torch.nn.Module) -> bool:
        # If the current validation loss is less than the previous best
        if val_loss < self.prev_loss:
            self.prev_loss = val_loss  # Update the previous loss
            self.strikes = 0            # Reset strikes
            if val_loss < self.min_loss:  # Check if it's the best model
                self.best_model = model.state_dict()
                self.min_loss = val_loss
        else:  # If no improvement, increment strikes
            self.strikes += 1
            if self.strikes >= self.patience:  # Check for early stopping
                
                self.early_stop = True
                
                print(f"Early Stopping incurred, after {self.patience} iteration(s) of increasing validation loss. ")
                print(f"Current validation loss: {val_loss}, best validation loss: {self.min_loss}")

        return self.early_stop
    
    def get_best_model(self):
        """Return the best model's state dictionary."""
        return self.best_model
    
    def __str__(self) -> str:
        if self.strikes < self.patience:
            return (f"Current patience is {self.patience}. \n"
                    f"{self.strikes} strikes have occurred. "
                    f"{self.patience - self.strikes} strikes left.")
        else:
            return "Early stopping has been triggered."


In [9]:
# # Hyperparameters
rank = 10


# Model / Optimizer Information 
batch_size = 80000
lr = .002
dataset="yelp"
num_epochs = 20
betas = (0.9, 0.999)
eps=1e-08
weight_decay=.01 #L2 norm

# # Check if Mac GPU is available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_workers = 4 if device == "cuda" else 0
pin_memory = True if device == "cuda" else False

# Load datasets

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,         
    pin_memory=pin_memory       
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=0,         
    pin_memory=pin_memory       
)

# val_loader = DataLoader(
#     test_dataset,
#     batch_size=batch_size,
#     num_workers=8,       
#     pin_memory=True      
# )

# # Check if Mac GPU is available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Check if nvidia gpu is available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


print(f'Using device: {device}')

Using device: mps


In [None]:
# # initialize model and define optimization
model = CFmodel(rank=10)
model.to(device)


criterion = nn.MSELoss()
early_stopper = EarlyStopping(patience=3)


optimizer = torch.optim.AdamW(
    model.parameters(),     # Parameters of the model to optimize
    lr=lr,                 # Learning rate (default is 0.001)
    betas=betas,           # Coefficients for computing running averages of gradient and its square
    eps=eps,               # Term added to the denominator to improve numerical stability
    weight_decay=weight_decay  # Weight decay (L2 penalty) applied directly
)

In [11]:
model


CFmodel(
  (user_emb): Embedding(287116, 10)
  (bus_emb): Embedding(148523, 10)
  (fc_layers): Sequential(
    (0): Linear(in_features=20, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout1d(p=0.2, inplace=False)
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [17]:
def calculate_grad_norm(model):
    # Calculate gradient norm
    total_norm = 0
    for p in model.parameters():
        if p.grad is not None:
            param_norm = p.grad.data.norm(2)  # Compute L2 norm
            total_norm += param_norm.item() ** 2

    grad_norm = total_norm ** 0.5
    return grad_norm

def calulcate_val_loss(model):
    
    model.eval()
    val_loss = 0
    
    with torch.no_grad():
        for val_features, val_target in val_loader:
            val_features = to_device(val_features, device)
            val_target = to_device(val_target, device)

            val_pred = model(**val_features)
            val_loss += criterion(val_pred, val_target).item()

        val_loss /= len(val_loader)
        
        return val_loss


def log_performance(model, epoch=0, num_iter=0, train_loss=0, val_loss=None):
                    
    grad_norm = calculate_grad_norm(model=model)
    
    # Log performance to wandb
    wandb.log({
        'epoch': epoch,
        'iteration': num_iter,
        'loss_train': train_loss,
        'gradient_norms' : grad_norm
    })
    
    # log performance to tensorboard
    writer.add_scalar('loss/train', train_loss, epoch * len(train_loader) + num_iter)
    writer.add_scalar('gradients/norm', grad_norm, epoch * len(train_loader) + num_iter)
    
    # Log distribution of weights and grad
    for name, param in model.named_parameters():
        writer.add_histogram(f'weights/{name}', param.data, epoch * len(train_loader) + num_iter)
        writer.add_histogram(f'gradients/{name}', param.grad, epoch * len(train_loader) + num_iter)
        
        wandb.log({f'weights/{name}': param.data.cpu().numpy()})
        wandb.log({f'gradients/{name}': param.grad.data.cpu().numpy()})
          
    if val_loss:
        

        val_loss = calulcate_val_loss(model)

        # update logs 
        wandb.log({'loss/val': val_loss})
        writer.add_scalar("loss/val", val_loss)
        
        # Print current training stats
        print(f"Epoch: {epoch}, Iter: {num_iter}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        

def log_closing_stats(model, start_time, save_dir="./Model history/"):
    # Calculate and log training duration
    end_time = time.time()
    train_duration = end_time - start_time
    
    # Log training duration to wandb and TensorBoard
    wandb.log({'training_duration': train_duration})
    writer.add_scalar("training_duration", train_duration)
    
    # Print training duration
    print(f"Total training time: {train_duration / 60:.2f} minutes")
    
    # Save the model with a timestamp
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 
    model_path = f"{save_dir}model_{timestamp}.pth" # Create the model path
    
    torch.save(model.state_dict(), model_path) # Save the model state
    
    # Log model to wandb as an artifact
    artifact = wandb.Artifact('model', type='model')
    artifact.add_file(model_path) # Attach model file
    wandb.log_artifact(artifact) # Log artifact to wandb
    

    return train_duration

# load data onto gpu
def to_device(data, device):
    if isinstance(data, dict):
        return {k: v.to(device) for k, v in data.items()}
    return data.to(device)

        

In [18]:
# watch -n 1 nvidia-smi

# Initialize W&B to watch the model
wandb.init(project="New Model", config={
    "descr": " user emb only",
    "learning_rate": lr,
    "betas": betas,
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "rank": rank,
    "weight_decay": weight_decay,
    "model_architecture": model
})


log_dir = os.path.join("./../models/logs/Experiments")
writer = SummaryWriter(log_dir)



In [19]:
# Record start time
start_time = time.time()

wandb.watch(model, log="all")

# Training loop with progress bar
for epoch in range(num_epochs):
    
    model.train()
    
    for num_iter, (features, target) in enumerate(tqdm(train_loader,
                                                       desc=f"Epoch {epoch+1}/{num_epochs}",
                                                       unit="batch")):

        optimizer.zero_grad() # zero gradients
       
        # move to gpu
        features = to_device(features, device) 
        target = to_device(target, device)

        pred = model(**features) #foward pass
        loss = criterion(pred, target) #mse
        loss.backward() #backward pass
        optimizer.step() #update grad

        last_iteration = True if num_iter == (len(train_loader) - 1) else False
        
        if last_iteration:
            val_loss = calulcate_val_loss(model)
            
        else:
            val_loss = None
            
        log_performance(model=model,
                        epoch=epoch, 
                        num_iter=num_iter, 
                        train_loss=loss.item(), 
                        val_loss=val_loss
                        )
        
    # Check for early stopping condition
    stop = early_stopper(val_loss, model)
    
    if stop:
        break
    
train_duration = log_closing_stats(model, start_time=start_time)

# Print the total training time
print(f"Total training time: {train_duration / 60:.2f}")

wandb.finish()


Epoch 1/20: 100%|██████████| 44/44 [01:08<00:00,  1.57s/batch]


Epoch: 0, Iter: 43, Train Loss: 1.4443, Val Loss: 1.4467


Epoch 2/20: 100%|██████████| 44/44 [01:07<00:00,  1.53s/batch]


Epoch: 1, Iter: 43, Train Loss: 1.4514, Val Loss: 1.4449


Epoch 3/20: 100%|██████████| 44/44 [01:07<00:00,  1.52s/batch]


Epoch: 2, Iter: 43, Train Loss: 1.4346, Val Loss: 1.4421


Epoch 4/20: 100%|██████████| 44/44 [01:06<00:00,  1.52s/batch]


Epoch: 3, Iter: 43, Train Loss: 1.4197, Val Loss: 1.4365


Epoch 5/20: 100%|██████████| 44/44 [01:06<00:00,  1.51s/batch]


Epoch: 4, Iter: 43, Train Loss: 1.4131, Val Loss: 1.4267


Epoch 6/20: 100%|██████████| 44/44 [01:07<00:00,  1.54s/batch]


Epoch: 5, Iter: 43, Train Loss: 1.3955, Val Loss: 1.4112


Epoch 7/20: 100%|██████████| 44/44 [01:07<00:00,  1.53s/batch]


Epoch: 6, Iter: 43, Train Loss: 1.3717, Val Loss: 1.3932


Epoch 8/20: 100%|██████████| 44/44 [01:07<00:00,  1.52s/batch]


Epoch: 7, Iter: 43, Train Loss: 1.3446, Val Loss: 1.3755


Epoch 9/20: 100%|██████████| 44/44 [01:05<00:00,  1.48s/batch]


Epoch: 8, Iter: 43, Train Loss: 1.3303, Val Loss: 1.3591


Epoch 10/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 9, Iter: 43, Train Loss: 1.3152, Val Loss: 1.3464


Epoch 11/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 10, Iter: 43, Train Loss: 1.2841, Val Loss: 1.3365


Epoch 12/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 11, Iter: 43, Train Loss: 1.2532, Val Loss: 1.3295


Epoch 13/20: 100%|██████████| 44/44 [01:06<00:00,  1.50s/batch]


Epoch: 12, Iter: 43, Train Loss: 1.2502, Val Loss: 1.3252


Epoch 14/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 13, Iter: 43, Train Loss: 1.2327, Val Loss: 1.3223


Epoch 15/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 14, Iter: 43, Train Loss: 1.2099, Val Loss: 1.3227


Epoch 16/20: 100%|██████████| 44/44 [01:05<00:00,  1.49s/batch]


Epoch: 15, Iter: 43, Train Loss: 1.1908, Val Loss: 1.3249


Epoch 17/20: 100%|██████████| 44/44 [01:05<00:00,  1.50s/batch]

Epoch: 16, Iter: 43, Train Loss: 1.1708, Val Loss: 1.3268
Early Stopping incurred, after 3 iteration(s) of increasing validation loss. 
Current validation loss: 1.3268404603004456, best validation loss: 1.32227623462677
Total training time: 18.82 minutes





Total training time: 18.82


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
gradient_norms,▅▃▄▅▂▂▃▂▁▃▇▃▂▂▃▄▄▂▃▄▃▄▄▃▄▅▅▆▅▄▅▇█▇▇▆▅▄▆▆
gradients/fc_layers.4.bias,▄▂▄▄▂▄▄▅▂▁▃▄▄▄▂▃▃▃▃▄▃▃▃▃▄▅▄▃▆▃▄▃▆▃▄▄▄█▄▄
iteration,▃▂▃▇▁▇▃█▂▄▅▃▆▂▆▁▂▆▇█▅▅▆▃█▄▅▅▇▄▄▇█▄▅▆▁▂▃▇
loss/val,███▇▇▆▅▄▃▂▂▁▁▁▁▁▁
loss_train,████▇▇▇▇▇▇▇▇▇▇▇▆▆▅▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▂▁
training_duration,▁
weights/fc_layers.4.bias,█▆▅▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁

0,1
epoch,16.0
gradient_norms,0.2275
gradients/fc_layers.4.bias,-0.00674
iteration,43.0
loss/val,1.32684
loss_train,1.17079
training_duration,1129.2151
weights/fc_layers.4.bias,0.01603


In [18]:
#Close session
wandb.finish()

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
gradient_norms,▄█▂▂▄▃▁▄▃▂▃▂▃▄▁▃▃▃▂▅▃▃▃▂▅▃▄▃▂▃▃▂▄▃▂▃▄▃▃▄
iteration,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss_train,▄▅▆▄▄▄▄▆▇▅▅▇▃▃▄▅▅█▃▆▆▅▄▄▅▁▅▄▄▅▅▅▆▅▄▅▆▄▇▅
loss_val,▁
training_duration,▁

0,1
epoch,0.0
gradient_norms,0.28267
iteration,43.0
loss_train,1.14684
loss_val,1.33425
training_duration,54.63309


In [37]:
# Save model
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%:%M%:%S") 

In [38]:
timestamp

'2024-09-19_22:43:06'

In [185]:
# Function to generate random experiment names
def random_experiment_name():
    return ''.join(random.choices(string.ascii_letters + string.digits, k=8))

trying = random_experiment_name()

In [184]:
import random
import string

In [186]:
trying

'WSUnp3qW'

In [3]:
tokens = torch.load('/Users/matthewaudley/Documents/Machine Learning Projects/Business Recommendation System/data/processed/Temp Tensors/input_ids.pt')
att = torch.load('/Users/matthewaudley/Documents/Machine Learning Projects/Business Recommendation System/data/processed/Temp Tensors/mask.pt')

In [8]:
import torch
from transformers import DistilBertModel
from tqdm import tqdm

class DistilBertEmb(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Freeze parameters, do not want to fine-tune DistilBERT
        for param in self.distilbert_model.parameters():
            param.requires_grad = False

    def forward(self, tokens, att_mask):
        # Forward pass
        outputs = self.distilbert_model(tokens, attention_mask=att_mask)
        # Get embeddings
        emb = outputs.last_hidden_state
        # Mean Pooling
        pooled_tokens_emb = emb.mean(dim=1)  # Across sequence dimension
        return pooled_tokens_emb

def compute_embeddings(token_tensor, att_mask_tensor, distilbert_model, batch_size=128):  
    # Ensure the model is in eval mode
    distilbert_model.eval()

    # List to store embeddings
    all_embeddings = []

    # Move the tensors to the appropriate device (MPS if available)
    token_tensor = token_tensor.to('mps') if torch.backends.mps.is_available() else token_tensor.to('cpu')
    att_mask_tensor = att_mask_tensor.to('mps') if torch.backends.mps.is_available() else att_mask_tensor.to('cpu')

    # Loop over batches of data
    for i in tqdm(range(0, token_tensor.size(0), batch_size)):
        # Get the current batch
        batch_tokens = token_tensor[i:i + batch_size]
        batch_att_mask = att_mask_tensor[i:i + batch_size]

        # Compute embeddings with mixed precision
        with torch.no_grad():  # Disable gradients for faster computation
            with torch.cuda.amp.autocast():  # Use mixed precision
                embeddings = distilbert_model(batch_tokens, batch_att_mask)

        # Append the embeddings to the list
        all_embeddings.append(embeddings.cpu())  # Move embeddings to CPU for storage

    # Concatenate all embeddings into a single tensor
    embeddings_tensor = torch.cat(all_embeddings, dim=0)

    return embeddings_tensor

# Usage example
# Initialize your DistilBERT embedding model
distilbert_model = DistilBertEmb().to('mps') if torch.backends.mps.is_available() else DistilBertEmb()  # Move to MPS if available

# Assuming `tokens` and `att_masks` are your input tensors
embeddings_tensor = compute_embeddings(tokens, att, distilbert_model)

# Save the resulting embeddings tensor
torch.save(embeddings_tensor, 'embeddings_tensor.pt')


RuntimeError: MPS backend out of memory (MPS allocated: 17.73 GB, other allocations: 656.00 KB, max allowed: 18.13 GB). Tried to allocate 5.24 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [17]:
import pyautogui
import random
import time

while True:
    # Generate a random scroll amount between -3 and 3
    scroll_amount = random.randint(-3, 3)
    
    # Scroll the mouse
    pyautogui.scroll(scroll_amount)
    
    # Generate a random wait time between 30 and 120 seconds
    wait_time = random.randint(30, 120)
    
    # Wait for the random amount of time before scrolling again
    time.sleep(wait_time)


KeyboardInterrupt: 

In [10]:
!pip install pyautogui

Collecting pyautogui
  Downloading PyAutoGUI-0.9.54.tar.gz (61 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pymsgbox (from pyautogui)
  Downloading PyMsgBox-1.0.9.tar.gz (18 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pytweening>=1.0.4 (from pyautogui)
  Downloading pytweening-1.2.0.tar.gz (171 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pyscreeze>=0.1.21 (from pyautogui)
  Downloading pyscreeze-1.0.1.tar.gz (27 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pygetwindow>=0.0.5 (from pyautogui)
  Downloading PyGetWindow-0.0.9.tar.gz (9.7 kB)
  Prepa