In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, auc, accuracy_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from torch.optim.lr_scheduler import StepLR, MultiplicativeLR, ExponentialLR
import torch.nn.functional as F
import os


### Separating Train and Test Data for complete isolation and avoid data leakage

In [23]:
path = r'C:\Users\Data_Science\Downloads\trend_pulse_repo\TrendPulse\data\clean\Combined_Amazon_Dataset_Cleaned.csv'
df = pd.read_csv(path)

df = df.rename(columns={'name': 'product'})

In [24]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['main_category'], random_state=42)
train_df.to_excel(r'C:\Users\Data_Science\Desktop\trend_pulse_data\Train_data.xlsx',  index=False)
test_df.to_excel(r'C:\Users\Data_Science\Desktop\trend_pulse_data\Test_data.xlsx', index= False)

In [25]:
train_df.shape

(29639, 8)

# --------------------------------------------------------------------

### Preprocessing the separated Training Data for training and validation

In [13]:
# read the separated training data into df

df = pd.read_excel(r'C:\Users\Data_Science\Desktop\trend_pulse_data\Train_data.xlsx')

In [14]:
# Step 1: Create manual mappings for categorical columns
# Mapping products
unique_products = df['product'].unique()
product_to_index = {product: idx for idx, product in enumerate(unique_products)}
df['encoded_product'] = df['product'].map(product_to_index)

# Mapping main categories
unique_main_categories = df['main_category'].unique()
main_category_to_index = {category: idx for idx, category in enumerate(unique_main_categories)}
df['encoded_main_category'] = df['main_category'].map(main_category_to_index)

# Mapping subcategories
unique_sub_categories = df['sub_category'].unique()
sub_category_to_index = {subcategory: idx for idx, subcategory in enumerate(unique_sub_categories)}
df['encoded_sub_category'] = df['sub_category'].map(sub_category_to_index)

# Create the int_rating column using a lambda function with three categories of ratings
df['int_rating'] = df['ratings'].apply(lambda x: 1 if x <= 2.5 else (2 if x <= 4.0 else 3))


# Convert prices from rupees to USD
df['actual_price'] = df['actual_price'] / 83
df['discount_price'] = df['discount_price'] / 83

In [15]:

# Optional: Process the 'link' column if necessary or drop it
# df = train_df.drop(columns=['link'])

# Step 2: Normalize numerical columns
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numerical features
df[['ratings', 'no_of_ratings', 'actual_price', 'discount_price']] = scaler.fit_transform(df[['ratings', 'no_of_ratings', 'actual_price', 'discount_price']])


In [16]:
# Features and target
X = df[['encoded_product', 'encoded_main_category', 'encoded_sub_category', 'ratings', 'no_of_ratings', 'actual_price', 'discount_price']]
y = df['int_rating']

# Stratified split to maintain the distribution of int_rating across train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Combine features and target back into DataFrames
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)


In [17]:
import torch

# Convert training data to PyTorch tensors
train_product_ids = torch.tensor(train_df['encoded_product'].values, dtype=torch.long)
train_category_ids = torch.tensor(train_df['encoded_main_category'].values, dtype=torch.long)
train_subcategory_ids = torch.tensor(train_df['encoded_sub_category'].values, dtype=torch.long)
train_ratings = torch.tensor(train_df['ratings'].values, dtype=torch.float32).view(-1, 1)
train_no_of_ratings = torch.tensor(train_df['no_of_ratings'].values, dtype=torch.float32).view(-1, 1)
train_actual_price = torch.tensor(train_df['actual_price'].values, dtype=torch.float32).view(-1, 1)
train_discount_price = torch.tensor(train_df['discount_price'].values, dtype=torch.float32).view(-1, 1)
train_labels = torch.tensor(train_df['int_rating'].values, dtype=torch.long)  # CrossEntropyLoss expects long dtype for labels

# Convert validation data to PyTorch tensors
val_product_ids = torch.tensor(val_df['encoded_product'].values, dtype=torch.long)
val_category_ids = torch.tensor(val_df['encoded_main_category'].values, dtype=torch.long)
val_subcategory_ids = torch.tensor(val_df['encoded_sub_category'].values, dtype=torch.long)
val_ratings = torch.tensor(val_df['ratings'].values, dtype=torch.float32).view(-1, 1)
val_no_of_ratings = torch.tensor(val_df['no_of_ratings'].values, dtype=torch.float32).view(-1, 1)
val_actual_price = torch.tensor(val_df['actual_price'].values, dtype=torch.float32).view(-1, 1)
val_discount_price = torch.tensor(val_df['discount_price'].values, dtype=torch.float32).view(-1, 1)
val_labels = torch.tensor(val_df['int_rating'].values, dtype=torch.long)


In [19]:
# Combine tensors into TensorDatasets
train_dataset = TensorDataset(train_product_ids, train_category_ids, train_subcategory_ids, train_ratings, train_no_of_ratings, train_actual_price, train_discount_price, train_labels)
val_dataset = TensorDataset(val_product_ids, val_category_ids, val_subcategory_ids, val_ratings, val_no_of_ratings, val_actual_price, val_discount_price, val_labels)

# Create DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [24]:
# Check the first batch from the training DataLoader
for batch in train_loader:
    print(batch)
    break

8


In [201]:
'''# Convert train_data to tensors
x_cat_train = torch.tensor(train_data[['main_category', 'sub_category']].values, dtype=torch.long)
# Split the tensor into separate tensors for each category
x_main_cat_train = x_cat_train[:, 0]  # Extract the first column
x_sub_cat_train = x_cat_train[:, 1]   # Extract the second column

# Combine them into a tuple
x_cat_train_tuple = (x_main_cat_train, x_sub_cat_train)

x_num_train = torch.tensor(train_data[['ratings', 'no_of_ratings', 'discount_price', 'actual_price']].values, dtype=torch.float32)
y_train = torch.tensor(train_data['encoded_product'].values, dtype=torch.long)

# Convert val_data to tensors
x_cat_val = torch.tensor(val_data[['main_category', 'sub_category']].values, dtype=torch.long)
# Split the tensor into separate tensors for each category
x_main_cat_val = x_cat_val[:, 0]  # Extract the first column
x_sub_cat_val = x_cat_val[:, 1]   # Extract the second column

# Combine them into a tuple
x_cat_val_tuple = (x_main_cat_val, x_sub_cat_val)
x_num_val = torch.tensor(val_data[['ratings', 'no_of_ratings', 'discount_price', 'actual_price']].values, dtype=torch.float32)
y_val = torch.tensor(val_data['encoded_product'].values, dtype=torch.long)'''

### Model and Dataset class Definition

In [26]:
class RecommendationModel(nn.Module):
    def __init__(self, num_products, num_categories, num_subcategories, embedding_dim=10):
        super(RecommendationModel, self).__init__()
        
        # Embedding layers for categorical features
        self.product_embedding = nn.Embedding(num_products, embedding_dim)
        self.category_embedding = nn.Embedding(num_categories, embedding_dim // 2)
        self.subcategory_embedding = nn.Embedding(num_subcategories, embedding_dim // 2)
        
        # Define fully connected layers
        self.fc1 = nn.Linear(embedding_dim + (embedding_dim // 2) * 2 + 4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 3)  # Output layer with 3 units for 3 classes

    def forward(self, product_id, category_id, subcategory_id, ratings, no_of_ratings, actual_price, discount_price):
        # Pass inputs through embeddings
        product_embedded = self.product_embedding(product_id)
        category_embedded = self.category_embedding(category_id)
        subcategory_embedded = self.subcategory_embedding(subcategory_id)
        
        # Flatten the embedding outputs
        product_embedded = product_embedded.view(-1, self.product_embedding.embedding_dim)
        category_embedded = category_embedded.view(-1, self.category_embedding.embedding_dim)
        subcategory_embedded = subcategory_embedded.view(-1, self.subcategory_embedding.embedding_dim)
        
        # Concatenate all features (embeddings + numerical features)
        concatenated = torch.cat((product_embedded, category_embedded, subcategory_embedded, ratings, no_of_ratings, actual_price, discount_price), dim=1)
        
        # Pass through fully connected layers
        x = torch.relu(self.fc1(concatenated))
        x = torch.relu(self.fc2(x))
        output = self.fc3(x)  # Output 3 units (for 3 classes) without activation
        
        return output

In [27]:
# Initialize metrics trackers
def compute_accuracy(predictions, labels):
    _, predicted = torch.max(predictions, 1)
    return accuracy_score(labels.cpu().numpy(), predicted.cpu().numpy())

def compute_auc(predictions, labels, num_classes):
    predictions_prob = F.softmax(predictions, dim=1).cpu().numpy()
    labels = labels.cpu().numpy()
    if num_classes == 2:
        return roc_auc_score(labels, predictions_prob[:, 1])  # For binary classification
    else:
        return roc_auc_score(labels, predictions_prob, multi_class='ovr')  # For multi-class classification
    
def logging(log_path, log_text):
    with open(log_path, 'a') as file:
        file.write(log_text + '\n')


In [28]:
main_dir = r'C:\Users\Data_Science\Desktop\trend_pulse_data'
experiment = 'Exp_1'
exp_dir = os.path.join(main_dir, experiment)
weights_dir = os.path.join(exp_dir, 'model_weights')
log_path = os.path.join(exp_dir, 'Training_log.txt')
os.makedirs(weights_dir, exist_ok=True)

### Creating Data Loaders

### Model initialization and Training parameters

In [29]:
# Initialize the model
num_products = len(df['encoded_product'].unique())
num_categories = len(df['encoded_main_category'].unique())
num_subcategories = len(df['encoded_sub_category'].unique())

model = RecommendationModel(num_products, num_categories, num_subcategories)

# Define loss function
criterion = nn.CrossEntropyLoss()  # Since it's a multi-class classification problem

# Define Optimizer and Scheduler and Learning Rate

num_epochs = 50
initial_lr = 0.001
exponent = 0.96
optimizer = optim.Adam(model.parameters(), lr=initial_lr, weight_decay=3e-3)
scheduler = ExponentialLR(optimizer, gamma=exponent)

In [30]:
print(model)

RecommendationModel(
  (product_embedding): Embedding(28588, 10)
  (category_embedding): Embedding(10, 5)
  (subcategory_embedding): Embedding(12, 5)
  (fc1): Linear(in_features=24, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=3, bias=True)
)


### Training and Validation Loop

In [198]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Move model to the appropriate device
model = model.to(device)

# Initialize best validation loss
best_val_loss = float('inf')

# Initialize lists to store predictions and labels
train_preds_list = []
train_labels_list = []
val_preds_list = []
val_labels_list = []

for epoch in range(num_epochs):
    print(f'Epoch: {epoch + 1}/{num_epochs}')
    logging(log_path, f'Epoch: {epoch + 1}/{num_epochs}')
    
    start_time = time.time()  # Start epoch timer
    
    model.train()  # Set model to training mode
    running_loss = 0.0
    train_predictions = []
    train_labels = []
    
    # Training loop
    for (x_cat, x_num), labels in train_loader:
        # Move data to the appropriate device
        x_cat = x_cat.to(device)
        x_num = x_num.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()  # Clear gradients
        
        outputs = model(x_cat, x_num)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        
        running_loss += loss.item()
        train_predictions.append(outputs.cpu())
        train_labels.append(labels.cpu())
    
    # Calculate training metrics
    train_predictions = torch.cat(train_predictions)
    train_labels = torch.cat(train_labels)
    train_loss = running_loss / len(train_loader)
    train_accuracy = compute_accuracy(train_predictions, train_labels)
    train_auc = compute_auc(train_predictions, train_labels, num_products)
    
    # Save training predictions and labels
    train_preds_list.append(train_predictions.numpy())
    train_labels_list.append(train_labels.numpy())
    
    # Validation loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    val_predictions = []
    val_labels = []
    with torch.no_grad():  # Disable gradient computation
        for (x_cat, x_num), labels in val_loader:
            # Move data to the appropriate device
            x_cat = x_cat.to(device)
            x_num = x_num.to(device)
            labels = labels.to(device)
            
            outputs = model(x_cat, x_num)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_predictions.append(outputs.cpu())
            val_labels.append(labels.cpu())
    
    # Calculate validation metrics
    val_predictions = torch.cat(val_predictions)
    val_labels = torch.cat(val_labels)
    val_loss = val_loss / len(val_loader)
    val_accuracy = compute_accuracy(val_predictions, val_labels)
    val_auc = compute_auc(val_predictions, val_labels, num_products)
    
    # Save validation predictions and labels
    val_preds_list.append(val_predictions.numpy())
    val_labels_list.append(val_labels.numpy())
    
    # Calculate epoch time
    epoch_time = time.time() - start_time
    
    # Learning rate
    lr = optimizer.param_groups[0]['lr']
    
    # Print metrics
    print(f'Epoch [{epoch+1}/{num_epochs}] | '
          f'Train Loss: {train_loss:.4f} | '
          f'Val Loss: {val_loss:.4f} | '
          f'Train Accuracy: {train_accuracy:.4f} | '
          f'Val Accuracy: {val_accuracy:.4f} | '
          f'Train AUC: {train_auc:.4f} | '
          f'Val AUC: {val_auc:.4f} | '
          f'Epoch Time: {epoch_time:.2f}s | '
          f'Learning Rate: {lr:.6f}')
    
    logging(log_path,
            f'Epoch [{epoch+1}/{num_epochs}] | '
          f'Train Loss: {train_loss:.4f} | '
          f'Val Loss: {val_loss:.4f} | '
          f'Train Accuracy: {train_accuracy:.4f} | '
          f'Val Accuracy: {val_accuracy:.4f} | '
          f'Train AUC: {train_auc:.4f} | '
          f'Val AUC: {val_auc:.4f} | '
          f'Epoch Time: {epoch_time:.2f}s | '
          f'Learning Rate: {lr:.6f}')
    
    # Save the best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f'\nSaved new best model with Train_AUC: {train_auc:.3f} --- Val_AUC: {val_auc:.3f}\n')
        logging(exp_dir, f'\nSaved new best model with Train_AUC: {train_auc:.3f} --- Val_AUC: {val_auc:.3f}\n')

# After training, save all predictions and labels
import numpy as np
import pandas as pd

# Convert lists to numpy arrays
train_preds_array = np.concatenate(train_preds_list, axis=0)
train_labels_array = np.concatenate(train_labels_list, axis=0)
val_preds_array = np.concatenate(val_preds_list, axis=0)
val_labels_array = np.concatenate(val_labels_list, axis=0)

# Create dataframes
train_prob_df = pd.DataFrame({'predictions': train_preds_array.flatten(), 'labels': train_labels_array.flatten()})
val_prob_df = pd.DataFrame({'predictions': val_preds_array.flatten(), 'labels': val_labels_array.flatten()})

# Save dataframes to excel
train_prob_df.to_excel(os.path.join(exp_dir, 'train_data.xlsx'), index=False)
val_prob_df.to_excel(os.path.join(exp_dir,'val_data.xlsx'), index=False)


Using device: cpu
Epoch: 1/50


ValueError: too many values to unpack (expected 2)