In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Required Packages

In [2]:
!pip install torch_geometric



In [3]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATv2Conv, SAGEConv
import networkx as nx
import torch.nn as nn
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm as tqdm_notebook
from scipy.sparse import coo_matrix


# Preprocess data for GNN

In [4]:
df = pd.read_csv('/content/drive/Shareddrives/MLNS/final project/processed_data.csv')

Sampling 5% data

In [5]:
# Group by brand and sample 5% within each group
df = df.groupby('brand_cleaned', group_keys=False).apply(lambda x: x.sample(frac=0.05, random_state=42)).reset_index(drop=True)

  df = df.groupby('brand_cleaned', group_keys=False).apply(lambda x: x.sample(frac=0.05, random_state=42)).reset_index(drop=True)


Targeted Features

In [6]:
features = ["product_type", "product_gender_target",
            "product_material", "product_color",
            "brand_name", "product_condition", "product_like_count"]
target = "price_usd"

## Feature Processing

In [7]:
categorical_features = ["product_gender_target", "product_color",  "product_condition", "brand_cleaned", "material_cleaned", "broad_type"]
numerical_features = ["product_like_count"]

One-Hot Encoding for categorical data

In [8]:
encoder = OneHotEncoder(sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_features])
categorical_cols = encoder.get_feature_names_out(categorical_features)
categorical_df = pd.DataFrame(categorical_encoded, columns=categorical_cols)

Standaradization for numerical data

In [9]:
stdscaler = StandardScaler()
std_features = stdscaler.fit_transform(df[numerical_features])
numerical_df = pd.DataFrame(std_features, columns=numerical_features).reset_index(drop=True)

Concatenate numerical and categorical data

In [10]:
numerical_df_aligned = numerical_df.reset_index(drop=True)
categorical_df_aligned = categorical_df.reset_index(drop=True)
target_df = pd.DataFrame(df[target]).reset_index(drop=True)
df_finals = pd.concat([numerical_df_aligned, categorical_df_aligned, target_df], axis=1)

## Transfrom to Graph-Ready Data

In [11]:
df_finals.shape

(14637, 112)

In [12]:
# Convert input features and target
x = torch.from_numpy(df_finals.drop(columns=[target]).values).float() # convert to float tensor
y = torch.from_numpy(df_finals[target].values).float().view(-1, 1) # reshape to 2D tensor

num_nodes, row_indices, col_indices = len(df_finals), [], []


# Utility function to add edges from one-hot feature blocks
def add_edges_by_feature_block(column_block, max_nodes=1000):
    for col in column_block:
        indices = np.flatnonzero(df_finals[col].values)
        if len(indices) > max_nodes:
            indices = indices[:max_nodes]
        if len(indices) < 2:
            continue
        rows, cols = np.meshgrid(indices, indices)
        mask = rows != cols
        row_indices.extend(rows[mask].flatten())
        col_indices.extend(cols[mask].flatten())

# Collect one-hot encoded feature columns
material_cols = [col for col in df_finals.columns if col.startswith('material_cleaned_')]
category_cols = [col for col in df_finals.columns if col.startswith('product_type_cleaned_')]
brand_cols = [col for col in df_finals.columns if col.startswith('broad_type_')]

# Add edges for material, category, and brand similarity
for feature_group in [material_cols, category_cols, brand_cols]:
    add_edges_by_feature_block(feature_group)

# Add engagement-based edges using nearest neighbors on the number of likes
likes = df_finals[numerical_features].values.reshape(-1, 1)
nn_engagement = NearestNeighbors(n_neighbors=6, algorithm='auto', metric='euclidean')
nn_engagement.fit(likes)
_, indices = nn_engagement.kneighbors(likes)

row_indices.extend(np.repeat(np.arange(num_nodes), 5))  # skip self-loop
col_indices.extend(indices[:, 1:].flatten())

# Combine all into a sparse COO matrix to remove duplicates efficiently
edge_matrix = coo_matrix((np.ones(len(row_indices)), (row_indices, col_indices)), shape=(num_nodes, num_nodes))
edge_matrix.setdiag(0)  # remove self-loops if any
edge_matrix.eliminate_zeros()

# Convert to edge index for PyTorch Geometric
edge_index = torch.tensor(np.vstack((edge_matrix.row, edge_matrix.col)), dtype=torch.long)

# Create the PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, y=y)

## Dataset Split

In [13]:
num_nodes = data.num_nodes
indices = torch.randperm(num_nodes)
train_size = int(0.8 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[indices[:train_size]] = True
test_mask[indices[train_size:]] = True

data.train_mask = train_mask
data.test_mask = test_mask

## Model Strucuture

In [14]:
class GNNPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, model_type='GCN'):
        super(GNNPredictor, self).__init__()

        if model_type == 'GCN':
            self.conv1 = GCNConv(in_channels, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, out_channels)

        elif model_type == 'SAGE':
            self.conv1 = SAGEConv(in_channels, hidden_channels)
            self.conv2 = SAGEConv(hidden_channels, out_channels)

        elif model_type == 'GAT2':
            self.conv1 = GATv2Conv(in_channels, hidden_channels)
            self.conv2 = GATv2Conv(hidden_channels, out_channels)

        # Final regressor
        self.value_predict = nn.Sequential(
            nn.Linear(out_channels, hidden_channels),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_channels, 1)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        out = self.value_predict(x)
        return out

Model Training

In [15]:
def train_model(model, data, train_mask, optimizer, epochs=100):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index).squeeze()
        loss = F.mse_loss(out[train_mask], data.y[train_mask].squeeze())
        loss.backward()
        optimizer.step()
        if (epoch+1) % 20 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
    return model

Model Evaluation

In [16]:
def evaluate_model(model, data, test_mask):
    model.eval()
    with torch.no_grad():
        pred = model(data.x, data.edge_index).squeeze()
        true = data.y[test_mask].squeeze()
        pred = pred[test_mask]
        mse = F.mse_loss(pred, true).item()
        mae = F.l1_loss(pred, true).item()
        r2 = 1 - ((true - pred) ** 2).sum() / ((true - true.mean()) ** 2).sum()
        return {'mse': mse, 'mae': mae, 'r2': r2.item()}

## Experiment

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

model = GNNPredictor(
    in_channels=data.num_features,
    hidden_channels=64,
    out_channels=32,
    model_type='GCN'
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Train and evaluate
model = train_model(model, data, data.train_mask, optimizer, epochs=100)
results = evaluate_model(model, data, data.test_mask)

print("Test Results:")
print(f"MSE: {results['mse']:.4f}")
print(f"MAE: {results['mae']:.4f}")
print(f"R²: {results['r2']:.4f}")

Epoch 20, Loss: 1007951.8125
Epoch 40, Loss: 963432.8125
Epoch 60, Loss: 934986.1875
Epoch 80, Loss: 909563.8750
Epoch 100, Loss: 886319.5625
Test Results:
MSE: 644539.8125
MAE: 355.7698
R²: 0.0546


In [18]:
model = GNNPredictor(
    in_channels=data.num_features,
    hidden_channels=64,
    out_channels=32,
    model_type='SAGE'
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Train and evaluate
model = train_model(model, data, data.train_mask, optimizer, epochs=100)
results = evaluate_model(model, data, data.test_mask)

print("Test Results:")
print(f"MSE: {results['mse']:.4f}")
print(f"MAE: {results['mae']:.4f}")
print(f"R²: {results['r2']:.4f}")

Epoch 20, Loss: 973780.4375
Epoch 40, Loss: 879697.6875
Epoch 60, Loss: 794679.6250
Epoch 80, Loss: 690361.5000
Epoch 100, Loss: 625615.8750
Test Results:
MSE: 455572.8750
MAE: 283.8397
R²: 0.3318
