In [16]:
pip install torch torchvision

Collecting torch
  Downloading torch-2.7.0-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting torchvision
  Downloading torchvision-0.22.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting setuptools (from torch)
  Downloading setuptools-80.3.1-py3-none-any.whl.metadata (6.5 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading torch-2.7.0-cp313-none-macosx_11_0_arm64.whl (68

In [8]:
import pandas as pd 
import numpy as np 
import sys
import os
from sklearn.cluster import KMeans

In [9]:
sys.path.append("../scripts/")

In [22]:
import config
from training import create_groupkfolds
from feature_creation import *
import joblib

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score, log_loss
)
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# Define your neural network
class BinaryClassificationNN(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassificationNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


# 0-1 loss function
def zero_one_loss(y_true, y_pred):
    incorrect = (y_true != y_pred).sum()
    return incorrect / len(y_true)


# Training and validation function
def train_and_validate(X_train, y_train, X_val, y_val, epochs=20, batch_size=32, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = X_train.shape[1]

    model = BinaryClassificationNN(input_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                  torch.tensor(y_train, dtype=torch.float32))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                torch.tensor(y_val, dtype=torch.float32))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device).unsqueeze(1)
            preds = model(xb)
            loss = criterion(preds, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        val_inputs = torch.tensor(X_val, dtype=torch.float32).to(device)
        val_labels = torch.tensor(y_val, dtype=torch.float32).to(device).unsqueeze(1)
        val_preds = model(val_inputs).cpu().numpy().flatten()
        val_preds_labels = (val_preds >= 0.5).astype(int)

    y_true = y_val
    y_pred = val_preds_labels
    y_proba = val_predsx

    print("\nValidation Metrics:")
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_true, y_proba))
    print("Log Loss:", log_loss(y_true, y_proba))
    print("Zero-One Loss:", zero_one_loss(y_true, y_pred))

    return model


In [20]:
df = pd.read_csv(config.TRAINING_DATA_PATH)
n_folds = 5
df = create_groupkfolds(df, n_folds, 'sentence')

In [24]:
fold = 0
model = 'nn'
df_train = df[df.kfold!=fold].reset_index(drop=True)
df_valid = df[df.kfold==fold].reset_index(drop=True)

if not os.path.exists(f'../resources/{model}'):
        os.makedirs(f'../resources/{model}')

feature_pipeline = Pipeline(steps=[
                ("Language Features",LanguageFeature()),
                ("Graph Features",GraphFeatures()),
                ("Node Features",NodeFeatures()),
                ("Dataset Creation",FormatDataFrame()),
                ("Language One Hot Encoding",LanguageOHE(enc_lan=f"{model}/lan_encoder_{model}_{fold}.pkl",\
                                                         enc_lan_family=f"{model}/lan_family_encoder_{model}_{fold}.pkl"))
            ])

train_data = feature_pipeline.fit_transform(df_train) 
valid_data = feature_pipeline.transform(df_valid)


x_train_data = train_data.drop(columns=config.TRAIN_DROP_COLS)
y_train_data = train_data.is_root.values


x_valid_data = valid_data.drop(columns=config.TRAIN_DROP_COLS)
y_valid_data = valid_data.is_root.values

scaler = MinMaxScaler()
x_train_data = scaler.fit_transform(x_train_data)
x_valid_data = scaler.transform(x_valid_data)
joblib.dump(scaler,os.path.join(config.ONE_HOT_ENCODER_LANGUAGE,f'{model}/scaler_{model}_{fold}.pkl'))

Langauge Feature Started
Langauge Feature Ended
Graph Features Creation Started
Graph Feature Creation Ended
Node Features Creation Started
Node Features Creation Ended
DataFrame Creation Started
DataFrame Creation Ended!!
One Hot Encoding Started
One Hot Encoding created and Saved
Langauge Feature Started
Langauge Feature Ended
Graph Features Creation Started
Graph Feature Creation Ended
Node Features Creation Started
Node Features Creation Ended
DataFrame Creation Started
DataFrame Creation Ended!!
One Hot Encoding Started
One Hot Encoding created and Saved




['../resources/nn/scaler_nn_0.pkl']

In [27]:
train_and_validate(x_train_data,y_train_data,x_valid_data,y_valid_data,epochs=600)

Epoch [1/600] Loss: 0.1833
Epoch [2/600] Loss: 0.0869
Epoch [3/600] Loss: 0.2973
Epoch [4/600] Loss: 0.1197
Epoch [5/600] Loss: 0.1012
Epoch [6/600] Loss: 0.0237
Epoch [7/600] Loss: 0.3316
Epoch [8/600] Loss: 0.7548
Epoch [9/600] Loss: 0.0230
Epoch [10/600] Loss: 0.0017
Epoch [11/600] Loss: 0.1821
Epoch [12/600] Loss: 0.0163
Epoch [13/600] Loss: 0.0339
Epoch [14/600] Loss: 0.0110
Epoch [15/600] Loss: 0.0205
Epoch [16/600] Loss: 0.2674
Epoch [17/600] Loss: 1.0802
Epoch [18/600] Loss: 0.0090
Epoch [19/600] Loss: 0.1162
Epoch [20/600] Loss: 0.1399
Epoch [21/600] Loss: 0.0308
Epoch [22/600] Loss: 0.0695
Epoch [23/600] Loss: 0.0548
Epoch [24/600] Loss: 0.1492
Epoch [25/600] Loss: 0.0430
Epoch [26/600] Loss: 0.3932
Epoch [27/600] Loss: 0.1012
Epoch [28/600] Loss: 0.1347
Epoch [29/600] Loss: 1.5217
Epoch [30/600] Loss: 0.0063
Epoch [31/600] Loss: 0.0124
Epoch [32/600] Loss: 0.0299
Epoch [33/600] Loss: 0.0274
Epoch [34/600] Loss: 0.0095
Epoch [35/600] Loss: 0.0484
Epoch [36/600] Loss: 0.0596
E

BinaryClassificationNN(
  (model): Sequential(
    (0): Linear(in_features=49, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): Sigmoid()
  )
)