In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [2]:
# Read the competition CSVs. Column headers are whitespace-stripped right
# away so later lookups by exact column name are reliable.
train = pd.read_csv(
    "/kaggle/input/signal-cluster-classification-dataset/train.csv"
).rename(columns=str.strip)
test = pd.read_csv(
    "/kaggle/input/signal-cluster-classification-dataset/test.csv"
).rename(columns=str.strip)

# Preview the first training rows
train.head()


Unnamed: 0,sample_id,signal_strength,response_level,category
0,901,722.566585,153.933763,Group_C
1,1799,210.432525,454.613761,Group_B
2,1129,152.569777,431.993189,Group_B
3,965,670.294068,-6.55972,Group_C
4,395,49.418875,444.775273,Group_B


In [3]:
# Feature columns used for both the training and the test matrix.
FEATURE_COLS = ['signal_strength', 'response_level']

X_raw = train[FEATURE_COLS].values

# Encode string labels as integer class ids (le.inverse_transform maps back).
le = LabelEncoder()
y = le.fit_transform(train['category'].values)

# Train/validation split on the RAW features. Splitting before scaling keeps
# validation rows out of the scaler statistics (no preprocessing leakage).
X_train, X_val, y_train, y_val = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: fit on the training split only, then apply the same
# transform everywhere else. Plain ndarrays are passed throughout so the
# scaler is never fitted and applied with differing feature-name metadata.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test[FEATURE_COLS].values)

# Standardized copy of the full training matrix, kept under its original name.
X = scaler.transform(X_raw)




In [4]:
class SignalDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional integer labels.

    Args:
        X: array-like of features; converted to a float32 tensor.
        y: optional array-like of class ids; converted to an int64 tensor.
            When omitted, the dataset yields features only.

    Raises:
        ValueError: if ``y`` is given and its length differs from ``X``.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None
        if y is not None:
            self.y = torch.tensor(y, dtype=torch.long)
            # Fail at construction instead of with an IndexError mid-epoch.
            if len(self.y) != len(self.X):
                raise ValueError(
                    f"X and y must have the same length, "
                    f"got {len(self.X)} and {len(self.y)}"
                )

    def __len__(self):
        """Number of samples (rows of X)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels exist, else ``features``."""
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

BATCH_SIZE = 32

train_ds = SignalDataset(X_train, y_train)
val_ds   = SignalDataset(X_val, y_val)
test_ds  = SignalDataset(X_test)

# Net contains BatchNorm1d layers, which raise in training mode when a batch
# holds a single sample. Drop the trailing batch only when it would be such a
# singleton; otherwise every training row is still used each epoch.
train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=(len(train_ds) % BATCH_SIZE == 1),
)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)


In [5]:
class Net(nn.Module):
    """Small MLP classifier.

    Two hidden blocks of Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
    a linear layer that emits one logit per class.

    Args:
        n_features: width of the input vectors (default 2).
        n_classes: number of output logits. When None, it is inferred from the
            notebook-level encoded label vector ``y`` so ``Net()`` keeps working.
        hidden: width of both hidden layers (default 64).
        dropout: dropout probability after each hidden block (default 0.2).
    """

    def __init__(self, n_features=2, n_classes=None, hidden=64, dropout=0.2):
        super().__init__()
        if n_classes is None:
            # Backward-compatible fallback: relies on the global `y`.
            n_classes = len(np.unique(y))
        self.layers = nn.Sequential(
            nn.Linear(n_features, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden, n_classes)
        )

    def forward(self, x):
        """Return raw class logits for a batch ``x``."""
        return self.layers(x)


In [6]:
def train_model(seed, epochs=50, lr=0.003):
    """Train one `Net` on `train_dl`, monitoring macro-F1 on `val_dl`.

    Args:
        seed: value passed to ``torch.manual_seed`` before the model is built.
        epochs: number of passes over ``train_dl`` (default 50).
        lr: initial Adam learning rate (default 0.003).

    Returns:
        The model as it stands after the final epoch (left in eval mode
        whenever at least one epoch ran).
    """
    torch.manual_seed(seed)
    model = Net()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    # `verbose` is intentionally not passed: it is deprecated in PyTorch and
    # may be rejected by newer releases; silent operation is the default.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=5
    )

    for epoch in range(epochs):
        # --- optimisation pass ---
        model.train()
        for xb, yb in train_dl:
            optimizer.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            optimizer.step()

        # --- validation macro-F1; drives the plateau scheduler (mode='max') ---
        model.eval()
        preds, labels = [], []
        with torch.no_grad():
            for xb, yb in val_dl:
                out = model(xb)
                preds.extend(out.argmax(1).cpu().tolist())
                labels.extend(yb.cpu().tolist())
        score = f1_score(labels, preds, average='macro')
        scheduler.step(score)
    return model


In [7]:
# One independently initialised model per seed; predictions are averaged later.
seeds = [42, 100, 2025]
models = list(map(train_model, seeds))




In [8]:
# Convert the test matrix once; every ensemble member scores the same tensor.
test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Per-model class probabilities
probs = []
with torch.no_grad():
    for m in models:
        m.eval()
        logits = m(test_tensor)
        probs.append(F.softmax(logits, dim=1).numpy())

# Soft-voting: average the probabilities, then pick the most likely class and
# map the integer ids back to the original label strings.
avg_probs = np.stack(probs).mean(axis=0)
final_preds = le.inverse_transform(avg_probs.argmax(axis=1))

# NOTE(review): the training label column is 'category' but the submission
# column is 'personality_cluster' — confirm against the sample submission.
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'personality_cluster': final_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,sample_id,personality_cluster
0,1369,Group_C
1,66,Group_C
2,701,Group_B
3,939,Group_A
4,1622,Group_C
