In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ssl-matrix-file/padded_matrix_file.csv


In [7]:
def sample_triplets(data, labels):
    anchor_indices = torch.arange(len(data))
    positive_indices = torch.empty(len(data), dtype=torch.long)
    negative_indices = torch.empty(len(data), dtype=torch.long)

    for i, label in enumerate(labels):
        # Sample positive instances
        positive_candidates = (labels == label).nonzero(as_tuple=False).view(-1)
        positive_candidates = positive_candidates[positive_candidates != i]
        
        if len(positive_candidates) > 0:
            positive_indices[i] = positive_candidates[torch.randint(0, len(positive_candidates), (1,))]
        else:
            positive_indices[i] = i  # Set the positive index as itself if there are no other instances with the same label
#         print('positive',positive_indices)
        # Sample negative instances
        negative_candidates = (labels != label).nonzero(as_tuple=False).view(-1)
        negative_indices[i] = negative_candidates[torch.randint(0, len(negative_candidates), (1,))]
#         print('positive',negative_indices)
        
    anchor = data[anchor_indices]
    positive = data[positive_indices]
    negative = data[negative_indices]
    return anchor, positive, negative


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import csv
import ast
import torch.nn.functional as F

In [9]:
# Your code for loading the data (unchanged)
input_file = "/kaggle/input/ssl-matrix-file/padded_matrix_file.csv"

values = []
matrix_labels = []
num_rows = 0

with open(input_file, "r") as f_input:
    reader = csv.reader(f_input)
    for row in reader:
        row_values = []
        for i in range(len(row) - 1):
            column_value = ast.literal_eval(row[i])
            row_values.append(column_value)
        values.append(torch.tensor(row_values))
        matrix_labels.append(ast.literal_eval(row[-1]))
        num_rows += 1

# matrix_labels = [label - 1 for label in matrix_labels]

In [10]:
# Define the dataset class
class SkeletonDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [11]:
# Define the transformer model with positional encoding without the classifier layer
class TransformerEncoder(nn.Module):
    def __init__(self, n_features, d_model=64, nhead=16, num_layers=2):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Linear(n_features, d_model)
        self.positional_encoding = self.generate_positional_encoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead), num_layers
        )

    def generate_positional_encoding(self, d_model, max_len=204):
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, x):
        x = self.embedding(x)
        x = x + self.positional_encoding[:, : x.size(1)]
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        return x


In [12]:
# Triplet loss function
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        pos_distance = F.pairwise_distance(anchor,positive)
        neg_distance = F.pairwise_distance(anchor, negative)
        loss = F.relu(pos_distance - neg_distance + self.margin)
        return loss.mean()

In [None]:
# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(
    torch.stack(values), torch.tensor(matrix_labels), test_size=0.2, random_state=42, stratify=torch.tensor(matrix_labels)
)

# Create train and test datasets
train_dataset = SkeletonDataset(train_data, train_labels)
test_dataset = SkeletonDataset(test_data, test_labels)

# Create data loaders for train and test sets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model = TransformerEncoder(114)
criterion = TripletLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch_idx, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        # Sample triplets from the data
        anchor, positive, negative = sample_triplets(data, labels)
        # Get embeddings for anchor, positive, and negative instances
        anchor_embeddings = model(anchor)
        positive_embeddings = model(positive)
        negative_embeddings = model(negative)
        loss = criterion(anchor_embeddings, positive_embeddings, negative_embeddings)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")



Epoch 1/20, Loss: 0.4045
Epoch 2/20, Loss: 0.4018
Epoch 3/20, Loss: 0.3056
Epoch 4/20, Loss: 0.2913
Epoch 5/20, Loss: 0.2383
Epoch 6/20, Loss: 0.1339
Epoch 7/20, Loss: 0.0946
Epoch 8/20, Loss: 0.2481
Epoch 9/20, Loss: 0.1709
Epoch 10/20, Loss: 0.1969
Epoch 11/20, Loss: 0.2122
Epoch 12/20, Loss: 0.2710
Epoch 13/20, Loss: 0.1091
Epoch 14/20, Loss: 0.1606


In [None]:
import torch
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Compute embeddings for train and test data
train_embeddings = []
train_labels_list = []

model.eval()
with torch.no_grad():
    for data, labels in train_loader:
        embeddings = model(data)
        train_embeddings.append(embeddings)
        train_labels_list.append(labels)

train_embeddings = torch.cat(train_embeddings).numpy()
train_labels_list = torch.cat(train_labels_list).numpy()

test_embeddings = []
test_labels_list = []

with torch.no_grad():
    for data, labels in test_loader:
        embeddings = model(data)
        test_embeddings.append(embeddings)
        test_labels_list.append(labels)

test_embeddings = torch.cat(test_embeddings).numpy()
test_labels_list = torch.cat(test_labels_list).numpy()

# Train a k-NN classifier on the train embeddings and evaluate on test embeddings
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_embeddings, train_labels_list)

test_predictions = knn.predict(test_embeddings)
accuracy = accuracy_score(test_labels_list, test_predictions)

print(num_epochs)
print("Accuracy:", accuracy)


In [None]:
print(len(train_embeddings[0]))