In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [381]:
import os
import math
import pandas as pd
import numpy as np
import tempfile
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import Adam

In [382]:
data = pd.read_csv('train.csv')
validation_data = pd.read_csv('dev.csv')

In [383]:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(dataframe, model):
    premise_embeddings = model.encode(dataframe['premise'].tolist(), show_progress_bar=True)
    hypothesis_embeddings = model.encode(dataframe['hypothesis'].tolist(), show_progress_bar=True)
    embeddings = np.concatenate((premise_embeddings, hypothesis_embeddings), axis=1)
    return torch.tensor(embeddings, dtype=torch.float32)

In [384]:
train_embeddings = generate_embeddings(data, sentence_model)
validation_embeddings = generate_embeddings(validation_data, sentence_model)

train_labels = torch.tensor(data['label'].values, dtype=torch.float32)
validation_labels = torch.tensor(validation_data['label'].values, dtype=torch.float32)

Batches:   0%|          | 0/842 [00:00<?, ?it/s]

Batches:   0%|          | 0/842 [00:00<?, ?it/s]

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Batches:   0%|          | 0/211 [00:00<?, ?it/s]

In [385]:
train_embeddings = train_embeddings.view(train_embeddings.size(0), 1, -1)
validation_embeddings = validation_embeddings.view(validation_embeddings.size(0), 1, -1)

In [386]:
class AttentionLayer(nn.Module):
    def __init__(self, input_dim):
        super(AttentionLayer, self).__init__()
        self.weight = nn.Parameter(torch.randn(input_dim, 1))
        self.bias = nn.Parameter(torch.randn(1))

    def forward(self, x):
        e = torch.tanh(torch.matmul(x, self.weight) + self.bias)
        a = torch.softmax(e, dim=1)
        output = torch.sum(x * a, dim=1)
        return output, a

In [387]:
class BiGRUAttentionModel(nn.Module):
    def __init__(self, embedding_dim):
        super(BiGRUAttentionModel, self).__init__()
        self.bi_gru = nn.GRU(embedding_dim, 64, bidirectional=True, batch_first=True)
        self.attention = AttentionLayer(128)
        self.norm1 = nn.LayerNorm(128)
        self.fc1 = nn.Linear(128, 64)
        self.norm2 = nn.LayerNorm(64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x, _ = self.bi_gru(x)
        x, attn_weights = self.attention(x)
        x = self.norm1(x)
        x = self.fc1(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [388]:
embedding_dim = sentence_model.get_sentence_embedding_dimension() * 2
print(embedding_dim)
model = BiGRUAttentionModel(embedding_dim)

768


In [389]:
def train_model(model, train_loader, val_loader, device, save_path):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=0.0005)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3)

    best_val_accuracy = 0

    for epoch in range(15):
        model.train()
        total_loss, total_accuracy = 0, 0

        # Training loop
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predictions = torch.sigmoid(outputs).round()
            total_accuracy += torch.sum(predictions.squeeze() == labels).item()

        avg_train_loss = total_loss / len(train_loader.dataset)
        avg_train_accuracy = total_accuracy / len(train_loader.dataset)

        # Validation loop
        model.eval()
        val_loss, val_accuracy = 0, 0
        with torch.no_grad():
            for data, labels in val_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data)
                loss = criterion(outputs.squeeze(), labels)
                val_loss += loss.item()
                predictions = torch.sigmoid(outputs).round()
                val_accuracy += torch.sum(predictions.squeeze() == labels).item()

        avg_val_loss = val_loss / len(val_loader.dataset)
        avg_val_accuracy = val_accuracy / len(val_loader.dataset)

        print(f'Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_accuracy:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {avg_val_accuracy:.4f}')

        # Learning rate scheduler step
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current learning rate: {current_lr:.7f}")

        # Save the best model if the validation accuracy is the highest we've seen so far.
        if avg_val_accuracy > best_val_accuracy:
            best_val_accuracy = avg_val_accuracy
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model2.pth'))
            print("Saved new best model")

In [390]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BiGRUAttentionModel(
  (bi_gru): GRU(768, 64, batch_first=True, bidirectional=True)
  (attention): AttentionLayer()
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

In [391]:
train_dataset = TensorDataset(train_embeddings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(validation_embeddings, validation_labels)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [392]:
for data, labels in train_loader:
    print(data.shape)  # Should print something like (batch_size, seq_len, embedding_dim)
    break

torch.Size([32, 1, 768])


In [393]:
# Directory where the model checkpoints will be saved
save_directory = './model_checkpoints'
os.makedirs(save_directory, exist_ok=True)

train_model(model, train_loader, val_loader, device, save_directory)

Epoch 1, Train Loss: 0.0192, Train Acc: 0.6532, Val Loss: 0.0185, Val Acc: 0.6731
Current learning rate: 0.0005000
Saved new best model
Epoch 2, Train Loss: 0.0180, Train Acc: 0.6922, Val Loss: 0.0184, Val Acc: 0.6755
Current learning rate: 0.0005000
Saved new best model
Epoch 3, Train Loss: 0.0173, Train Acc: 0.7127, Val Loss: 0.0180, Val Acc: 0.6972
Current learning rate: 0.0005000
Saved new best model
Epoch 4, Train Loss: 0.0168, Train Acc: 0.7262, Val Loss: 0.0180, Val Acc: 0.6969
Current learning rate: 0.0005000
Epoch 5, Train Loss: 0.0163, Train Acc: 0.7398, Val Loss: 0.0181, Val Acc: 0.7009
Current learning rate: 0.0005000
Saved new best model
Epoch 6, Train Loss: 0.0159, Train Acc: 0.7465, Val Loss: 0.0179, Val Acc: 0.7067
Current learning rate: 0.0005000
Saved new best model
Epoch 7, Train Loss: 0.0155, Train Acc: 0.7571, Val Loss: 0.0180, Val Acc: 0.7043
Current learning rate: 0.0005000
Epoch 8, Train Loss: 0.0151, Train Acc: 0.7686, Val Loss: 0.0182, Val Acc: 0.7091
Current 

In [400]:
# Load the best model
model_path = './model_checkpoints/best_model2.pth'
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [401]:
model.eval()

BiGRUAttentionModel(
  (bi_gru): GRU(768, 64, batch_first=True, bidirectional=True)
  (attention): AttentionLayer()
  (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.4, inplace=False)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

In [402]:
with torch.no_grad():
    validation_embeddings = validation_embeddings.to(device)
    # Predict
    outputs = model(validation_embeddings)
    predictions = torch.sigmoid(outputs).round().cpu().numpy().astype(int)

In [403]:
# Create a DataFrame with predictions
prediction_df = pd.DataFrame(predictions, columns=['prediction'])
prediction_df

Unnamed: 0,prediction
0,1
1,0
2,1
3,0
4,1
...,...
6732,0
6733,1
6734,1
6735,0


In [404]:
# Save the DataFrame as a CSV
prediction_csv_path = 'predictions_gru1.csv'
prediction_df.to_csv(prediction_csv_path, index=False)

In [405]:
true_labels = validation_data['label'].values
predicted_labels = predictions.squeeze()

# Calculate metrics
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(true_labels, predicted_labels)
report = classification_report(true_labels, predicted_labels)

# Print out the metrics
print(f'Accuracy: {accuracy}')
print(report)

Accuracy: 0.7090693186878433
              precision    recall  f1-score   support

           0       0.73      0.64      0.68      3259
           1       0.70      0.77      0.73      3478

    accuracy                           0.71      6737
   macro avg       0.71      0.71      0.71      6737
weighted avg       0.71      0.71      0.71      6737

