In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader, Subset
import numpy as np

# Load the Excel file and check for NaN values
file_path = 'train_data.xlsx'
df = pd.read_excel(file_path)
df = df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
df['Label_Sentiment'] = df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx, 1]
        tokens = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Define the dataset
dataset = TextDataset(dataframe=df)

# Define the Text-Only Model with DistilBERT
class TextOnlyModel(nn.Module):
    def __init__(self):
        super(TextOnlyModel, self).__init__()
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        text_outputs = self.dropout(text_outputs)
        logits = self.classifier(text_outputs)
        return logits

# Initialize model, loss, and optimizer
model = TextOnlyModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, test_idx) in enumerate(skf.split(dataset, df['Label_Sentiment'])):
    print(f'Fold {fold + 1}')
    
    train_subsampler = Subset(dataset, train_idx)
    test_subsampler = Subset(dataset, test_idx)
    
    train_dataloader = DataLoader(train_subsampler, batch_size=16, shuffle=True)
    test_dataloader = DataLoader(test_subsampler, batch_size=16, shuffle=False)
    
    # Training loop with early stopping
    model.train()
    for epoch in range(10):  
        epoch_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            loss = criterion(outputs, batch['label'])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation step
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in test_dataloader:
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
                loss = criterion(outputs, batch['label'])
                val_loss += loss.item()
        
        print(f'Epoch {epoch + 1} - Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Evaluation
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            _, preds = torch.max(outputs, 1)
            all_labels.extend(batch['label'].numpy())
            all_preds.extend(preds.numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, zero_division=1)
    precision = precision_score(all_labels, all_preds, zero_division=1)
    recall = recall_score(all_labels, all_preds, zero_division=1)
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    fold_results.append({
        'fold': fold + 1,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_matrix
    })
    print(f'Fold {fold + 1} - Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}')
    print(f'Confusion Matrix:\n{conf_matrix}')

# Average results across folds
avg_accuracy = np.mean([result['accuracy'] for result in fold_results])
avg_f1 = np.mean([result['f1'] for result in fold_results])
avg_precision = np.mean([result['precision'] for result in fold_results])
avg_recall = np.mean([result['recall'] for result in fold_results])

print(f'Average Accuracy: {avg_accuracy}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')

  from .autonotebook import tqdm as notebook_tqdm


Fold 1
Epoch 1 - Train Loss: 225.4549, Val Loss: 35.0897
Epoch 2 - Train Loss: 73.0624, Val Loss: 17.0721
Epoch 3 - Train Loss: 19.2998, Val Loss: 20.4529
Epoch 4 - Train Loss: 11.4378, Val Loss: 11.2129
Epoch 5 - Train Loss: 10.9979, Val Loss: 18.3669
Epoch 6 - Train Loss: 10.3055, Val Loss: 13.2056
Epoch 7 - Train Loss: 6.9436, Val Loss: 15.8898
Epoch 8 - Train Loss: 3.4407, Val Loss: 17.8869
Epoch 9 - Train Loss: 9.8899, Val Loss: 14.1283
Epoch 10 - Train Loss: 11.5444, Val Loss: 27.8308
Fold 1 - Accuracy: 0.9490806223479491, F1: 0.9475982532751092, Precision: 0.970193740685544, Recall: 0.9260312944523471
Confusion Matrix:
[[691  20]
 [ 52 651]]
Fold 2
Epoch 1 - Train Loss: 27.0490, Val Loss: 1.2518
Epoch 2 - Train Loss: 7.5607, Val Loss: 0.9185
Epoch 3 - Train Loss: 3.5815, Val Loss: 0.9655
Epoch 4 - Train Loss: 2.0134, Val Loss: 0.9395
Epoch 5 - Train Loss: 1.4819, Val Loss: 0.8728
Epoch 6 - Train Loss: 1.5862, Val Loss: 0.8780
Epoch 7 - Train Loss: 1.1613, Val Loss: 1.1482
Epoch 

In [3]:
torch.save(model, 'text_only_model.pth')  # Save the entire model


In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Load the test data
test_file_path = 'test_data.xlsx'
test_df = pd.read_excel(test_file_path)
test_df = test_df.dropna(subset=['Label_Sentiment'])  # Remove rows with NaN in 'Label_Sentiment'
test_df['Label_Sentiment'] = test_df['Label_Sentiment'].astype(int)  # Ensure Label_Sentiment is integer type

class TestTextDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx, 1]
        tokens = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        label = torch.tensor(self.dataframe.iloc[idx, 2], dtype=torch.long)
        
        sample = {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': label
        }
        return sample

# Define the test dataset
test_dataset = TestTextDataset(dataframe=test_df)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the model architecture
class TextOnlyModel(torch.nn.Module):
    def __init__(self):
        super(TextOnlyModel, self).__init__()
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        text_outputs = self.dropout(text_outputs)
        logits = self.classifier(text_outputs)
        return logits

# Load the saved model
model_path = 'text_only_model.pth'  # Update with the path to your saved model
try:
    # Attempt to load the entire model
    model = torch.load(model_path)
    model.eval()
except TypeError:
    # If only state_dict was saved, reinitialize the model and load state_dict
    model = TextOnlyModel()
    model.load_state_dict(torch.load(model_path))
    model.eval()

# Evaluation on the test data
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        _, preds = torch.max(outputs, 1)
        all_labels.extend(batch['label'].numpy())
        all_preds.extend(preds.numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, zero_division=1)
precision = precision_score(all_labels, all_preds, zero_division=1)
recall = recall_score(all_labels, all_preds, zero_division=1)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

# Print metrics
print(f'Test Accuracy: {accuracy}')
print(f'Test F1 Score: {f1}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')


  model = torch.load(model_path)


Test Accuracy: 0.4751602564102564
Test F1 Score: 0.06827880512091038
Test Precision: 0.39344262295081966
Test Recall: 0.037383177570093455
Confusion Matrix:
[[569  37]
 [618  24]]
