In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torchvision.transforms as transforms
import torchvision.models as models

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
# Kaggle input locations: the folder holding the images and the CSV that
# lists them (columns read below: 'image_path' and 'label 2').
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

# Keep only the rows whose image file exists under image_dir, and shift the
# 'label 2' value down by one (the result is used as the class index later).
# The two columns are iterated directly instead of DataFrame.iterrows(),
# which builds a Series per row and can coerce dtypes along the way.
existing_data = [
    {'Image_path': full_image_path, 'Label_Sentiment': raw_label - 1}
    for full_image_path, raw_label in zip(
        (os.path.join(image_dir, name) for name in df['image_path']),
        df['label 2'],
    )
    if os.path.exists(full_image_path)
]

# Explicit columns keep the schema stable even when nothing was found.
processed_df = pd.DataFrame(existing_data, columns=['Image_path', 'Label_Sentiment'])

# Dropping rows silently hides path/typo problems, so report how many went.
skipped_rows = len(df) - len(processed_df)
if skipped_rows:
    print(f"⚠️ Skipped {skipped_rows} of {len(df)} rows: image file not found under {image_dir}")

# Without this check an empty frame only fails later, inside the stratified
# split, with an error that does not mention the real cause.
if processed_df.empty:
    raise FileNotFoundError(
        f"None of the images listed in {input_csv} exist under {image_dir}"
    )

# ================================================
# ✅ 4️⃣ DATA SPLITTING
# ================================================
# First cut: 70% for training, 30% held out, stratified on the label column.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)

# Second cut of the held-out part: test_size=1/3 goes to val_df (second
# return value) and the remaining 2/3 to test_df (first return value).
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# Add the 'label' column read by VisionDataset and persist every split.
splits_by_name = {'train': train_df, 'test': test_df, 'val': val_df}
for split_name in splits_by_name:
    split_frame = splits_by_name[split_name]
    split_frame['label'] = split_frame['Label_Sentiment']
    split_frame.to_csv(f'/kaggle/working/{split_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ DEVICE SETUP
# ================================================
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ================================================
# ✅ 6️⃣ IMAGE TRANSFORMS
# ================================================
# Settings shared by both pipelines: target size and per-channel statistics
# (the mean/std values torchvision documents for its pretrained models).
_input_size = (224, 224)
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]


def _tensor_steps():
    """Return the closing steps of a pipeline: to-tensor, then normalise."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_channel_mean, std=_channel_std),
    ]


# Training pipeline: resize, random augmentation, then tensor conversion.
train_transform = transforms.Compose(
    [
        transforms.Resize(_input_size),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]
    + _tensor_steps()
)

# Validation/test pipeline: same resize and normalisation, no randomness.
val_test_transform = transforms.Compose(
    [transforms.Resize(_input_size)] + _tensor_steps()
)

# ================================================
# ✅ 7️⃣ VISION DATASET
# ================================================
class VisionDataset(Dataset):
    """Image dataset backed by a DataFrame.

    Every row must provide an 'Image_path' column (file to open) and a
    'label' column (class index).

    Args:
        df: DataFrame with one row per sample; rows are read by position.
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows in the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]

        # Open inside a context manager so the file handle is released right
        # away instead of whenever the garbage collector runs; convert()
        # reads the pixel data and returns an image independent of the file.
        with Image.open(row['Image_path']) as source_image:
            image = source_image.convert('RGB')

        # Plain Python int, so batches always collate to an integer tensor.
        label = int(row['label'])

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# ================================================
# ✅ 8️⃣ DATALOADERS
# ================================================
batch_size = 32

# One dataset per split; only the training split uses the augmenting
# transform, validation and test share the deterministic one.
train_dataset = VisionDataset(train_df, transform=train_transform)
val_dataset, test_dataset = (
    VisionDataset(frame, transform=val_test_transform)
    for frame in (val_df, test_df)
)

# Options common to all loaders; only the training loader shuffles.
_loader_options = {'batch_size': batch_size, 'num_workers': 4}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

# ================================================
# ✅ 9️⃣ VGG19 MODEL
# ================================================
class VGG19Classifier(nn.Module):
    """Pretrained VGG19 backbone with a replacement fully-connected head.

    Args:
        num_classes: number of output logits of the final linear layer.
        dropout_rate: dropout probability used between the head's layers.
        freeze_until: number of leading modules of ``vgg19.features`` whose
            parameters are frozen (``requires_grad = False``). The default
            of 20 matches the previous hard-coded value.
    """

    def __init__(self, num_classes=3, dropout_rate=0.5, freeze_until=20):
        super().__init__()

        # Load pre-trained VGG19. Newer torchvision deprecates
        # ``pretrained=True`` in favour of the ``weights`` enum; fall back to
        # the old keyword when the enum is not available.
        weights_enum = getattr(models, 'VGG19_Weights', None)
        if weights_enum is not None:
            self.vgg19 = models.vgg19(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.vgg19 = models.vgg19(pretrained=True)

        # Freeze early layers (optional - you can experiment with this)
        for param in self.vgg19.features[:freeze_until].parameters():
            param.requires_grad = False

        # Read the flattened feature size from the stock classifier instead
        # of hard-coding it, so the head always matches the backbone.
        in_features = self.vgg19.classifier[0].in_features

        # Replace the classifier. Layer order is kept unchanged so existing
        # checkpoints (state_dict keys) stay loadable.
        self.vgg19.classifier = nn.Sequential(
            nn.Linear(in_features, 4096),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(4096, 2048),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(2048, 1024),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        """Return the class logits computed by the wrapped VGG19 for ``x``."""
        return self.vgg19(x)

# ================================================
# ✅ 🔟 MODEL INITIALIZATION
# ================================================
num_classes = 3
model = VGG19Classifier(num_classes=num_classes, dropout_rate=0.5).to(device)

# ================================================
# ✅ 1️⃣1️⃣ LOSS & OPTIMIZER
# ================================================
# Calculate class weights for imbalanced dataset.
# The per-class sample counts are aligned to the class indices
# 0..num_classes-1 so the weight vector always has one entry per model
# output, in the right order. Problems are reported here rather than as a
# shape error inside the loss.
label_counts = train_df['label'].value_counts()
unexpected_labels = sorted(set(label_counts.index) - set(range(num_classes)))
if unexpected_labels:
    raise ValueError(
        f"Training labels outside 0..{num_classes - 1}: {unexpected_labels}"
    )

class_weights = label_counts.reindex(range(num_classes), fill_value=0).tolist()
if 0 in class_weights:
    raise ValueError(
        f"Every class needs at least one training sample; counts per class: {class_weights}"
    )

# Inverse-frequency weighting: rarer classes get proportionally larger weights.
total = sum(class_weights)
weights = [total / c for c in class_weights]
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(weights, dtype=torch.float32, device=device)
)

# Optimizer with different learning rates for different parts
optimizer = AdamW([
    {'params': model.vgg19.features.parameters(), 'lr': 1e-5},  # Lower LR for pre-trained features
    {'params': model.vgg19.classifier.parameters(), 'lr': 1e-4}  # Higher LR for new classifier
], weight_decay=1e-4)

# Learning rate scheduler: multiply every group's LR by 0.1 each 7 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# ================================================
# ✅ 1️⃣2️⃣ TRAINING LOOP
# ================================================
def _run_epoch(model, loader, criterion, device, optimizer=None, desc=""):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated
    after every batch. Without one the model is switched to eval mode and
    gradient tracking is disabled.

    ``mean_loss`` is the sum of the per-batch losses divided by the number
    of batches. ``accuracy`` is computed over all samples seen in the pass.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    running_loss = 0
    seen_predictions = []
    seen_labels = []

    with torch.set_grad_enabled(is_training):
        for images, labels in tqdm(loader, desc=desc):
            images = images.to(device)
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()

            # Store predictions for metrics
            batch_predictions = torch.argmax(outputs, dim=1)
            seen_predictions.extend(batch_predictions.cpu().numpy())
            seen_labels.extend(labels.cpu().numpy())

    return running_loss / len(loader), accuracy_score(seen_labels, seen_predictions)


num_epochs = 20
patience = 3  # epochs without validation-loss improvement before stopping
patience_counter = 0
best_val_loss = float('inf')

print("Starting training...")
print(f"Train samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

for epoch in range(num_epochs):
    # Training pass (updates the weights), then a gradient-free validation pass.
    avg_train_loss, train_accuracy = _run_epoch(
        model, train_loader, criterion, device,
        optimizer=optimizer, desc=f"Train Epoch {epoch+1}",
    )
    avg_val_loss, val_accuracy = _run_epoch(
        model, val_loader, criterion, device,
        desc=f"Validation Epoch {epoch+1}",
    )

    # Update learning rate
    scheduler.step()

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
    # Reports the first parameter group only (the pre-trained feature layers).
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
    print("-" * 50)

    # Early stopping: checkpoint on every new best validation loss, stop
    # after `patience` consecutive epochs without improvement.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_vgg19_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break

# ================================================
# ✅ 1️⃣3️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location remaps the checkpoint's tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load("best_vgg19_model.pt", map_location=device))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

# Single gradient-free pass over the test split, collecting the summed batch
# loss and the per-sample predictions/labels for the metrics below.
with torch.no_grad():
    for images, labels in tqdm(test_loader, desc="Final Test Evaluation"):
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        total_test_loss += loss.item()
        predictions = torch.argmax(outputs, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate final metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
# Fixed label list keeps the matrix 3x3 (one row/column per model output)
# even if some class is absent from both the test labels and the predictions.
cm = confusion_matrix(test_labels, test_predictions, labels=[0, 1, 2])

print("\n📊 FINAL TEST RESULTS (Vision-Only VGG19):")
print("=" * 50)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")
print(f"\nConfusion Matrix:\n{cm}")

# ================================================
# ✅ 1️⃣4️⃣ DETAILED CLASSIFICATION REPORT
# ================================================
from sklearn.metrics import classification_report

print("\n📋 Detailed Classification Report:")
# Passing `labels` pins the report to the three class indices. Without it,
# a class missing from both the test labels and the predictions makes the
# three target_names mismatch the detected classes and the call raises.
print(classification_report(test_labels, test_predictions,
                            labels=[0, 1, 2],
                            target_names=['Class 0', 'Class 1', 'Class 2']))

# ================================================
# ✅ 1️⃣5️⃣ SAVE RESULTS
# ================================================
import json

# Gather the headline test metrics; the confusion matrix is converted to
# nested lists so it can be written as JSON.
results_dict = dict(
    test_accuracy=test_accuracy,
    test_precision=precision,
    test_recall=recall,
    test_f1=f1,
    test_loss=total_test_loss/len(test_loader),
    confusion_matrix=cm.tolist(),
)

# Serialise first, then write the text to the Kaggle working directory.
serialized_results = json.dumps(results_dict, indent=2)
with open('/kaggle/working/vgg19_results.json', 'w') as results_file:
    results_file.write(serialized_results)

print("\n✅ Results saved to '/kaggle/working/vgg19_results.json'")
print("✅ Best model saved as 'best_vgg19_model.pt'")

Using device: cuda


Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:02<00:00, 197MB/s]


Starting training...
Train samples: 3156
Validation samples: 451
Test samples: 902


Train Epoch 1: 100%|██████████| 99/99 [00:41<00:00,  2.37it/s]
Validation Epoch 1: 100%|██████████| 15/15 [00:05<00:00,  2.55it/s]


Epoch [1/20]
Train Loss: 0.9941 | Train Acc: 0.5098
Val Loss: 0.9543 | Val Acc: 0.5233
Learning Rate: 0.000010
--------------------------------------------------
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 99/99 [00:38<00:00,  2.58it/s]
Validation Epoch 2: 100%|██████████| 15/15 [00:05<00:00,  2.83it/s]


Epoch [2/20]
Train Loss: 0.8701 | Train Acc: 0.5852
Val Loss: 0.8704 | Val Acc: 0.6031
Learning Rate: 0.000010
--------------------------------------------------
✅ Validation loss improved — model saved.


Train Epoch 3: 100%|██████████| 99/99 [00:38<00:00,  2.55it/s]
Validation Epoch 3: 100%|██████████| 15/15 [00:05<00:00,  2.88it/s]


Epoch [3/20]
Train Loss: 0.7867 | Train Acc: 0.6397
Val Loss: 0.9945 | Val Acc: 0.6186
Learning Rate: 0.000010
--------------------------------------------------
⏰ No improvement — patience 1/3


Train Epoch 4: 100%|██████████| 99/99 [00:38<00:00,  2.60it/s]
Validation Epoch 4: 100%|██████████| 15/15 [00:05<00:00,  2.97it/s]


Epoch [4/20]
Train Loss: 0.6789 | Train Acc: 0.6876
Val Loss: 1.0390 | Val Acc: 0.6164
Learning Rate: 0.000010
--------------------------------------------------
⏰ No improvement — patience 2/3


Train Epoch 5: 100%|██████████| 99/99 [00:38<00:00,  2.59it/s]
Validation Epoch 5: 100%|██████████| 15/15 [00:05<00:00,  2.86it/s]


Epoch [5/20]
Train Loss: 0.5851 | Train Acc: 0.7376
Val Loss: 1.2908 | Val Acc: 0.6208
Learning Rate: 0.000010
--------------------------------------------------
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 5

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 29/29 [00:10<00:00,  2.69it/s]


📊 FINAL TEST RESULTS (Vision-Only VGG19):
Test Accuracy: 0.6330
Test Precision: 0.6878
Test Recall: 0.6330
Test F1-Score: 0.6355
Test Loss: 0.8745

Confusion Matrix:
[[212 147  43]
 [ 27 272  54]
 [  6  54  87]]

📋 Detailed Classification Report:
              precision    recall  f1-score   support

     Class 0       0.87      0.53      0.66       402
     Class 1       0.58      0.77      0.66       353
     Class 2       0.47      0.59      0.53       147

    accuracy                           0.63       902
   macro avg       0.64      0.63      0.61       902
weighted avg       0.69      0.63      0.64       902


✅ Results saved to '/kaggle/working/vgg19_results.json'
✅ Best model saved as 'best_vgg19_model.pt'



