In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision.models.efficientnet import EfficientNet_B0_Weights
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import glob
import torch.nn.functional as F

# --- Locations on Google Drive ------------------------------------------------
# These are plain absolute-path strings, so they can be declared before the
# drive is mounted.
BASE_DIR = '/content/gdrive/My Drive/Car_Corrosion_Dataset'
TRAIN_DIR = os.path.abspath('/content/gdrive/My Drive/Car_Corrosion_Dataset/train')
TEST_DIR = os.path.abspath('/content/gdrive/My Drive/Car_Corrosion_Dataset/test')
VAL_DIR = os.path.abspath('/content/gdrive/My Drive/Car_Corrosion_Dataset/val')
EXCEL_PATH = '/content/gdrive/My Drive/My_Research/Car_Corrosion/Data_Sets/paraFinal.xlsx'
SAVE_DIR = '/content/gdrive/My Drive/Training_Res/EfficientNet_Advanced'

# --- Mount the drive (Colab only) ---------------------------------------------
# force_remount=True re-attaches the drive even if a previous run mounted it.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# --- Output folder and metadata table -------------------------------------------
# Both steps need the mounted drive: the results folder is created if it is
# missing, then the per-image metadata spreadsheet is read into a DataFrame.
os.makedirs(SAVE_DIR, exist_ok=True)
df = pd.read_excel(EXCEL_PATH)

def update_excel_with_extensions(df, base_dir):
    """Attach the on-disk image path to each row and keep only rows that have one.

    Every value in ``df['filename']`` is treated as an image name without its
    extension. The directory tree under ``base_dir`` is searched recursively
    for ``<name>.jpg``, ``<name>.JPG``, ``<name>.jpeg`` or ``<name>.JPEG``
    (tried in that order) and the absolute path of the first match is stored
    in a new ``filename_with_ext`` column.

    Args:
        df: DataFrame with a ``filename`` column. The ``filename_with_ext``
            column is added to this frame in place.
        base_dir: Root directory that is searched recursively.

    Returns:
        A copy of the rows of ``df`` for which an image was found. The number
        of such rows is printed.
    """
    image_extensions = ('jpg', 'JPG', 'jpeg', 'JPEG')

    # Escape glob metacharacters ('*', '?', '[') so a directory such as
    # "data[v2]" is matched literally instead of as a character class.
    safe_base = glob.escape(base_dir)

    # Walk the tree once per extension and remember the first path seen for
    # each file stem. The previous implementation re-ran up to four recursive
    # globs for every spreadsheet row, which is very slow on a mounted drive.
    stem_to_path = {}
    for ext in image_extensions:
        suffix_len = len(ext) + 1  # also strip the dot
        pattern = os.path.join(safe_base, '**', f'*.{ext}')
        for path in glob.glob(pattern, recursive=True):
            stem = os.path.basename(path)[:-suffix_len]
            # setdefault keeps the earlier extension when a stem exists with several.
            stem_to_path.setdefault(stem, path)

    def find_file_with_extension(filename):
        name = str(filename)
        hit = stem_to_path.get(name)
        has_separator = os.sep in name or bool(os.altsep and os.altsep in name)
        if hit is None and has_separator:
            # Names that carry a sub-path (e.g. "train/C1") cannot be looked up
            # by stem; fall back to a targeted, escaped glob for those rows.
            escaped_name = glob.escape(name)
            for ext in image_extensions:
                matches = glob.glob(
                    os.path.join(safe_base, '**', f"{escaped_name}.{ext}"),
                    recursive=True,
                )
                if matches:
                    hit = matches[0]
                    break
        return os.path.abspath(hit) if hit is not None else None

    df['filename_with_ext'] = df['filename'].apply(find_file_with_extension)
    # .copy() makes the result an independent frame, so later column
    # assignments on it do not raise pandas' SettingWithCopyWarning.
    valid_df = df[df['filename_with_ext'].notna()].copy()
    print(f"Valid samples found: {len(valid_df)}")
    return valid_df


# Resolve each spreadsheet row to an image under BASE_DIR, keep only the rows
# that have one, and preview the first rows of the result.
df = update_excel_with_extensions(df=df, base_dir=BASE_DIR)
print(df.head())


# 4. Custom Dataset class
class CarCorrosionDataset(Dataset):
    """Dataset yielding ``(image, label, features)`` for each row of a DataFrame.

    Each row must provide ``filename_with_ext`` (path to the image) and the
    numeric columns listed in ``FEATURE_COLUMNS``.

    Args:
        df: DataFrame with one row per sample; rows are addressed by position.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    # Tabular columns returned alongside the image, in this order.
    FEATURE_COLUMNS = (
        'distance_to_beachside',
        'drive_hours_in_coastal_area_perday',
        'car_age',
    )

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            ``(image, label, features)`` where ``image`` is the (optionally
            transformed) RGB image, ``label`` is ``0`` when the image path
            contains ``'Coastal'`` and ``1`` otherwise, and ``features`` is a
            float32 tensor holding the ``FEATURE_COLUMNS`` values.

        Raises:
            FileNotFoundError: If the image path does not exist.
        """
        # Fetch the row once; every .iloc[idx] call builds a new Series.
        row = self.df.iloc[idx]
        img_path = row['filename_with_ext']

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")

        # The context manager closes the underlying file as soon as the pixel
        # data has been converted into an independent RGB image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        # NOTE(review): the class is inferred from a substring of the full path.
        # Any path containing "Coastal" (for example a folder named
        # "Non_Coastal") is labelled 0 - confirm the folder naming, or derive
        # the label from the spreadsheet's `target` column instead.
        label = 0 if 'Coastal' in img_path else 1

        features = torch.tensor(
            [row[column] for column in self.FEATURE_COLUMNS],
            dtype=torch.float32,
        )

        return image, label, features

# Augmentation pipeline applied to PIL images: random crop resized to 224,
# random horizontal/vertical flips, rotation with degrees=20 and mild colour
# jitter, followed by tensor conversion and per-channel normalisation.
_augmentation_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
]
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
data_transforms = transforms.Compose(_augmentation_steps + _tensor_steps)

# Create datasets
def create_dataset(dataframe, transforms):
    """Wrap ``dataframe`` in a ``CarCorrosionDataset`` and reject empty results.

    Args:
        dataframe: Rows describing the samples of one split.
        transforms: Transform pipeline handed to the dataset.

    Returns:
        The constructed ``CarCorrosionDataset``.

    Raises:
        ValueError: If the resulting dataset contains no samples.
    """
    built = CarCorrosionDataset(dataframe, transforms)
    if len(built) != 0:
        return built
    raise ValueError("No valid samples found for dataset")

# Validation and test images must go through a deterministic pipeline.
# Reusing the random training augmentation (random crops, flips, rotation,
# colour jitter) for them makes the validation loss - and therefore the
# "best model" checkpoint choice - and the test metrics change from run to run.
eval_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

try:
    # Splits are selected by the folder component of the resolved image path.
    # regex=False: the markers are literal substrings, not regular expressions.
    train_dataset = create_dataset(
        df[df['filename_with_ext'].str.contains('/train/', regex=False)], data_transforms)
    val_dataset = create_dataset(
        df[df['filename_with_ext'].str.contains('/val/', regex=False)], eval_transforms)
    test_dataset = create_dataset(
        df[df['filename_with_ext'].str.contains('/test/', regex=False)], eval_transforms)
except ValueError as e:
    # Report which step failed, then let the error stop the run.
    print(f"Error creating dataset: {str(e)}")
    raise

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Create data loaders (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 5. Build and train the EfficientNet model
class AdvancedEfficientNet(nn.Module):
    """EfficientNet-B0 image encoder fused with extra per-sample features.

    The torchvision EfficientNet-B0 is loaded with its default pretrained
    weights and its classifier is replaced by an identity, so the backbone
    outputs the feature vector that used to feed the classifier. That vector
    is concatenated with the extra features and passed through
    ``fc1 -> ReLU -> dropout -> fc2``.

    Args:
        num_classes: Number of output logits.
        num_extra_features: Width of the ``features`` tensor given to
            ``forward`` (defaults to the three tabular features used here).
        hidden_dim: Width of the hidden fully connected layer.
        dropout: Dropout probability applied after the hidden layer.
    """

    def __init__(self, num_classes=2, num_extra_features=3, hidden_dim=256, dropout=0.5):
        super().__init__()
        weights = EfficientNet_B0_Weights.DEFAULT
        self.efficientnet = models.efficientnet_b0(weights=weights)
        # Read the classifier's input width before discarding the classifier.
        num_ftrs = self.efficientnet.classifier[1].in_features
        self.efficientnet.classifier = nn.Identity()
        # Attribute names and registration order are kept unchanged so that
        # previously saved state dicts still load.
        self.fc1 = nn.Linear(num_ftrs + num_extra_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, features):
        """Return class logits for image batch ``x`` and matching ``features``.

        ``features`` is concatenated to the backbone output along dim 1, so it
        must have the same batch size and ``num_extra_features`` columns.
        """
        x = self.efficientnet(x)
        x = torch.cat((x, features), dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AdvancedEfficientNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
best_val_loss = float('inf')
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    # ---- optimisation pass over the training split ----
    model.train()
    epoch_train_total = 0.0
    for images, labels, features in train_loader:
        images = images.to(device)
        labels = labels.to(device)
        features = features.to(device)

        optimizer.zero_grad()
        outputs = model(images, features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_train_total += loss.item()

    # Mean of the per-batch losses for this epoch.
    train_loss = epoch_train_total / len(train_loader)
    train_losses.append(train_loss)

    # ---- evaluation pass over the validation split (no gradients) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels, features in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            features = features.to(device)

            outputs = model(images, features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Checkpoint whenever the validation loss reaches a new minimum.
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), os.path.join(SAVE_DIR, 'best_model.pth'))

# Save the weights as they are after the final epoch.
torch.save(model.state_dict(), os.path.join(SAVE_DIR, 'last_model.pth'))

# 6. Evaluate the model
# NOTE(review): this scores the weights left in `model` after the final epoch,
# not the 'best_model.pth' checkpoint - confirm that is the intended model.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for images, labels, features in test_loader:
        images = images.to(device)
        features = features.to(device)
        outputs = model(images, features)
        # Predicted class = index of the largest logit in each row.
        preds = torch.max(outputs, 1)[1]
        all_preds.extend(preds.cpu().numpy())
        # Labels are never moved to the device, so they convert directly.
        all_labels.extend(labels.numpy())

# Calculate metrics (precision, recall and F1 are support-weighted averages)
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Save metrics as a two-column table next to the model checkpoints
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metric_values = [accuracy, precision, recall, f1]
metrics_df = pd.DataFrame({'Metric': metric_names, 'Value': metric_values})
metrics_df.to_csv(os.path.join(SAVE_DIR, 'metrics.csv'), index=False)

# 7. Visualize results

# Confusion Matrix
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig(os.path.join(SAVE_DIR, 'confusion_matrix.png'))
plt.close()

# 8. Save training history
# The history table is built and written BEFORE any plot reads it. Previously
# the plots referenced `history_df` ahead of its definition, so the cell either
# raised NameError or silently plotted a stale frame left in the kernel, and
# the CSV below was never written.
history_df = pd.DataFrame({
    'Epoch': range(1, len(train_losses) + 1),
    'Train Loss': train_losses,
    'Validation Loss': val_losses
})
history_df.to_csv(os.path.join(SAVE_DIR, 'training_history.csv'), index=False)


def _plot_history_metric(metric, ylabel, file_name):
    """Plot the train/validation curves of one metric recorded in ``history_df``.

    Looks for the columns ``'Train <metric>'`` and ``'Validation <metric>'``.
    If either is missing the plot is skipped with a message instead of
    failing. Otherwise the figure is saved as ``file_name`` inside
    ``SAVE_DIR`` and shown.
    """
    train_col = f'Train {metric}'
    val_col = f'Validation {metric}'
    if train_col not in history_df.columns or val_col not in history_df.columns:
        print(f"Skipping {metric} plot: '{train_col}' / '{val_col}' were not recorded during training.")
        return

    plt.figure(figsize=(12, 8))
    plt.plot(history_df['Epoch'], history_df[train_col], label=train_col)
    plt.plot(history_df['Epoch'], history_df[val_col], label=val_col)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.title(f'{ylabel} over Epochs')
    plt.legend()
    plt.savefig(os.path.join(SAVE_DIR, file_name))
    plt.show()


# Training and Validation Loss (drawn once; the three earlier copies all wrote
# to the same 'loss_plot.png').
_plot_history_metric('Loss', 'Loss', 'loss_plot.png')

# Per-epoch precision / recall / F1 curves. The training loop above records
# only losses, so these are skipped unless the columns are added to history_df.
# NOTE(review): the original passed each value through `adjust_metrics_to_range`,
# which is not defined in this cell; the recorded values are plotted unmodified.
_plot_history_metric('Precision', 'Precision', 'precision_plot.png')
_plot_history_metric('Recall', 'Recall', 'recall_plot.png')
_plot_history_metric('F1', 'F1 Score', 'f1_plot.png')

print("Training complete. Model and results saved.")

Mounted at /content/gdrive
Valid samples found: 1444
  filename   target  distance_to_beachside  \
0       C1  Coastal                    1.5   
1       C2  Coastal                    1.5   
2       C3  Coastal                    5.0   
3       C4  Coastal                    5.0   
4       C5  Coastal                    1.5   

   drive_hours_in_coastal_area_perday  car_age  \
0                                   6        4   
1                                   6        4   
2                                   4        5   
3                                   4        5   
4                                   3        7   

                                   filename_with_ext  
0  /content/gdrive/My Drive/Car_Corrosion_Dataset...  
1  /content/gdrive/My Drive/Car_Corrosion_Dataset...  
2  /content/gdrive/My Drive/Car_Corrosion_Dataset...  
3  /content/gdrive/My Drive/Car_Corrosion_Dataset...  
4  /content/gdrive/My Drive/Car_Corrosion_Dataset...  
Train dataset size: 1155
Validation dat