<a href="https://colab.research.google.com/github/Rajat-Jamblekar/HackathonProject/blob/main/ImageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch
import torchvision
from torchvision import transforms, models, datasets
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import os

# Root folder of the unzipped training archive; the annotation CSV and the
# image folder are both resolved relative to it.
data_dir = '/content/sample_data/train/'
train_csv_path, images_dir = (
    os.path.join(data_dir, leaf) for leaf in ('train.csv', 'images')
)

# Load the annotation table (the split below requires a 'label' column).
df = pd.read_csv(train_csv_path)

# Hold out 20% of the rows for validation. Stratifying on 'label' keeps the
# class proportions comparable in both splits, and the fixed seed makes the
# split repeatable across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)



In [3]:
!unzip '/content/drive/MyDrive/images/train.zip' -d '/content/sample_data/'


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/sample_data/train/images/7506.jpg  
  inflating: /content/sample_data/train/images/697.jpg  
  inflating: /content/sample_data/train/images/7245.jpg  
  inflating: /content/sample_data/train/images/7418.jpg  
  inflating: /content/sample_data/train/images/7503.jpg  
  inflating: /content/sample_data/train/images/2864.jpg  
  inflating: /content/sample_data/train/images/917.jpg  
  inflating: /content/sample_data/train/images/3279.jpg  
  inflating: /content/sample_data/train/images/1886.jpg  
  inflating: /content/sample_data/train/images/6247.jpg  
  inflating: /content/sample_data/train/images/4318.jpg  
  inflating: /content/sample_data/train/images/189.jpg  
  inflating: /content/sample_data/train/images/2967.jpg  
  inflating: /content/sample_data/train/images/7509.jpg  
  inflating: /content/sample_data/train/images/7676.jpg  
  inflating: /content/sample_data/train/images/3964.jpg  
  inflatin

In [5]:
# Data transformation and augmentation
from torchvision.transforms import RandomRotation, RandomResizedCrop, ColorJitter

# Two preprocessing pipelines keyed by split name. Both end with the same
# ToTensor + Normalize steps so the model sees identically scaled inputs;
# only the 'train' pipeline adds random augmentation.
data_transforms = dict(
    train=transforms.Compose([
        # Random crop resized to 224x224, then a random horizontal flip.
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        # Random rotation within +/-15 degrees.
        transforms.RandomRotation(degrees=15),
        # Small random perturbations of brightness, contrast, saturation and hue.
        transforms.ColorJitter(
            brightness=0.1,
            contrast=0.1,
            saturation=0.1,
            hue=0.1,
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]),
    val=transforms.Compose([
        # Deterministic resize only: no augmentation at evaluation time.
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]),
)

from PIL import Image

class CustomDataset(Dataset):
    """Image/label dataset backed by a DataFrame of annotations.

    Rows are addressed positionally: column 1 is read as the image file name
    (joined onto ``image_dir``) and column 2 is read as the integer label.
    NOTE(review): the column order is assumed from the positional indexing
    below; confirm it against the CSV header.

    Args:
        dataframe: Annotation table, one row per sample.
        image_dir: Directory the file names in column 1 are relative to.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, dataframe, image_dir, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        If the image for ``idx`` cannot be loaded, a warning is printed and the
        next row (wrapping around) is tried instead, so the returned sample may
        belong to a different row than the one requested.

        Raises:
            IndexError: If the dataset is empty or ``idx`` is out of range.
            RuntimeError: If every row was tried and none could be loaded.
        """
        total = len(self.dataframe)
        if total == 0:
            raise IndexError("CustomDataset is empty")

        last_error = None
        # At most one full pass over the table: the previous unbounded
        # `while True` never terminated when no image was loadable.
        for _ in range(total):
            # Resolved outside the try block so an out-of-range idx surfaces as
            # IndexError instead of being swallowed by the handlers below.
            img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 1])
            try:
                # The context manager closes the file handle; convert() returns
                # an independent RGB copy that outlives the `with` block.
                with Image.open(img_name) as handle:
                    image = handle.convert('RGB')

                if self.transform:
                    image = self.transform(image)

                label = int(self.dataframe.iloc[idx, 2])
                return image, label

            except FileNotFoundError as e:
                last_error = e
                print(f"Warning: File {img_name} not found, skipping.")
            except Exception as e:
                last_error = e
                print(f"Error opening file {img_name}: {e}, skipping.")

            idx = (idx + 1) % total

        raise RuntimeError(
            f"No loadable image found in {self.image_dir} after trying all {total} rows."
        ) from last_error



# Wrap each split in a CustomDataset, pairing it with the matching transform
# pipeline (augmenting for 'train', deterministic for 'val').
train_dataset = CustomDataset(train_df, images_dir, transform=data_transforms['train'])
val_dataset = CustomDataset(val_df, images_dir, transform=data_transforms['val'])

# Batch the datasets. Only the training loader shuffles and uses worker
# processes; the validation loader keeps row order and loads in-process.
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [6]:
# Start from torchvision's ResNet-50 with its pre-trained weights.
model = models.resnet50(pretrained=True)

# Replace the final fully connected layer with a 2-way head
# (fraudulent vs. non-fraudulent), keeping the backbone's feature width.
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(in_features=num_ftrs, out_features=2)

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = model.to(device)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 101MB/s]


In [7]:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# 'balanced' class weights computed from the training labels, then moved to
# the model's device as a float32 tensor so the loss can consume them.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label'],
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)

# Cross-entropy with per-class weighting.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

# Adam over every model parameter (backbone and new head alike).
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Multiply the learning rate by 0.1 once the monitored value (passed to
# scheduler.step) has failed to decrease for longer than `patience` epochs.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
    verbose=True,
)


In [9]:
# Train for up to `num_epochs` epochs, validating after each one. The weights
# with the lowest validation loss are checkpointed to 'best_model.pth', and
# training stops early once the validation loss has failed to improve for
# `n_epochs_stop` consecutive epochs. The best checkpoint is reloaded at the end.
num_epochs = 4
best_val_loss = float('inf')
epochs_no_improve = 0
n_epochs_stop = 2  # Consecutive non-improving epochs tolerated before stopping
early_stop = False

for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ---- Validation phase ----
    model.eval()  # Switch layers such as batch norm to inference behaviour
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # No gradients are needed for evaluation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Losses are averaged per batch; accuracy is computed per sample.
    train_loss = train_loss / len(train_loader)
    val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total

    # Print epoch summary
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    # Early stopping bookkeeping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model seen so far
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= n_epochs_stop:
            print('Early stopping!')
            early_stop = True

    # The exit is handled here (rather than with a `break` inside the branch
    # above) so that the summary line below is actually reached.
    if early_stop:
        print("Stopped. Best validation loss: {:.4f}".format(best_val_loss))
        break

    # Let the plateau scheduler see this epoch's validation loss.
    scheduler.step(val_loss)

# Load the best model; map_location keeps the checkpoint loadable on whichever
# device is currently in use (e.g. a CPU-only runtime).
model.load_state_dict(torch.load('best_model.pth', map_location=device))

Epoch 1/4
Train Loss: 0.7663, Validation Loss: 75.2600, Validation Accuracy: 94.25%
Epoch 2/4
Train Loss: 0.6914, Validation Loss: 0.6431, Validation Accuracy: 72.03%
Epoch 3/4
Train Loss: 0.6474, Validation Loss: 0.6060, Validation Accuracy: 84.53%
Epoch 4/4
Train Loss: 0.6277, Validation Loss: 0.6309, Validation Accuracy: 86.39%


<All keys matched successfully>

In [32]:
# # filepath = '/content/sample_data/images/'
# import os

# if os.path.exists("/content/sample_data/images/6146.jpg"):
#     print("The file exists.")
# else:
#     print("The file does not exist.")

The file does not exist.


In [7]:
# !nvidia-smi

In [8]:
!unzip '/content/drive/MyDrive/test.zip' -d '/content/test_data/'

Archive:  /content/drive/MyDrive/test.zip
   creating: /content/test_data/test/
   creating: /content/test_data/test/images/
  inflating: /content/test_data/test/images/10669.jpg  
  inflating: /content/test_data/test/images/9743.jpg  
  inflating: /content/test_data/test/images/8443.jpg  
  inflating: /content/test_data/test/images/8141.jpg  
  inflating: /content/test_data/test/images/8389.jpg  
  inflating: /content/test_data/test/images/9711.jpg  
  inflating: /content/test_data/test/images/8946.jpg  
  inflating: /content/test_data/test/images/8286.jpg  
  inflating: /content/test_data/test/images/8809.jpg  
  inflating: /content/test_data/test/images/8295.jpg  
  inflating: /content/test_data/test/images/10893.jpg  
  inflating: /content/test_data/test/images/10182.jpg  
  inflating: /content/test_data/test/images/9994.jpg  
  inflating: /content/test_data/test/images/8643.jpg  
  inflating: /content/test_data/test/images/10012.jpg  
  inflating: /content/test_data/test/images/10

In [10]:
class TestDataset(Dataset):
    """Unlabelled image dataset driven by a CSV listing.

    The CSV is read once at construction time. Each item is the image whose
    file name sits in the second column of the corresponding row, opened from
    ``img_dir``, converted to RGB and passed through ``transform`` if one was
    given. No label is returned.
    NOTE(review): the second column is assumed to hold the file name; confirm
    against the CSV header.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.transform = transform
        self.img_dir = img_dir
        self.dataframe = pd.read_csv(csv_file)

    def __len__(self):
        # One sample per CSV row.
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        # Positional lookup: row `idx`, second column.
        row_filename = self.dataframe.iloc[idx, 1]
        full_path = os.path.join(self.img_dir, row_filename)
        rgb_image = Image.open(full_path).convert('RGB')
        if not self.transform:
            return rgb_image
        return self.transform(rgb_image)

# The test archive unpacks to /content/test_data/test/; images live in its
# 'images' sub-folder. test.csv is assumed to sit alongside that folder --
# adjust both paths if the archive layout differs.
test_img_dir = '/content/test_data/test/images'
test_csv_file = '/content/test_data/test/test.csv'

# Reuse the deterministic 'val' pipeline so test images are preprocessed
# exactly like validation images.
test_dataset = TestDataset(test_csv_file, test_img_dir, transform=data_transforms['val'])

# Keep CSV row order (no shuffling) so predictions line up with the rows.
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
)


In [11]:
# Inference over the test loader: evaluation mode, gradients disabled.
model.eval()
predictions = []

with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = model(images)
        # Index of the highest-scoring class for every image in the batch.
        predicted = outputs.argmax(dim=1)
        predictions += list(predicted.cpu().numpy())

# `predictions` now holds one predicted class index per test row, in loader order.


In [14]:
# # filepath = '/content/sample_data/images/'
# import os

# if os.path.exists("/content/test_data/test/images/8080.jpg"):
#     print("The file exists.")
# else:
#     print("The file does not exist.")

The file exists.


In [12]:
# Example: Saving predictions to a CSV file
import pandas as pd

# Build the submission table: the first CSV column is taken as the image id
# (assumed from its position -- confirm against test.csv), then the predicted
# labels are attached in the same row order.
submission_df = pd.DataFrame(
    {'image_id': test_dataset.dataframe.iloc[:, 0]}
).assign(label=predictions)

# Write without the DataFrame index so the file has only the two columns.
submission_df.to_csv(
    '/content/sample_data/test_predictions_new.csv',
    index=False,
)
