# Import

In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import skimage
from sklearn.metrics import r2_score

from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.io import read_image

from dataclasses import dataclass

from torchvision.models import resnet50, ResNet50_Weights

# Data

In [49]:
'''for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))'''

"for dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))"

In [50]:
'''from PIL import Image

image1 = Image.open("/kaggle/input/cs-480-2024-spring/data/train_images/190966811.jpeg")
image2 = Image.open("/kaggle/input/cs-480-2024-spring/data/test_images/179127153.jpeg")

print(image1.mode)
print(image1.size)
print(image2.mode)
print(image2.size)'''

'from PIL import Image\n\nimage1 = Image.open("/kaggle/input/cs-480-2024-spring/data/train_images/190966811.jpeg")\nimage2 = Image.open("/kaggle/input/cs-480-2024-spring/data/test_images/179127153.jpeg")\n\nprint(image1.mode)\nprint(image1.size)\nprint(image2.mode)\nprint(image2.size)'

In [4]:
train_data = pd.read_csv("/kaggle/input/cs-480-2024-spring/data/train.csv")
train_data = train_data.set_index('id')
validation_data = train_data.sample(frac = 0.1, random_state = 48)
train_data =  train_data.drop(validation_data.index)

train_traits = train_data.iloc[:,-6:]
train_data = train_data.iloc[:,:-6]

validation_traits = validation_data.iloc[:,-6:]
validation_data = validation_data.iloc[:,:-6]

In [52]:
'''print(len(train_data), len(validation_data))
print(train_traits.head())
print(validation_traits.head())'''

# print(train_data.index)

'print(len(train_data), len(validation_data))\nprint(train_traits.head())\nprint(validation_traits.head())'

## Data transformation

In [5]:
train_traits = np.log10(train_traits)
traits_max = train_traits.max()
traits_min = train_traits.min()
train_traits_normalized = (train_traits - traits_min) / (traits_max - traits_min)

validation_traits = np.log10(validation_traits)
validation_traits_normalized = (validation_traits - traits_min) / (traits_max - traits_min)

In [54]:
'''print(train_traits_normalized.head())
print(validation_traits_normalized.head())'''

'print(train_traits_normalized.head())\nprint(validation_traits_normalized.head())'

In [6]:
def inverse_tranformation(trait):
    return np.power(10, (trait * (traits_max - traits_min)) + traits_min)

In [56]:
'''print(train_data.head())
print(train_traits.head())

print(test_data.head())
print(test_traits.head())'''

'''print(train_traits.loc[101801795])
print(type(train_traits.loc[101801795]))'''

'print(train_traits.loc[101801795])\nprint(type(train_traits.loc[101801795]))'

In [57]:
'''print(torch.tensor(train_traits.loc[101801795].values))'''

'print(torch.tensor(train_traits.loc[101801795].values))'

In [7]:
class CustomImageSet(Dataset):
    def __init__(self, root_dir, indices, target_mapping, transform = None):
        self.root_dir = root_dir
        self.indices = indices
        self.transform = transform
        self.image_paths = []
        self.targets = []
        
        for idx in indices:
            self.image_paths.append(os.path.join(root_dir, str(idx) + ".jpeg"))
            target = self.get_label_from_filename(idx, target_mapping)
            self.targets.append(target)
                
    def get_label_from_filename(self, idx, target_mapping):
        target = target_mapping.loc[idx].values
        return torch.tensor(target).float()
        
    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        target = self.targets[idx]
 
        if self.transform:
            image = self.transform(image)

        return image, target

In [68]:
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 1.1)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

validation_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

trainset = CustomImageSet('/kaggle/input/cs-480-2024-spring/data/train_images/', train_data.index.tolist(), 
                          train_traits_normalized, transform=train_transform)
validationset = CustomImageSet('/kaggle/input/cs-480-2024-spring/data/train_images/', validation_data.index.tolist(),
                               validation_traits_normalized, transform=validation_transform)

batch_size = 256

trainloader = DataLoader(trainset, batch_size = batch_size, shuffle=True)
validationloader = DataLoader(validationset, batch_size = batch_size, shuffle=False)

In [None]:
'''for i, sample in enumerate(trainloader):
    image, label = sample[0], sample[1]
    print(image.max())
    print(image)
    print(type(image))
    print(type(label))
    print(image.dtype)
    print(image.shape)
    print(label)
    break'''

# print(len(trainloader))

# CNN

In [9]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


## Model

In [10]:
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
for param in model.parameters():
    param.requires_grad = False
    
fc_inputs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(fc_inputs, 512),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(512, 6)
)

optimizer = optim.RMSprop(model.parameters(), lr=0.0001, weight_decay = 0.0001)
criterion = nn.MSELoss()

model.to(device)
# print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

## Training

In [76]:
def train(model, device, train_loader, criterion, optimizer, epoch):
    loss_fn = criterion
    model.train()
    train_loss_total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        train_loss_total += loss.item()
        optimizer.step()
        
        if (batch_idx + 1) % 25 == 0:
            print('[{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                (batch_idx + 1) * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    print(train_loss_total)
    print(len(train_loader.dataset))
    train_loss = train_loss_total / (len(train_loader))

    print('Training set: Average loss: {}'.format(train_loss))
    return train_loss

def validate(model, device, validate_loader, criterion, epoch):
    loss_fn = criterion
    model.eval()
    validate_loss_total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(validate_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_fn(output, target)
            validate_loss_total += loss.item()
            
        validate_loss = validate_loss_total / len(validate_loader)
        print('Validation set: Average loss: ', validate_loss)
        return validate_loss

In [77]:
num_epochs = 5
training_loss = [0 for _ in range(num_epochs)]
for epoch in range(num_epochs):
    print("Epoch: ", epoch + 1)
    print("---------------------------------------------")

    train(model, device, trainloader, criterion, optimizer, epoch + 1)
    
    validate(model, device, validationloader, criterion, epoch + 1)

    print("---------------------------------------------")

print('Finished Training')

Epoch:  1
---------------------------------------------
3.9918764382600784
34690
Training set: Average loss: 0.029352032634265283
Validation set: Average loss:  0.02130757809123572
---------------------------------------------
Epoch:  2
---------------------------------------------
3.197182321920991
34690
Training set: Average loss: 0.0235086935435367
Validation set: Average loss:  0.019821821109336966
---------------------------------------------
Epoch:  3
---------------------------------------------
3.075516374781728
34690
Training set: Average loss: 0.022614090991042116
Validation set: Average loss:  0.02124706913224038
---------------------------------------------
Epoch:  4
---------------------------------------------
2.9878260623663664
34690
Training set: Average loss: 0.021969309282105637
Validation set: Average loss:  0.01937042812214178
---------------------------------------------
Epoch:  5
---------------------------------------------
2.928253222256899
34690
Training set: A

In [78]:
torch.save(model.state_dict(), 'model_weights.pth')

# Evaluate

In [80]:
def r2score(model, device, dataloader):
    predictions = []
    targets = []
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            predictions.append(output.cpu().numpy())
            targets.append(target.cpu().numpy())
        
    
    predictions_np = np.vstack(predictions)
    targets_np = np.vstack(targets)

    # Calculate R^2 score for each column and average them
    r2_scores = r2_score(targets_np, predictions_np, multioutput='raw_values')
    average_r2_score = np.mean(r2_scores)

    return average_r2_score
            

In [81]:
r2score(model, device, validationloader)

0.04847860449086572

## Predict

In [13]:
model.load_state_dict(torch.load('/kaggle/input/weights/model_weights (1).pth'))

<All keys matched successfully>

In [24]:
class TestImageSet(Dataset):
    def __init__(self, root_dir, indices, target_mapping, transform = None):
        self.root_dir = root_dir
        self.indices = indices
        self.transform = transform
        self.image_paths = []
        self.targets = []
        
        for idx in indices:
            self.image_paths.append(os.path.join(root_dir, str(idx) + ".jpeg"))
            target = self.get_label_from_filename(idx, target_mapping)
            self.targets.append(target)
                
    def get_label_from_filename(self, idx, target_mapping):
        return int(target_mapping.loc[idx].values[0])
        
    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        target = self.targets[idx]
 
        if self.transform:
            image = self.transform(image)

        return image, target

In [25]:
test_data = pd.read_csv("/kaggle/input/cs-480-2024-spring/data/test.csv")
test_data = test_data.set_index('id')

test_data_index = pd.DataFrame(index = test_data.index)
test_data_index['id'] = test_data.index
test_data_index['id'].astype(int)
print(test_data_index)

                  id
id                  
154220505  154220505
195736552  195736552
182701773  182701773
27688500    27688500
195825045  195825045
...              ...
195615880  195615880
126224052  126224052
178518157  178518157
158746703  158746703
104965612  104965612

[6391 rows x 1 columns]


In [27]:
print(test_data_index['id'].nunique())
print(test_data.index.nunique())
print(type(test_data.index))

6391
6391
<class 'pandas.core.indexes.base.Index'>


In [28]:
test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


testset = TestImageSet('/kaggle/input/cs-480-2024-spring/data/test_images/', test_data.index.tolist(),
                               target_mapping = test_data_index, transform=test_transform)

batch_size = 1

testloader = DataLoader(testset, batch_size = batch_size, shuffle=False)

In [33]:
columns = ['id', 'X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
def predict(model, device, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            output = inverse_tranformation(output.cpu().numpy()[0])
            target = int(target.cpu().numpy()[0])
            
            new_row = pd.Series([target] + output.tolist(), index=columns)
            predictions.append(new_row)
            
    return predictions

In [34]:
predictions = predict(model, device, testloader)
df = pd.DataFrame(predictions)

In [41]:
print(df['id'].nunique())
print(df.shape)
print(df.head())
print(df['id'].duplicated())

6391
(6391, 7)
            id        X4         X11           X18          X26        X50  \
0  154220505.0  1.077854  144.453734  19706.788925  3547.374074  15.165351   
1  195736552.0  0.985520  147.918476  19700.648935  3477.637032  15.201605   
2  182701773.0  1.059450  147.505274  19700.777388  3460.157140  15.239775   
3   27688500.0  1.083412  148.103102  19702.635684  3489.402818  15.371697   
4  195825045.0  0.862874  147.449203  19699.548953  3477.990592  14.970263   

           X3112  
0  401238.939991  
1  398688.678343  
2  398401.730014  
3  399957.204733  
4  398764.565367  
0       False
1       False
2       False
3       False
4       False
        ...  
6386    False
6387    False
6388    False
6389    False
6390    False
Name: id, Length: 6391, dtype: bool


In [40]:
df.to_csv('submission.csv', index=False)