In [4]:
import os
import joblib
from PIL import Image
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import r2_score

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Research Questions

- Strip outliers? Aux only, or Aux + labels
- Types of feature scaling: RobustScaler vs MinMaxScaler
- Augment data 
- Aux Data
- Image transformations (flip, rotate, blur, color shift)
- Freezing Layers vs not
- resnet weights
- Which pretrained model to use (Inception-Resnet-V2, Xception, MobileNetV2)
- 6 models vs 1 model
- Aux data: 6 features vs. all 163 features
- Multiple Heads
- Avgpool
- Eval mode model.eval()


In [6]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
# Each branch records the banner to print so the message is emitted in one place.
if torch.cuda.is_available():
    device = torch.device("cuda")
    backend_banner = "Using GPU (CUDA)"
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    backend_banner = "Using MPS (Metal Performance Shaders)"
else:
    device = torch.device("cpu")
    backend_banner = "Using CPU"
print(backend_banner)

# Sanity check: one of the branches above must have selected a device.
assert device is not None

Using MPS (Metal Performance Shaders)


In [7]:
# --- Run identity -----------------------------------------------------------
# VERSION / SUBVERSION are embedded in every artifact file name below so that
# outputs of different experiments do not overwrite each other.
VERSION = "17"
SUBVERSION = ""

# --- Output artifacts (all written under METADATA_PATH) ----------------------
METADATA_PATH = "./metadata"
AUXSCALER_PATH = f"{METADATA_PATH}/auxScaler{VERSION}.pkl"
LABELSCALER_PATH = f"{METADATA_PATH}/labelScaler{VERSION}.pkl"
MODEL_PATH = f"{METADATA_PATH}/model{VERSION}{SUBVERSION}.pth"
FIG_PATH = f"{METADATA_PATH}/fig{VERSION}{SUBVERSION}.png"
SUBMISSION_PATH = f"{METADATA_PATH}/submission{VERSION}{SUBVERSION}.csv"

# --- Input data ---------------------------------------------------------------
TRAIN_AUX_PATH = "./data-3/train.csv"
TRAIN_IMAGE_PATH = "./data-3/train_images"
TEST_AUX_PATH = "./data-3/test.csv"
TEST_IMAGE_PATH = "./data-3/test_images"

TRAIN_DATA_RATIO = 0.8

# --- Column layout of the CSV files (positional, end-exclusive) ---------------
# To list the column indices of a loaded frame:
# print(*zip(range(500),df.columns),sep="\n")
AUX_START = 1
# 1-6 for WORLDCLIM
# 7-67 for SOIL
# 68-127 for MODIS
# 128-163 for VOD
AUX_END = 164  # set to 7 to use only the WORLDCLIM columns
AUX_CATEGORIES = AUX_END - AUX_START

# Labels
# (164, 'X4_mean')
# (165, 'X11_mean')
# (166, 'X18_mean')
# (167, 'X26_mean')
# (168, 'X50_mean')
# (169, 'X3112_mean')

LABELS_START = 164
# Misspelled legacy name, kept as an alias because other cells still reference it.
LABLES_START = LABELS_START
LABELS_END = 170
LABELS_CATEGORIES = LABELS_END - LABELS_START

BATCH_SIZE = 10

In [8]:
# Separate min-max scalers (target range [0, 1]) for the auxiliary features and
# for the regression labels, so each can be fitted and inverted independently.
auxScaler = MinMaxScaler(feature_range=(0, 1))
labelScaler = MinMaxScaler(feature_range=(0, 1))

def addGaussianNoise(mean=0.0, std=0.1):
    """Build a transform that adds i.i.d. Gaussian noise to a tensor.

    Args:
        mean: Mean of the noise distribution.
        std: Standard deviation of the noise distribution.

    Returns:
        A callable taking a tensor and returning ``tensor + noise``, where
        ``noise`` has the same shape as the input and is freshly sampled on
        every call.
    """
    def add_noise(tensor):
        # Sample on the input's device: noise created on the default (CPU)
        # device cannot be added to a CUDA/MPS tensor.
        noise = torch.normal(mean, std, size=tensor.size(), device=tensor.device)
        return tensor + noise

    return add_noise

# Per-channel mean / std used to normalise image tensors (the values match the
# ImageNet statistics published for torchvision's pretrained models).
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

# Training-time image pipeline. The commented-out entries are augmentations
# that are currently disabled; only tensor conversion + normalisation run.
image_transform = transforms.Compose([
    # transforms.RandomHorizontalFlip(),
    # transforms.RandomVerticalFlip(),
    # transforms.RandomAffine(
    #     degrees=30,                   # Rotation
    #     translate=(0.1, 0.1),         # Translation
    #     scale=(0.8, 1.2),             # Scaling
    #     shear=(0, 20)                 # Shearing
    # ),
    # transforms.RandomResizedCrop(96),
    # transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
    # addGaussianNoise()
])

# Test-time image pipeline: tensor conversion + normalisation only.
test_image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

# Pipeline for the auxiliary feature vector: small Gaussian jitter (std 0.05).
input_transform = transforms.Compose([addGaussianNoise(0, 0.05)])

class PlantDataset(Dataset):
    """Dataset yielding ``(id, image, aux_features, label)`` samples.

    Args:
        X_ids: Sequence of sample ids (list, NumPy array, pandas object or tensor).
        X_aux: Per-sample auxiliary features; stored as a float32 tensor.
        Y: Per-sample labels; stored as a float32 tensor.
        input_transform: Callable applied to each sample's auxiliary feature
            tensor on access.
        imgMap: Mapping from ``str(id)`` to the image object for that sample.
        transform: Callable applied to each sample's image on access.
    """

    def __init__(self, X_ids, X_aux, Y, input_transform, imgMap, transform):
        self.X_ids = self._to_tensor(X_ids)
        self.X_aux = self._to_tensor(X_aux, dtype=torch.float32)
        self.Y = self._to_tensor(Y, dtype=torch.float32)
        self.imgIdMap = imgMap
        self.transform = transform
        self.input_transform = input_transform

    @staticmethod
    def _to_tensor(data, dtype=None):
        """Copy ``data`` into a new tensor, optionally casting to ``dtype``."""
        if isinstance(data, torch.Tensor):
            copied = data.detach().clone()
            return copied if dtype is None else copied.to(dtype)
        # torch.tensor cannot consume pandas DataFrames/Series directly, so go
        # through NumPy first; lists and arrays are unaffected by this step.
        return torch.tensor(np.asarray(data), dtype=dtype)

    def __len__(self):
        return len(self.X_ids)

    def __getitem__(self, idx):
        x_id = self.X_ids[idx].item()
        # The image map is keyed by the string form of the id.
        x_img = self.transform(self.imgIdMap[str(x_id)])

        x_aux = self.input_transform(self.X_aux[idx])

        y = self.Y[idx]

        return x_id, x_img, x_aux, y

def findOutliers(df, threshold=3):
    """Return a boolean row mask that is True for rows that are NOT outliers.

    A row is kept when, in every column, its absolute z-score (computed with
    the column's mean and sample standard deviation) is below ``threshold``.

    Args:
        df: Numeric DataFrame to inspect.
        threshold: Absolute z-score at or above which a value counts as an
            outlier. Defaults to 3.

    Returns:
        Boolean Series indexed like ``df``; True means "keep this row".
    """
    spread = df.std()
    # Columns with zero (or undefined) spread would produce NaN z-scores, and
    # NaN < threshold is False, which would reject every row. Such columns
    # cannot contain outliers, so they are left out of the check.
    varying = spread[spread > 0].index
    z_scores = ((df[varying] - df[varying].mean()) / spread[varying]).abs()
    return (z_scores < threshold).all(axis=1)

def preprocessData(filename, handleOutliers=True):
    """Load a CSV and split it into ids, auxiliary features and labels.

    Args:
        filename: Path of the CSV file to read. It must contain an ``id``
            column; features and labels are selected by column position using
            AUX_START/AUX_END and LABLES_START/LABELS_END.
        handleOutliers: When True, rows rejected by ``findOutliers`` (checked
            over the columns AUX_START..LABELS_END) are dropped first.

    Returns:
        Tuple ``(imageIds, auxData, labels)``: a 1-D NumPy array of ids and two
        DataFrames holding the auxiliary-feature and label columns.
    """
    df = pd.read_csv(filename)

    if handleOutliers:
        previousCount = df.shape[0]
        # TODO(review): decide whether the check should stop at AUX_END
        # (features only) instead of LABELS_END (features + labels).
        keepMask = findOutliers(df.iloc[:, AUX_START:LABELS_END])
        df = df[keepMask]
        currentCount = df.shape[0]
        print(f"Removed {previousCount - currentCount} Outliers! ({previousCount} to {currentCount})")

    # Always 1-D, even for a single-row file (``.values.squeeze()`` would
    # collapse that case to a 0-d array that cannot be iterated).
    imageIds = df["id"].to_numpy()
    auxData = df.iloc[:, AUX_START:AUX_END]
    labels = df.iloc[:, LABLES_START:LABELS_END]

    return imageIds, auxData, labels


In [9]:
imageIds, auxData, labels = preprocessData(TRAIN_AUX_PATH, True)

# Fit both scalers on the training data. They are persisted below and used
# later for transform()/inverse_transform(), which raise NotFittedError on an
# unfitted scaler.
auxData = auxScaler.fit_transform(auxData)
labels = labelScaler.fit_transform(labels)

# joblib.dump does not create missing directories.
os.makedirs(METADATA_PATH, exist_ok=True)
joblib.dump(auxScaler, AUXSCALER_PATH)
joblib.dump(labelScaler, LABELSCALER_PATH)



Removed 12522 Outliers! (43363 to 30841)


['./metadata/labelScaler17.pkl']

In [20]:
import random
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore the reported score, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    auxData, labels, test_size=0.2, random_state=42
)
model = xgb.XGBRegressor(
    objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1
)

# Train model
model.fit(X_train, y_train)

# R^2 on the held-out rows (displayed as the cell output).
pred = model.predict(X_test)
r2_score(y_test, pred)

0.19953566772749096

In [19]:
import csv

# The test file is loaded without outlier removal: every id needs a prediction.
timageIds, tauxData, tlabels = preprocessData(TEST_AUX_PATH, False)

# auxScaler / labelScaler must already be fitted on the training data.
tauxData = auxScaler.transform(tauxData)

# Run inference before opening the output file so that a failure here does not
# leave a truncated submission on disk.
predictions = model.predict(tauxData)
predictions = labelScaler.inverse_transform(predictions)

# zip() silently truncates to the shorter input, so check the lengths first.
assert len(timageIds) == len(predictions), "id / prediction count mismatch"

columns = ['id', 'X4', 'X11', 'X18', 'X26', 'X50', 'X3112']
data = [[i.item()] + list(p) for i, p in zip(timageIds, predictions)]

# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open(SUBMISSION_PATH, "w", newline="") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(columns)
    csvwriter.writerows(data)