Import Necessary Libraries

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
import pandas as pd
from PIL import Image
import os
from transformers import AutoImageProcessor, AutoModel
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

# Run on the first CUDA GPU when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

Preprocess Ancillary Data

In [None]:
# Load the ancillary (tabular) data for both splits.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Column layout used by the slicing below: the first column is the sample id,
# the last six training columns are the targets, and everything in between is
# an ancillary feature. The test file has the id column followed by features.
x_train_id = train_df.iloc[:, 0]
x_train_features = train_df.iloc[:, 1:-6]

x_test_id = test_df.iloc[:, 0]
x_test_features = test_df.iloc[:, 1:]

# Min-max statistics are fitted on the TRAINING features only.
min_train = x_train_features.min()
max_train = x_train_features.max()

# A constant column has a zero range; dividing by it would produce NaN/inf,
# so such columns are divided by 1 instead (they normalise to 0 on train).
feature_range = (max_train - min_train).replace(0, 1)

x_train_norm = (x_train_features - min_train) / feature_range

# The test features must be scaled with the same training statistics, so that
# a given raw value maps to the same normalised value in both splits. Scaling
# the test set with its own min/max would put it on a different scale from the
# data the model is trained on.
# The statistics are applied positionally because the rest of the pipeline
# treats the feature columns positionally (as NumPy arrays); a mismatch in the
# number of feature columns raises here instead of silently producing NaN.
x_test_norm = (x_test_features - min_train.to_numpy()) / feature_range.to_numpy()

# Re-attach the ids so rows can later be matched to their image features.
x_train = pd.concat([x_train_id, x_train_norm], axis=1)
x_test = pd.concat([x_test_id, x_test_norm], axis=1)

# Targets (last six columns) and their names, reused for the submission header.
y_train = train_df.iloc[:, -6:]
y_label = y_train.columns

Model

In [None]:
# Hugging Face checkpoint of the pretrained DINOv2 vision transformer.
model_name = "facebook/dinov2-giant"

# Backbone, moved onto the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Image processor loaded from the same checkpoint; it turns PIL images into
# model inputs.
feature_extractor = AutoImageProcessor.from_pretrained(model_name)

Feature Extraction Helpers

In [None]:
# Function to extract features from an image
def extract_dino_features(image_path):
    """Return the CLS-token embedding of one image.

    Relies on the module-level ``feature_extractor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0]`` for the image. The
        leading axis is the batch axis (a single image is passed in).
    """
    # Open the file in a context manager so its handle is released as soon as
    # the pixels have been decoded; `convert` returns an independent image that
    # stays valid after the file is closed.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Obtain the CLS token (first position of the last hidden state).
    features = outputs.last_hidden_state[:, 0].cpu().numpy()
    return features

Extract Features From Train Images Using ViT

In [None]:
# Directory that holds the training images (adjust to your local layout).
train_image_folder = 'data/train_images'

# Map every image ID (the file name without its extension) to the features
# extracted from that image.
train_image_features_dict = {
    os.path.splitext(file_name)[0]: extract_dino_features(
        os.path.join(train_image_folder, file_name)
    )
    for file_name in os.listdir(train_image_folder)
}

Extract Features From Test Images Using ViT

In [None]:
# Directory that holds the test images (adjust to your local layout).
test_image_folder = 'data/test_images'

# Map every image ID (the file name without its extension) to the features
# extracted from that image.
test_image_features_dict = {
    os.path.splitext(file_name)[0]: extract_dino_features(
        os.path.join(test_image_folder, file_name)
    )
    for file_name in os.listdir(test_image_folder)
}

Combine Extracted Features with Ancillary Data for Training Data

In [None]:
# Build one combined feature vector (image embedding followed by the ancillary
# features) for every training row that has extracted image features.
train_combined_features_list = []
train_kept_index = []  # index labels of the rows that were actually kept

for row_index, row in x_train.iterrows():
    # The feature dict is keyed by file-name stems (strings), so the numeric id
    # is converted to the matching string form.
    image_id = str(int(row['id']))
    if image_id in train_image_features_dict:
        # Drop the batch axis so the embedding is a flat vector.
        image_features = train_image_features_dict[image_id].squeeze()
        ancillary_features = row.drop(labels=['id']).values
        combined_features = np.concatenate((image_features, ancillary_features), axis=0)
        train_combined_features_list.append(combined_features)
        train_kept_index.append(row_index)

train_combined_features_array = np.array(train_combined_features_list)

# Rows without image features were skipped above. Restrict the targets to the
# same rows so that features and targets stay aligned row for row; when no row
# was skipped this leaves the targets unchanged.
y_train = train_df.iloc[:, -6:].loc[train_kept_index]

Combine Extracted Features with Ancillary Data for Testing Data

In [None]:
# Assemble the test feature matrix: for each ancillary row that has extracted
# image features, join the image embedding and the ancillary values into one
# vector and remember the row's ID.
test_combined_features_list = []
test_ids = []  # IDs of the rows in the matrix, in matrix order

for _, row in x_test.iterrows():
    image_id = str(int(row['id']))  # the feature dict is keyed by string IDs
    if image_id not in test_image_features_dict:
        continue  # no extracted features for this row; it is left out

    image_features = test_image_features_dict[image_id].squeeze()
    ancillary_features = row.drop(labels=['id']).values
    combined_features = np.concatenate(
        (image_features, ancillary_features),
        axis=0,
    )

    test_combined_features_list.append(combined_features)
    test_ids.append(image_id)

test_combined_features_array = np.array(test_combined_features_list)

Preprocess the Combined Data For Model Training

In [None]:
# Targets are modelled in log10 space.
y_train_log = np.log10(y_train)

# Outlier removal (dropping rows beyond three standard deviations of the log
# targets) is currently not applied.

# Min-max normalise the log targets. The statistics are kept in
# `min_train` / `max_train` because the prediction step needs them to map
# model outputs back to the original scale.
min_train = y_train_log.min()
max_train = y_train_log.max()
y_train_norm = (y_train_log - min_train) / (max_train - min_train)

# Hold out 20% of the training rows for validation (fixed seed).
x_train_split, x_val, y_train_split, y_val = train_test_split(
    train_combined_features_array,
    y_train_norm,
    test_size=0.2,
    random_state=42,
)

# Final arrays consumed by the training and prediction cells.
# Note: from here on `x_train`, `y_train` and `x_test` refer to NumPy arrays
# rather than the DataFrames built earlier in the notebook.
x_train, y_train = x_train_split, y_train_split.values
y_val = y_val.values
x_test = test_combined_features_array


Train the Model

In [None]:
from xgboost import XGBRegressor

# NOTE: this rebinds `model`, which until now referred to the DINOv2 backbone
# used by `extract_dino_features`. Re-running feature extraction after this
# cell requires reloading the backbone first.
model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    # `learning_rate` is the scikit-learn wrapper's name for XGBoost's `eta`.
    learning_rate=0.05,
    max_depth=8,
    n_estimators=1000,
    tree_method='hist',
    # Train on the GPU only when one is available, mirroring the CPU fallback
    # used for the feature extractor, instead of hard-coding "cuda".
    device="cuda" if torch.cuda.is_available() else "cpu",
    # Stop once the validation RMSE has not improved for 10 rounds.
    early_stopping_rounds=10,
    reg_alpha=0.2,
    reg_lambda=1.2,
)

# Fit the model on the training data; the validation split drives early stopping.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

Produce Predictions for Test Data

In [None]:
# Predict the normalised log-targets for the test features.
predictions = model.predict(x_test)

# Undo the min-max scaling. The target statistics are reshaped to a single row
# of six values so they broadcast across the prediction columns.
max_train = np.array(max_train).reshape(1, 6)
min_train = np.array(min_train).reshape(1, 6)
target_span = max_train - min_train
predictions_original_scale = predictions * target_span + min_train

# Undo the log10 transform.
predictions_original_scale = np.power(10, predictions_original_scale)

# Tabulate the predictions under the target names, with the IDs as first column.
predictions_df = pd.DataFrame(predictions_original_scale, columns=y_label)
predictions_df.insert(loc=0, column='id', value=test_ids)

# Write the submission file.
predictions_df.to_csv('cs480-kaggle.csv', index=False)

print("Predictions have been saved to 'cs480-kaggle.csv'.")