In [17]:
# Non-neural-network model: XGBoost


import os
import pandas as pd
import numpy as np

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision import transforms
# Load the training and test tables (train first, then test) from DATA_DIR.
DATA_DIR = "data"

train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

print("Train:", train_df.shape, "Test:", test_df.shape)

# Both tables must be complete: stop immediately if any cell is NaN.
assert not train_df.isna().any().any(), "Missing in train!"
assert not test_df.isna().any().any(), "Missing in test!"

# Pull the id columns and the labels out of the tables. Labels are shifted
# down by one so they start at zero (the stored labels are presumably 1-3;
# the submission step later adds the 1 back).
train_ids, test_ids = train_df["id"].values, test_df["id"].values
train_labels = train_df["y"].values.astype(np.int64) - 1

# Whatever remains after dropping id / y is treated as pixel columns.
train_df = train_df.drop(["id", "y"], axis=1)
test_df = test_df.drop(["id"], axis=1)

# No rescaling happens here: the values are only cast to float32.
# (The original note says intensities are already in [0, 1] -- not verified here.)
train_pixels = train_df.to_numpy().astype(np.float32)
test_pixels = test_df.to_numpy().astype(np.float32)

# Assumes each row stores its pixels channel-major (all red values, then all
# green, then all blue), each channel row-major over the image grid -- TODO
# confirm against the CSV column order.
def to_image_array(flat_array, channels=3, height=32, width=32):
    """Reshape flat pixel rows into an image batch of shape (N, C, H, W).

    Parameters
    ----------
    flat_array : numpy.ndarray
        Array of flattened images, normally of shape
        (N, channels * height * width).
    channels, height, width : int, optional
        Target image dimensions. The defaults (3, 32, 32) reproduce the
        original fixed reshape of 3072 values per row.

    Returns
    -------
    numpy.ndarray
        The result of ``flat_array.reshape(-1, channels, height, width)``.

    Raises
    ------
    ValueError
        If a 2-D input does not have exactly channels * height * width
        columns, or (raised by NumPy) if the total size is incompatible.
    """
    pixels_per_image = channels * height * width
    # reshape(-1, ...) only checks the TOTAL element count, so a 2-D table with
    # the wrong width (e.g. half the columns) would silently merge several rows
    # into one "image". Reject that case explicitly.
    if flat_array.ndim == 2 and flat_array.shape[1] != pixels_per_image:
        raise ValueError(
            f"expected {pixels_per_image} values per row "
            f"({channels}x{height}x{width}), got {flat_array.shape[1]}"
        )
    return flat_array.reshape(-1, channels, height, width)

# Convert both flat pixel tables into image-shaped arrays in one pass.
X_train, X_test = (to_image_array(flat) for flat in (train_pixels, test_pixels))

class FarmImageDataset(Dataset):
    """Dataset over an in-memory NumPy image array with optional labels.

    Parameters
    ----------
    images : numpy.ndarray
        Image array; converted with ``torch.from_numpy``, so the tensor keeps
        the array's dtype and shares its memory.
    labels : numpy.ndarray or None, optional
        Per-image labels. When omitted (e.g. for a test set), indexing yields
        only the image.
    """

    def __init__(self, images, labels=None):
        self.images = torch.from_numpy(images)
        # Keep None as-is so __getitem__ can tell labelled from unlabelled data.
        self.labels = torch.from_numpy(labels) if labels is not None else None

    def __len__(self):
        """Return the number of images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when labels exist, otherwise just ``image``."""
        image = self.images[idx]
        if self.labels is not None:
            return image, self.labels[idx]
        return image

# Hold out 20% of the training images for validation; the split is stratified
# on the labels and seeded so it is reproducible.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

# Wrap each split in a dataset; the test set carries no labels.
train_ds, val_ds, test_ds = (
    FarmImageDataset(X_tr, y_tr),
    FarmImageDataset(X_val, y_val),
    FarmImageDataset(X_test),
)

# Only the training loader shuffles; validation and test keep their row order.
batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_ds, batch_size=batch_size) for split_ds in (val_ds, test_ds)
)



FileNotFoundError: [Errno 2] No such file or directory: 'data\\train.csv'

In [None]:


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Train/validation split on the flat pixel rows (XGBoost takes 2-D features).
# NOTE: this rebinds X_tr / X_val / y_tr / y_val, which the earlier cell bound to
# the image-shaped split; the same test_size / stratify / random_state are used.
X_tr, X_val, y_tr, y_val = train_test_split(
    train_pixels, train_labels,
    test_size=0.2, stratify=train_labels, random_state=42
)

# Wrap the arrays in XGBoost's DMatrix container (the test set has no labels).
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval   = xgb.DMatrix(X_val, label=y_val)
dtest  = xgb.DMatrix(test_pixels)

# Booster parameters
params = {
    'objective':      'multi:softmax',   # predict() returns class ids directly
    'num_class':      3,                 # three classes (zero-based labels 0..2)
    'eval_metric':    'merror',          # multiclass error rate
    'learning_rate':  0.1,
    'max_depth':      6,
    'subsample':      0.8,
    'colsample_bytree': 0.8,
    'seed':           42
}

# Train with early stopping: training halts once the metric on the LAST entry of
# `evals` (the validation set) has not improved for 10 consecutive rounds.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=10,
    verbose_eval=10
)

# The returned booster still contains the rounds trained after the best
# validation score. Depending on the XGBoost version, predict() may use all of
# them, so restrict prediction explicitly to the best iteration's trees.
best_range = (0, bst.best_iteration + 1)

# Validation accuracy of the best-iteration model
val_preds = bst.predict(dval, iteration_range=best_range).astype(int)   # returns 0,1,2
val_acc = accuracy_score(y_val, val_preds)
print(f"XGBoost validation accuracy: {val_acc*100:.2f}%")

# Predict on the test set with the same trees and save the submission
test_preds = bst.predict(dtest, iteration_range=best_range).astype(int) + 1   # back to labels 1–3

submission_xgb = pd.DataFrame({
    'id':   test_ids,
    'y':    test_preds
})
submission_xgb.to_csv('xgb.csv', index=False)
print("Saved xgb.csv with", len(submission_xgb), "rows.")


[0]	train-merror:0.08750	eval-merror:0.29583
[10]	train-merror:0.00104	eval-merror:0.20833
[20]	train-merror:0.00104	eval-merror:0.18750
[30]	train-merror:0.00000	eval-merror:0.17083
[40]	train-merror:0.00000	eval-merror:0.17500
XGBoost validation accuracy: 82.92%
Saved xgb.csv with 1200 rows.
