In [23]:

%pip install pandas numpy scikit-learn matplotlib seaborn tensorflow keras xgboost


ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow


Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Note: you may need to restart the kernel to use updated packages.


In [47]:
# In your .ipynb, start with necessary imports
import os
import pandas as pd
import numpy as np

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from torchvision import transforms

In [29]:
# 1.a. Point to the data folder
DATA_DIR = "data"

# 1.b. Read in training and test tables
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Test:", test_df.shape)


Train: (1200, 3074) Test: (1200, 3073)


In [30]:
# 2.a. Check for missing values
assert train_df.isna().sum().sum() == 0, "Missing in train!"
assert test_df .isna().sum().sum() == 0, "Missing in test!"

# 2.b. Separate IDs & labels
train_ids = train_df["id"].values
train_labels = train_df["y"].values.astype(np.int64) - 1  # zero-based for PyTorch
train_df = train_df.drop(columns=["id", "y"])
test_ids = test_df["id"].values
test_df = test_df.drop(columns=["id"])

# 2.c. Normalize pixel intensities (already in [0,1], but cast to float32)
train_pixels = train_df.values.astype(np.float32)
test_pixels  = test_df.values .astype(np.float32)


In [31]:
# 3.a. Each row is [r0101 … r3132, g0101 …, b0101 …], so reshape to (N, 3, 32, 32)
def to_image_array(flat_array):
    # shape (N, 3072) → (N, 3, 32, 32)
    return flat_array.reshape(-1, 3, 32, 32)

X_train = to_image_array(train_pixels)
X_test  = to_image_array(test_pixels)


In [32]:
class FarmImageDataset(Dataset):
    def __init__(self, images, labels=None):
        self.images = torch.from_numpy(images)      # float32 tensor
        self.labels = None if labels is None else torch.from_numpy(labels)
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        x = self.images[idx]
        if self.labels is None:
            return x
        y = self.labels[idx]
        return x, y

# Split train into train/validation
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, train_labels, test_size=0.2, stratify=train_labels, random_state=42
)

train_ds = FarmImageDataset(X_tr, y_tr)
val_ds   = FarmImageDataset(X_val, y_val)
test_ds  = FarmImageDataset(X_test)

batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size)
test_loader  = DataLoader(test_ds,  batch_size=batch_size)


In [33]:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),  # → (32,32,32)
            nn.ReLU(),
            nn.MaxPool2d(2),                              # → (32,16,16)
            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # → (64,16,16)
            nn.ReLU(),
            nn.MaxPool2d(2),                              # → (64,8,8)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),                                 # → (64*8*8 = 4096)
            nn.Linear(64*8*8, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.classifier(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleCNN().to(device)


In [38]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(loader):
    model.train()
    total_loss, total_correct = 0, 0
    for Xb, yb in loader:
        Xb, yb = Xb.to(device), yb.to(device)
        preds = model(Xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        total_loss += loss.item() * Xb.size(0)
        total_correct += (preds.argmax(1) == yb).sum().item()
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

def eval_epoch(loader):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for Xb, yb in loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = model(Xb)
            total_loss += criterion(preds, yb).item() * Xb.size(0)
            total_correct += (preds.argmax(1) == yb).sum().item()
    return total_loss/len(loader.dataset), total_correct/len(loader.dataset)

best_val_acc = 0
for epoch in range(1, 26):
    train_loss, train_acc = train_epoch(train_loader)
    val_loss, val_acc     = eval_epoch(val_loader)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_cnn.pth")
    print(f"Epoch {epoch:02d} – train_acc: {train_acc:.3f}, val_acc: {val_acc:.3f}")


Epoch 01 – train_acc: 0.968, val_acc: 0.954
Epoch 02 – train_acc: 0.985, val_acc: 0.929
Epoch 03 – train_acc: 0.998, val_acc: 0.967
Epoch 04 – train_acc: 1.000, val_acc: 0.954
Epoch 05 – train_acc: 0.998, val_acc: 0.958
Epoch 06 – train_acc: 1.000, val_acc: 0.958
Epoch 07 – train_acc: 1.000, val_acc: 0.958
Epoch 08 – train_acc: 1.000, val_acc: 0.958
Epoch 09 – train_acc: 1.000, val_acc: 0.958
Epoch 10 – train_acc: 1.000, val_acc: 0.963
Epoch 11 – train_acc: 0.999, val_acc: 0.958
Epoch 12 – train_acc: 1.000, val_acc: 0.954
Epoch 13 – train_acc: 1.000, val_acc: 0.958
Epoch 14 – train_acc: 1.000, val_acc: 0.963
Epoch 15 – train_acc: 1.000, val_acc: 0.954
Epoch 16 – train_acc: 1.000, val_acc: 0.963
Epoch 17 – train_acc: 1.000, val_acc: 0.963
Epoch 18 – train_acc: 0.999, val_acc: 0.963
Epoch 19 – train_acc: 1.000, val_acc: 0.958
Epoch 20 – train_acc: 1.000, val_acc: 0.950
Epoch 21 – train_acc: 1.000, val_acc: 0.967
Epoch 22 – train_acc: 1.000, val_acc: 0.963
Epoch 23 – train_acc: 1.000, val

In [44]:
# 7.a. Load best model
model.load_state_dict(torch.load("best_cnn.pth"))
model.eval()

# 7.b. Predict on test set
all_preds = []
with torch.no_grad():
    for Xb in test_loader:
        Xb = Xb.to(device)
        preds = model(Xb).argmax(1).cpu().numpy() + 1  # back to 1–3 labels
        all_preds.append(preds)
all_preds = np.concatenate(all_preds)

# 7.c. Write submission CSV
submission = pd.DataFrame({"id": test_ids, "y": all_preds})
submission.to_csv("cnn_2.csv", index=False)
print("cnn_2.csv with", len(submission), "rows.")


cnn_2.csv with 1200 rows.


In [43]:
# ... your training loop ...

best_val_acc = 0
for epoch in range(1, 21):
    train_loss, train_acc = train_epoch(train_loader)
    val_loss, val_acc     = eval_epoch(val_loader)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_cnn.pth")
    print(f"Epoch {epoch:02d} – train_acc: {train_acc:.3f}, val_acc: {val_acc:.3f}")

# === ADD THIS AT THE END ===
print(f"\nBest validation accuracy achieved: {best_val_acc*100:.2f}%")


Epoch 01 – train_acc: 0.982, val_acc: 0.958
Epoch 02 – train_acc: 0.974, val_acc: 0.963
Epoch 03 – train_acc: 0.978, val_acc: 0.954
Epoch 04 – train_acc: 0.977, val_acc: 0.967
Epoch 05 – train_acc: 0.973, val_acc: 0.950
Epoch 06 – train_acc: 0.979, val_acc: 0.958
Epoch 07 – train_acc: 0.986, val_acc: 0.942
Epoch 08 – train_acc: 0.983, val_acc: 0.938
Epoch 09 – train_acc: 0.983, val_acc: 0.950
Epoch 10 – train_acc: 0.990, val_acc: 0.942
Epoch 11 – train_acc: 0.992, val_acc: 0.942
Epoch 12 – train_acc: 0.990, val_acc: 0.967
Epoch 13 – train_acc: 0.994, val_acc: 0.946
Epoch 14 – train_acc: 0.994, val_acc: 0.963
Epoch 15 – train_acc: 0.994, val_acc: 0.967
Epoch 16 – train_acc: 0.997, val_acc: 0.912
Epoch 17 – train_acc: 0.991, val_acc: 0.958
Epoch 18 – train_acc: 0.998, val_acc: 0.958
Epoch 19 – train_acc: 0.999, val_acc: 0.958
Epoch 20 – train_acc: 0.997, val_acc: 0.958

Best validation accuracy achieved: 96.67%


In [42]:
# Load best‐weights and re‐eval
model.load_state_dict(torch.load("best_cnn.pth"))
val_loss, val_acc = eval_epoch(val_loader)
print(f"Final validation accuracy (re-loaded best model): {val_acc*100:.2f}%")


Final validation accuracy (re-loaded best model): 97.08%


In [45]:

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# 2. Train/Val split (reusing train_pixels, train_labels from before)
X_tr, X_val, y_tr, y_val = train_test_split(
    train_pixels, train_labels,
    test_size=0.2, stratify=train_labels, random_state=42
)

# 3. Convert to DMatrix
dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval   = xgb.DMatrix(X_val, label=y_val)
dtest  = xgb.DMatrix(test_pixels)

# 4. Set parameters
params = {
    'objective':      'multi:softmax',   # for classification
    'num_class':      3,                 # three classes
    'eval_metric':    'merror',          # multiclass error rate
    'learning_rate':  0.1,
    'max_depth':      6,
    'subsample':      0.8,
    'colsample_bytree': 0.8,
    'seed':           42
}

# 5. Train with early stopping
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=10,
    verbose_eval=10
)

# 6. Validation accuracy
val_preds = bst.predict(dval).astype(int)   # returns 0,1,2
val_acc = accuracy_score(y_val, val_preds)
print(f"XGBoost validation accuracy: {val_acc*100:.2f}%")

# 7. Predict on test set and save submission
test_preds = bst.predict(dtest).astype(int) + 1   # back to labels 1–3

submission_xgb = pd.DataFrame({
    'id':   test_ids,
    'y':    test_preds
})
submission_xgb.to_csv('xgb.csv', index=False)
print("Saved xgb.csv with", len(submission_xgb), "rows.")


[0]	train-merror:0.08750	eval-merror:0.29583
[10]	train-merror:0.00104	eval-merror:0.20833
[20]	train-merror:0.00104	eval-merror:0.18750
[30]	train-merror:0.00000	eval-merror:0.17083
[40]	train-merror:0.00000	eval-merror:0.17500
XGBoost validation accuracy: 82.92%
Saved xgb.csv with 1200 rows.
