# Environment setup

In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

# NOTE: the star import stays between the two groups on purpose. Moving it would
# change which names it can shadow / be shadowed by.
from utils import *

from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [4]:
# Colab-only: mount Google Drive at /content/drive so the dataset path below resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Root of the training images on the mounted Drive; get_images() walks it
# recursively and uses each image's parent folder name as its label.
root_dir =  Path("/content/drive/MyDrive/281_final_project/plane_classification_data/Training")

# Load data

In [6]:
def get_images(root_dir, exts=(".jpg",)):
  """Recursively collect image files under ``root_dir`` with their class labels.

  The label of an image is the name of the directory that directly contains it
  (e.g. "notumor", "pituitary", ...).

  Args:
    root_dir: Directory to search (``str`` or ``pathlib.Path``).
    exts: Iterable of file suffixes to accept, including the leading dot.
      Matching is case-insensitive. Defaults to JPEG files with a ``.jpg``
      suffix only.

  Returns:
    ``(image_paths, labels)``: two equally long lists. ``image_paths`` holds
    ``Path`` objects in sorted order and ``labels[i]`` is the parent-folder
    name of ``image_paths[i]``.
  """
  root_dir = Path(root_dir)
  wanted = {ext.lower() for ext in exts}
  # rglob() yields entries in filesystem order, which differs between machines
  # and mounts. Sorting makes the sample order, and therefore any seeded split
  # performed on it, reproducible.
  image_paths = sorted(
      p for p in root_dir.rglob("*")
      if p.is_file() and p.suffix.lower() in wanted  # skip dirs named like images
  )
  labels = [p.parent.name for p in image_paths]
  return image_paths, labels

# Index the training set once; both lists are index-aligned.
image_paths, labels = get_images(root_dir=root_dir)
print(f"Total images found: {len(image_paths)}")

Total images found: 5722


# Features

In [None]:
def canny_edge():
  """Placeholder for the Canny edge feature extractor (not implemented yet).

  Takes no arguments and returns ``None``.
  """
  return None

In [None]:
def dog_features():
  """Placeholder for the DoG feature extractor (not implemented yet).

  Takes no arguments and returns ``None``.
  """
  return None

In [None]:
def compplex_features():
  """Placeholder for additional, more complex features (not implemented yet).

  Takes no arguments and returns ``None``.
  """
  return None

In [None]:
def preprocess_img_and_apply_feats():
  """Placeholder for the combined preprocessing + feature pipeline.

  Not implemented yet; takes no arguments and returns ``None``.

  Planned steps (TODO):
    1. Call the preprocessing function from utils.
    2. Apply the Canny function (img_canny = canny_edge(img)).
    3. Apply the DoG function (img_dog = dog_edge(img)).
    4. Possibly apply both (img_combined = dog_edge(img_canny)).
    5. Apply any other features.
    6. Return np.concatenate([img_canny, img_dog, etc.]).
  """
  return None

# Preprocess
TODO: change the loop that builds `X` so that it preprocesses each image and applies the feature extractors before stacking. Everything downstream should stay the same.



In [7]:
# Preprocess every image and stack the results along a new leading axis.
# TODO: switch to preprocess_img_and_apply_feats once the features are in.
X = np.stack([
    preprocess_image(str(img_path))
    for img_path in tqdm(image_paths, desc="Pre‑processing images", unit="img")
])

Pre‑processing images: 100%|██████████| 5722/5722 [49:38<00:00,  1.92img/s]


In [8]:
# Encode the folder-name labels as integers; `le` is kept to map codes back to names.
le = LabelEncoder()
Y = le.fit_transform(labels)

In [9]:
# Show which integer code each class name was assigned.
print({cls_name: code for cls_name, code in zip(le.classes_, le.transform(le.classes_))})

{np.str_('glioma'): np.int64(0), np.str_('meningioma'): np.int64(1), np.str_('notumor'): np.int64(2), np.str_('pituitary'): np.int64(3)}


In [10]:
# 80/20 hold-out split; stratifying on Y keeps the class proportions identical
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y,
)

In [27]:
# Flatten each sample to a 1-D feature vector (one row per image).
# Re-running this cell is harmless: an already 2-D array keeps its shape.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [11]:
# Shuffled, seeded 5-fold stratified splitter (used by the stacking ensemble).
cv5 = StratifiedKFold(5, shuffle=True, random_state=42)

# Scaling / PCA

In [19]:
# Named (step_name, transformer) pairs reused by the SVM pipelines:
# standardise the features, then project onto 512 principal components.
scaler = ('scaler', StandardScaler())
pca512 = ('pca', PCA(512, whiten=False, random_state=42))

# Linear SVM

In [20]:
# Scale -> PCA -> linear SVM.
# LinearSVC only implements decision_function, not predict_proba, so on its own
# it cannot be used in a soft-voting ensemble or in a stacker configured with
# stack_method='predict_proba'. Wrapping it in CalibratedClassifierCV (sigmoid /
# Platt scaling, 3 internal folds) adds calibrated class probabilities.
linear_pipeline = Pipeline([
    scaler,
    pca512,
    ('linsvm', CalibratedClassifierCV(
        LinearSVC(C=1.0, class_weight='balanced', random_state=42),
        method='sigmoid',
        cv=3,
    )),
])

# RBF SVM (non-linear)

In [21]:
# Scale -> PCA -> RBF-kernel SVM. probability=True enables predict_proba,
# which the ensembles below consume.
svc_pipline = Pipeline(steps=[
    scaler,
    pca512,
    ('rbf', SVC(
        kernel='rbf',
        C=10.0,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=42,
    )),
])

# Random Forest

In [22]:
# 500 unrestricted-depth trees with balanced class weights, on all CPU cores.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# XGBoost

In [23]:
# Gradient-boosted trees emitting per-class probabilities for the 4 classes.
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    eval_metric='mlogloss',
    n_estimators=400,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Base estimators

In [24]:
# (name, estimator) pairs shared by both ensembles. The order matters: the
# soft-voting weights are matched to these entries by position.
base_estimators = list(zip(
    ('svm_lin', 'svm_rbf', 'rf', 'xgb'),
    (linear_pipeline, svc_pipline, rf, xgb),
))

In [None]:
# Soft voting: weighted average of the base models' predicted probabilities.
# Weights follow the order of base_estimators; tweak if one model dominates.
soft_vote = VotingClassifier(
    base_estimators,
    voting='soft',
    weights=[1, 2, 1, 2],
    n_jobs=-1,
)
soft_vote.fit(X_train, Y_train)

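# Stacked ensemble
The base models' class probabilities, produced out-of-fold with 5-fold cross-validation, are fed to a logistic-regression meta-learner.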

In [None]:
# Stacking: out-of-fold class probabilities from every base estimator (folds
# from cv5) are the inputs of a logistic-regression meta-learner.
# Every base estimator must expose predict_proba for stack_method='predict_proba'.
stack = StackingClassifier(
    estimators = base_estimators,
    # `multi_class` is deprecated in LogisticRegression (scikit-learn >= 1.5) and
    # is therefore not passed: the default solver already fits a multinomial
    # model on multi-class targets. L2 regularisation is the default penalty.
    final_estimator= LogisticRegression(C=1.0, max_iter=500),
    stack_method = 'predict_proba',   # pass probs to meta layer
    passthrough = False,              # meta layer sees only base predictions
    cv = cv5
)
stack.fit(X_train, Y_train)

# Evaluation

In [None]:
def evaluate(model, name, X=None, Y=None, class_names=None):
    """Print balanced accuracy and a per-class report for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Display name printed in front of the score.
        X: Feature matrix to score on. Defaults to the notebook-level ``X_test``.
        Y: True integer-encoded labels. Defaults to the notebook-level ``Y_test``.
        class_names: Display names ordered by encoded label (index ``i`` names
            label ``i``). Defaults to ``le.classes_``, i.e. the exact order the
            ``LabelEncoder`` assigned.

    Returns:
        None. Results are printed.
    """
    X_eval = X_test if X is None else X
    Y_eval = Y_test if Y is None else Y
    if class_names is None:
        # Take the names from the encoder instead of hard-coding them: a
        # hand-written list in a different order mislabels every report row.
        class_names = [str(c) for c in le.classes_]

    Y_pred = model.predict(X_eval)
    bal_acc = balanced_accuracy_score(Y_eval, Y_pred)
    print(f'\n{name}: Balanced accuracy = {bal_acc:0.4f}')
    print(classification_report(
        Y_eval, Y_pred,
        labels=list(range(len(class_names))),  # pin row i to encoded label i
        target_names=class_names,
    ))

# Score both ensembles on the held-out test split.
evaluate(model=soft_vote, name='Soft‑Voting')
evaluate(model=stack, name='Stacked‑LogReg')