<a href="https://colab.research.google.com/github/Su-ok/MT2025124_ML_Project2/blob/main/MultiSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached; the dataset
# path used further down in the notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'

drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
# Silence only deprecation chatter. A blanket "ignore" would also hide
# scikit-learn's ConvergenceWarning, which is exactly the signal needed to
# judge whether the max_iter values tried in the search are large enough.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

np.random.seed(42)

# ==== LOAD DATA ====
# Tunable inputs kept in one place so the notebook can be pointed at another copy
# of the dataset without hunting through the cell.
DATA_PATH = "/content/drive/MyDrive/ML kaggle data/forest_cover/covtype.csv"
TARGET_COLUMN = "Cover_Type"

data = pd.read_csv(DATA_PATH)

print("Dataset shape:", data.shape)
print(data.columns)   # Always verify columns

# Fail fast with a readable message instead of a bare KeyError from drop()/[].
if TARGET_COLUMN not in data.columns:
    raise KeyError(
        f"Expected target column {TARGET_COLUMN!r} in {DATA_PATH}; "
        f"found columns: {list(data.columns)}"
    )

# The estimator used below cannot be fitted on NaNs; surface that here, before
# any expensive work starts.
if data.isna().any().any():
    raise ValueError("Dataset contains missing values; impute or drop them before training.")

# ==== BASIC PREPROCESSING ====
# Features = every column except the target; the target is the class label.
X = data.drop(TARGET_COLUMN, axis=1)
y = data[TARGET_COLUMN]

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Unique classes:", np.unique(y))

# Single seed reused by every stochastic component in this section.
RANDOM_STATE = 42

# ==== TRAIN / VALIDATION SPLIT (STRATIFIED) ====
# Split the RAW features first. Fitting the scaler before splitting would let
# validation rows contribute to the mean/variance used to transform the training
# data, which leaks information and makes the validation score optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

print("X_train:", X_train.shape, "X_val:", X_val.shape)

# ==== DEFINE MULTICLASS SVM ====
# LinearSVC handles multiclass via One-vs-Rest internally.
# Scaling lives inside the pipeline so that, within every CV fold, the scaler is
# fitted on that fold's training part only. random_state is set on the estimator
# itself because np.random.seed() in this process does not make the fits that run
# in the n_jobs=-1 worker processes reproducible.
base_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", LinearSVC(random_state=RANDOM_STATE)),
])

# Pipeline step parameters are addressed as "<step>__<param>".
param_dist = {
    "svm__C": [0.01, 0.1, 1.0],
    "svm__loss": ["hinge", "squared_hinge"],
    "svm__max_iter": [2000, 5000]
}

cv_strat = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

svm_search = RandomizedSearchCV(
    estimator=base_svm,
    param_distributions=param_dist,
    n_iter=6,
    scoring="accuracy",
    n_jobs=-1,
    cv=cv_strat,
    random_state=RANDOM_STATE,
    verbose=2
)

svm_search.fit(X_train, y_train)

# Strip the "svm__" step prefix so the report (and the final refit below) uses
# plain LinearSVC parameter names.
best_svm_params = {
    name.split("__", 1)[1]: value
    for name, value in svm_search.best_params_.items()
}

print("\nBest Parameters Found:")
print(best_svm_params)
print(f"Best CV Accuracy: {svm_search.best_score_:.6f}")

# ==== EVALUATION ====
# best_estimator_ is the whole pipeline refitted on X_train, so it scales the raw
# features itself using training-set statistics only.
best_svm = svm_search.best_estimator_

y_train_pred = best_svm.predict(X_train)
y_val_pred   = best_svm.predict(X_val)

train_acc = accuracy_score(y_train, y_train_pred)
val_acc   = accuracy_score(y_val, y_val_pred)

print(f"\nTraining Accuracy (SVM):   {train_acc:.6f}")
print(f"Validation Accuracy (SVM): {val_acc:.6f}")

# ==== FINAL TRAINING ON FULL DATA ====
# No evaluation happens after this point, so fitting the scaler on every row is
# legitimate here. `scaler` must be reused to transform any new data before it is
# passed to `full_svm`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

full_svm = LinearSVC(
    C=best_svm_params["C"],
    loss=best_svm_params["loss"],
    max_iter=best_svm_params["max_iter"],
    random_state=RANDOM_STATE
)

full_svm.fit(X_scaled, y)

print("\nFinal SVM model trained on full Forest Cover dataset.")

Dataset shape: (581012, 55)
Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
      

In [None]:
# Per-class precision/recall/F1 and the raw confusion counts for the validation
# split, computed up front and then printed under their headings.
val_report = classification_report(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)

for heading, body in (
    ("\nClassification Report:", val_report),
    ("\nConfusion Matrix:", val_confusion),
):
    print(heading)
    print(body)


Classification Report:
              precision    recall  f1-score   support

           1       0.71      0.68      0.69     42368
           2       0.74      0.80      0.76     56661
           3       0.61      0.87      0.72      7151
           4       0.62      0.20      0.30       549
           5       0.56      0.01      0.02      1899
           6       0.43      0.06      0.10      3473
           7       0.68      0.51      0.58      4102

    accuracy                           0.71    116203
   macro avg       0.62      0.45      0.46    116203
weighted avg       0.70      0.71      0.70    116203


Confusion Matrix:
[[28915 12539    45     0     2     0   867]
 [ 9975 45136  1346     1    17    79   107]
 [    0   780  6195    52     0   124     0]
 [    0     0   378   111     0    60     0]
 [   37  1678   160     0    24     0     0]
 [    0  1253  2005    15     0   200     0]
 [ 1972    23    19     0     0     0  2088]]
