In [1]:
# 🌲 SVM Lab: Forest Cover Type — Linear vs. RBF Kernel
# Dataset: UCI Forest Cover Type (sampled to 10,000 rows)
# Goal: Compare the effect of regularization and kernels

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 📥 Load the dataset (UCI, compressed CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
df = pd.read_csv(url, header=None)

# 🖼️ Show some sample data (first 5 rows)
print("📊 Sample Raw Data (before preprocessing):")
print(df.head())

# 🎯 Target: last column (cover type), features = all others
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# 🔀 Shuffle and sample 50,000 rows for speed
X, y = shuffle(X, y, random_state=42)
X = X[:10000]
y = y[:10000]

# 🔍 Standardize features (important for SVM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 🔀 Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---------------------------------------------------
# ✅ Default setup: Linear kernel with fixed C
# ---------------------------------------------------
C = 1.0

# TODO 1: Try different values of C (e.g., 0.1, 1, 10, 30)
C_values = [0.01, 0.1, 1, 10, 30]
for C in C_values:
    model = SVC(kernel='linear', C=C)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nLinear Kernel — C={C}")
    print(classification_report(y_test, y_pred))

# TODO 2: Change kernel to 'rbf' and observe performance with default gamma
model = SVC(kernel='rbf', C=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("\nRBF Kernel (default gamma)")
print(classification_report(y_test, y_pred))


# TODO 3: Try RBF kernel with different gamma values (e.g., 0.01, 0.1, 1)
# for gamma in [0.01, 0.1, 1]:
#    model = SVC(kernel='rbf', C=C, gamma=gamma)
#    model.fit(X_train, y_train)
#    y_pred = model.predict(X_test)
#    print(f"\nRBF Kernel — C={C}, gamma={gamma}")
#    print(classification_report(y_test, y_pred))

# ✅ Default model: Linear kernel with C = 1.0
model = SVC(kernel='linear', C=C)
model.fit(X_train, y_train)

# 🎯 Evaluate
y_pred = model.predict(X_test)
print(f"\nSVM Classification Report — Kernel: {model.kernel}, C={C}")
print(classification_report(y_test, y_pred))


📊 Sample Raw Data (before preprocessing):
     0    1   2    3    4     5    6    7    8     9   ...  45  46  47  48  \
0  2596   51   3  258    0   510  221  232  148  6279  ...   0   0   0   0   
1  2590   56   2  212   -6   390  220  235  151  6225  ...   0   0   0   0   
2  2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   0   
3  2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   0   
4  2595   45   2  153   -1   391  220  234  150  6172  ...   0   0   0   0   

   49  50  51  52  53  54  
0   0   0   0   0   0   5  
1   0   0   0   0   0   5  
2   0   0   0   0   0   2  
3   0   0   0   0   0   2  
4   0   0   0   0   0   5  

[5 rows x 55 columns]

Linear Kernel — C=0.01
              precision    recall  f1-score   support

           1       0.68      0.71      0.69      1073
           2       0.74      0.76      0.75      1468
           3       0.59      0.89      0.71       185
           4       0.00      0.00      0.00        10
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Linear Kernel — C=0.1
              precision    recall  f1-score   support

           1       0.67      0.71      0.69      1073
           2       0.74      0.76      0.75      1468
           3       0.62      0.90      0.73       185
           4       0.00      0.00      0.00        10
           5       0.00      0.00      0.00        54
           6       0.33      0.02      0.04       101
           7       0.68      0.57      0.62       109

    accuracy                           0.70      3000
   macro avg       0.44      0.42      0.40      3000
weighted avg       0.68      0.70      0.68      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Linear Kernel — C=1
              precision    recall  f1-score   support

           1       0.67      0.71      0.69      1073
           2       0.75      0.76      0.75      1468
           3       0.63      0.89      0.74       185
           4       0.33      0.10      0.15        10
           5       0.00      0.00      0.00        54
           6       0.50      0.11      0.18       101
           7       0.69      0.56      0.62       109

    accuracy                           0.71      3000
   macro avg       0.51      0.45      0.45      3000
weighted avg       0.69      0.71      0.69      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Linear Kernel — C=10
              precision    recall  f1-score   support

           1       0.67      0.71      0.69      1073
           2       0.75      0.76      0.75      1468
           3       0.64      0.87      0.74       185
           4       0.43      0.30      0.35        10
           5       0.00      0.00      0.00        54
           6       0.50      0.17      0.25       101
           7       0.73      0.56      0.63       109

    accuracy                           0.71      3000
   macro avg       0.53      0.48      0.49      3000
weighted avg       0.69      0.71      0.69      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Linear Kernel — C=30
              precision    recall  f1-score   support

           1       0.67      0.71      0.69      1073
           2       0.75      0.76      0.75      1468
           3       0.65      0.88      0.75       185
           4       0.67      0.40      0.50        10
           5       0.00      0.00      0.00        54
           6       0.53      0.19      0.28       101
           7       0.73      0.56      0.63       109

    accuracy                           0.71      3000
   macro avg       0.57      0.50      0.51      3000
weighted avg       0.69      0.71      0.70      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



RBF Kernel (default gamma)
              precision    recall  f1-score   support

           1       0.72      0.73      0.72      1073
           2       0.76      0.79      0.78      1468
           3       0.63      0.89      0.74       185
           4       0.00      0.00      0.00        10
           5       0.00      0.00      0.00        54
           6       0.45      0.10      0.16       101
           7       0.71      0.64      0.68       109

    accuracy                           0.73      3000
   macro avg       0.47      0.45      0.44      3000
weighted avg       0.71      0.73      0.71      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM Classification Report — Kernel: linear, C=30
              precision    recall  f1-score   support

           1       0.67      0.71      0.69      1073
           2       0.75      0.76      0.75      1468
           3       0.65      0.88      0.75       185
           4       0.67      0.40      0.50        10
           5       0.00      0.00      0.00        54
           6       0.53      0.19      0.28       101
           7       0.73      0.56      0.63       109

    accuracy                           0.71      3000
   macro avg       0.57      0.50      0.51      3000
weighted avg       0.69      0.71      0.70      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


�
�
 ️ TODO 3: Experiment with Gamma (RBF Kernel)

In [2]:
# 🌲 SVM Lab: Forest Cover Type — Linear vs. RBF Kernel
# Dataset: UCI Forest Cover Type (sampled to 10,000 rows)
# Goal: Compare the effect of regularization and kernels

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 📥 Load the dataset (UCI, compressed CSV)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
df = pd.read_csv(url, header=None)

# 🖼️ Show some sample data (first 5 rows)
print("📊 Sample Raw Data (before preprocessing):")
print(df.head())

# 🎯 Target: last column (cover type), features = all others
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# 🔀 Shuffle and sample 50,000 rows for speed
X, y = shuffle(X, y, random_state=42)
X = X[:10000]
y = y[:10000]

# 🔍 Standardize features (important for SVM)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 🔀 Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---------------------------------------------------
# ✅ Default setup: Linear kernel with fixed C
# ---------------------------------------------------
C = 1.0

# TODO 1: Try different values of C (e.g., 0.1, 1, 10, 30)
# C_values = [0.01, 0.1, 1, 10, 30]
# for C in C_values:
#     model = SVC(kernel='linear', C=C)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"\nLinear Kernel — C={C}")
#     print(classification_report(y_test, y_pred))

# TODO 2: Change kernel to 'rbf' and observe performance with default gamma
#     model = SVC(kernel='rbf', C=C)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"\nRBF Kernel (default gamma) — C={C}")
#     print(classification_report(y_test, y_pred))

for gamma in [0.01, 0.1, 1]:
    model = SVC(kernel='rbf', C=1.0, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nRBF Kernel — gamma={gamma}")
    print(classification_report(y_test, y_pred))

# ✅ Default model: Linear kernel with C = 1.0
model = SVC(kernel='linear', C=C)
model.fit(X_train, y_train)

# 🎯 Evaluate
y_pred = model.predict(X_test)
print(f"\nSVM Classification Report — Kernel: {model.kernel}, C={C}")
print(classification_report(y_test, y_pred))


📊 Sample Raw Data (before preprocessing):
     0    1   2    3    4     5    6    7    8     9   ...  45  46  47  48  \
0  2596   51   3  258    0   510  221  232  148  6279  ...   0   0   0   0   
1  2590   56   2  212   -6   390  220  235  151  6225  ...   0   0   0   0   
2  2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   0   
3  2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   0   
4  2595   45   2  153   -1   391  220  234  150  6172  ...   0   0   0   0   

   49  50  51  52  53  54  
0   0   0   0   0   0   5  
1   0   0   0   0   0   5  
2   0   0   0   0   0   2  
3   0   0   0   0   0   2  
4   0   0   0   0   0   5  

[5 rows x 55 columns]

RBF Kernel — gamma=0.01
              precision    recall  f1-score   support

           1       0.71      0.72      0.71      1073
           2       0.76      0.78      0.77      1468
           3       0.59      0.90      0.72       185
           4       0.00      0.00      0.00        10
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



RBF Kernel — gamma=0.1
              precision    recall  f1-score   support

           1       0.75      0.75      0.75      1073
           2       0.78      0.82      0.80      1468
           3       0.70      0.82      0.76       185
           4       0.00      0.00      0.00        10
           5       0.75      0.06      0.10        54
           6       0.57      0.32      0.41       101
           7       0.73      0.66      0.69       109

    accuracy                           0.76      3000
   macro avg       0.61      0.49      0.50      3000
weighted avg       0.75      0.76      0.75      3000


RBF Kernel — gamma=1
              precision    recall  f1-score   support

           1       0.79      0.69      0.74      1073
           2       0.69      0.88      0.77      1468
           3       0.79      0.46      0.58       185
           4       1.00      0.10      0.18        10
           5       0.71      0.09      0.16        54
           6       0.67      0.1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
