In [None]:
!pip install xgboost



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Load the covtype dataset through scikit-learn's fetcher.
data = fetch_covtype()

# Feature matrix as a DataFrame (columns get positional integer names).
X = pd.DataFrame(data.data)
# Shift every label down by one before wrapping it in a Series.
y = pd.Series(data.target - 1)

print(f"Shape of dataset: {X.shape}")
print(f"Number of classes: {y.nunique()}")

Shape of dataset: (581012, 54)
Number of classes: 7


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Training size: {X_train.shape}")
print(f"Testing size: {X_test.shape}")

Training size: (464809, 54)
Testing size: (116203, 54)


In [None]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both splits (the names are rebound to the
# scaled versions, as downstream cells expect).
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Number of samples per class label over the full dataset.
class_counts = y.value_counts()
print(class_counts)

1    283301
0    211840
2     35754
6     20510
5     17367
4      9493
3      2747
Name: count, dtype: int64


In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report its test-set metrics.

    The estimator is fitted in place, so the object passed in by the caller is
    trained once this function returns.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    model_name : str
        Label used in the printed report and in the returned record.

    Returns
    -------
    dict
        One record with the keys ``"Model"``, ``"Accuracy"``, ``"AUC"``,
        ``"Precision"``, ``"Recall"``, ``"F1"`` and ``"MCC"``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Class-membership probabilities, required for the multi-class AUC.
    y_prob = model.predict_proba(X_test)

    # Pin the column order of y_prob to the classes the model was trained on
    # instead of letting roc_auc_score infer it from the labels in y_test.
    # Falls back to inference (labels=None) if the model has no `classes_`.
    class_labels = getattr(model, "classes_", None)

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr', labels=class_labels)
    # zero_division=0 yields the same values as the default but without the
    # UndefinedMetricWarning when some class is never predicted.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"\n{model_name} Results")
    print("Accuracy:", accuracy)
    print("AUC:", auc)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("MCC:", mcc)

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "AUC": auc,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "MCC": mcc
    }

In [None]:
# `multi_class` is left at its default: 'auto' already is the default, and
# passing the argument explicitly is deprecated in recent scikit-learn releases.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_model = LogisticRegression(max_iter=1000)

log_results = evaluate_model(
    log_model, X_train, X_test, y_train, y_test,
    "Logistic Regression"
)




Logistic Regression Results
Accuracy: 0.7235011144290595
AUC: 0.936255553243064
Precision: 0.7109395984401812
Recall: 0.7235011144290595
F1 Score: 0.7138468586463106
MCC: 0.5469093890790228

Confusion Matrix:
[[29534 12009    15     0     0    14   796]
 [10224 45330   718     3    44   316    26]
 [    0   727  5730   138     9   547     0]
 [    0     3   233   240     0    73     0]
 [    5  1818    56     0    10    10     0]
 [    0   811  1704    12     4   942     0]
 [ 1775    40     0     0     0     0  2287]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.70      0.70     42368
           1       0.75      0.80      0.77     56661
           2       0.68      0.80      0.73      7151
           3       0.61      0.44      0.51       549
           4       0.15      0.01      0.01      1899
           5       0.50      0.27      0.35      3473
           6       0.74      0.56      0.63      4102

    accuracy    

In [None]:
# Decision tree with default hyperparameters; the seed fixes its randomness.
dt_model = DecisionTreeClassifier(random_state=42)

dt_results = evaluate_model(
    model=dt_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Decision Tree",
)


Decision Tree Results
Accuracy: 0.9388656058793663
AUC: 0.9451711886169536
Precision: 0.9388560384860974
Recall: 0.9388656058793663
F1 Score: 0.9388590961442232
MCC: 0.9018675784293675

Confusion Matrix:
[[39735  2403     2     0    34     4   190]
 [ 2374 53739   154     1   241   123    29]
 [    0   162  6653    61    19   256     0]
 [    0     1    67   469     0    12     0]
 [   34   239    19     0  1604     3     0]
 [   12   123   276    28     4  3030     0]
 [  200    32     0     0     1     0  3869]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     42368
           1       0.95      0.95      0.95     56661
           2       0.93      0.93      0.93      7151
           3       0.84      0.85      0.85       549
           4       0.84      0.84      0.84      1899
           5       0.88      0.87      0.88      3473
           6       0.95      0.94      0.94      4102

    accuracy         

In [None]:
# k-nearest-neighbours classifier voting over the 5 closest training points.
knn_model = KNeighborsClassifier(n_neighbors=5)

knn_results = evaluate_model(
    model=knn_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="KNN",
)


KNN Results
Accuracy: 0.9284355825581095
AUC: 0.9838703110491823
Precision: 0.9282035795354151
Recall: 0.9284355825581095
F1 Score: 0.9282421280070416
MCC: 0.8848919978950537

Confusion Matrix:
[[39238  2879     4     0    33     8   206]
 [ 2454 53651   147     0   211   161    37]
 [   10   193  6491    50    15   392     0]
 [    0     3   109   396     0    41     0]
 [   51   369    17     0  1450    12     0]
 [   14   184   412    23    10  2830     0]
 [  235    35     0     0     1     0  3831]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     42368
           1       0.94      0.95      0.94     56661
           2       0.90      0.91      0.91      7151
           3       0.84      0.72      0.78       549
           4       0.84      0.76      0.80      1899
           5       0.82      0.81      0.82      3473
           6       0.94      0.93      0.94      4102

    accuracy                   

In [None]:
# Gaussian naive Bayes with default settings.
nb_model = GaussianNB()

nb_results = evaluate_model(
    model=nb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes",
)


Naive Bayes Results
Accuracy: 0.08947273306196914
AUC: 0.7969954462965172
Precision: 0.4939779832625105
Recall: 0.08947273306196914
F1 Score: 0.05772255648495189
MCC: 0.0688227757404189

Confusion Matrix:
[[ 1043   103   423     0 22773   321 17705]
 [ 5460   524  4800   630 37977   478  6792]
 [    0     0  2875  4229    32    15     0]
 [    0     0     0   549     0     0     0]
 [    0     2   454     0  1410     8    25]
 [    0     0  1083  1929   267   190     4]
 [   12     0    19     0   265     0  3806]]

Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.02      0.04     42368
           1       0.83      0.01      0.02     56661
           2       0.30      0.40      0.34      7151
           3       0.07      1.00      0.14       549
           4       0.02      0.74      0.04      1899
           5       0.19      0.05      0.08      3473
           6       0.13      0.93      0.23      4102

    accuracy        

In [None]:
# Forest of 100 trees with a fixed seed; n_jobs=-1 asks scikit-learn to use
# all available CPU cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

rf_results = evaluate_model(
    model=rf_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)


Random Forest Results
Accuracy: 0.953271430169617
AUC: 0.9977961787247037
Precision: 0.9534016359304045
Recall: 0.953271430169617
F1 Score: 0.9530312754596622
MCC: 0.9248201878555364

Confusion Matrix:
[[39900  2366     1     0     6     2    93]
 [ 1305 55110   106     1    56    69    14]
 [    0   122  6859    23     7   140     0]
 [    0     0    57   471     0    21     0]
 [   25   385    20     0  1460     9     0]
 [    5    91   258    18     5  3096     0]
 [  197    28     0     0     0     0  3877]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     42368
           1       0.95      0.97      0.96     56661
           2       0.94      0.96      0.95      7151
           3       0.92      0.86      0.89       549
           4       0.95      0.77      0.85      1899
           5       0.93      0.89      0.91      3473
           6       0.97      0.95      0.96      4102

    accuracy           

In [None]:
# Multi-class gradient boosting that outputs per-class probabilities.
# The `use_label_encoder` flag is intentionally not passed: the installed
# XGBoost ignores it and emits a "Parameters ... are not used" warning.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=7,
    eval_metric='mlogloss'
)

xgb_results = evaluate_model(
    xgb_model, X_train, X_test, y_train, y_test,
    "XGBoost"
)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Results
Accuracy: 0.8681531457879745
AUC: 0.9862155199278241
Precision: 0.8683038628295605
Recall: 0.8681531457879745
F1 Score: 0.8674471640522218
MCC: 0.7871320508030843

Confusion Matrix:
[[35455  6709     5     0    15     3   181]
 [ 5306 50779   287     1   122   142    24]
 [    3   288  6534    40     0   286     0]
 [    0     0    56   471     0    22     0]
 [   19   723    36     0  1108    13     0]
 [    5   218   461    21     1  2767     0]
 [  323    11     0     0     0     0  3768]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85     42368
           1       0.86      0.90      0.88     56661
           2       0.89      0.91      0.90      7151
           3       0.88      0.86      0.87       549
           4       0.89      0.58      0.70      1899
           5       0.86      0.80      0.83      3473
           6       0.95      0.92      0.93      4102

    accuracy               

In [None]:
# One row per model, in the order the models were trained.
metric_records = [
    log_results, dt_results, knn_results,
    nb_results, rf_results, xgb_results,
]
results = pd.DataFrame(metric_records)

results

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.723501,0.936256,0.71094,0.723501,0.713847,0.546909
1,Decision Tree,0.938866,0.945171,0.938856,0.938866,0.938859,0.901868
2,KNN,0.928436,0.98387,0.928204,0.928436,0.928242,0.884892
3,Naive Bayes,0.089473,0.796995,0.493978,0.089473,0.057723,0.068823
4,Random Forest,0.953271,0.997796,0.953402,0.953271,0.953031,0.92482
5,XGBoost,0.868153,0.986216,0.868304,0.868153,0.867447,0.787132


In [None]:
import os

# Create the artifact directory (and its parent); a no-op if it already exists.
model_dir = "project-folder/model"
os.makedirs(model_dir, exist_ok=True)

In [None]:
!pip install joblib



In [None]:
import joblib

In [None]:
# Fitted estimators paired with the file each one is serialized to.
model_artifacts = [
    (log_model, "project-folder/model/logistic_regression.pkl"),
    (dt_model, "project-folder/model/decision_tree.pkl"),
    (knn_model, "project-folder/model/knn.pkl"),
    (nb_model, "project-folder/model/naive_bayes.pkl"),
    (rf_model, "project-folder/model/random_forest.pkl"),
    (xgb_model, "project-folder/model/xgboost.pkl"),
]

for estimator, target_path in model_artifacts:
    written = joblib.dump(estimator, target_path)

# Show the path list returned by the final dump as the cell's output.
written

['project-folder/model/xgboost.pkl']

In [None]:
# Persist the fitted scaler next to the models so the same preprocessing can be
# reapplied when the models are loaded.
scaler_path = "project-folder/model/scaler.pkl"
joblib.dump(scaler, scaler_path)

['project-folder/model/scaler.pkl']

In [None]:
# Write the metric comparison table to disk without the row index column.
comparison_csv = "project-folder/model/model_comparison.csv"
results.to_csv(comparison_csv, index=False)

In [None]:
# X_test is the array produced by scaler.transform, so this frame gets a fresh
# 0..n-1 row index.
test_data = pd.DataFrame(X_test)

# y_test still carries the row labels from before the split. Assigning the
# Series directly would align on those labels and leave most rows NaN or
# mislabelled, so assign the underlying values positionally instead.
# Adding 1 undoes the earlier `target - 1` shift.
test_data["Cover_Type"] = y_test.to_numpy() + 1

test_data.to_csv("project-folder/sample_test_data.csv", index=False)

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at the given path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)