In [2]:
# Load libraries
import pandas as pd
from pandas import set_option
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from lazypredict.Supervised import LazyClassifier
from sklearn.inspection import permutation_importance
import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# Load the raw dataset; `df_name` keeps the full list of column names.
data = pd.read_csv("./datasets/covtype.csv")
df_name = data.columns

# Features to be removed before the model
rem = ['Hillshade_3pm', 'Soil_Type7', 'Soil_Type8', 'Soil_Type14', 'Soil_Type15',
       'Soil_Type21', 'Soil_Type25', 'Soil_Type28', 'Soil_Type36', 'Soil_Type37']

# Select the feature columns and drop the unwanted ones in a single
# non-mutating step. Dropping with inplace=True on a `.loc` slice of `data`
# can raise SettingWithCopyWarning and makes the cell non-idempotent.
X = data.loc[:, 'Elevation':'Soil_Type40'].drop(columns=rem)

# Encode the target labels as consecutive integers starting at 0.
Y = le.fit_transform(data['Cover_Type'])

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    shuffle=True,
    test_size=0.25,
    random_state=42,
)
def find_best_k(X, y, max_k=20, cv=5):
    """
    Find the best k value for KNN classifier using cross-validation.

    The data is first split 80/20 (random_state=42). Cross-validation and the
    final fit use only the 80% portion; the remaining 20% is not used by this
    function.

    Parameters:
    - X: Input features
    - y: Target variable
    - max_k: Maximum value of k to try (default: 20); every k in 1..max_k is evaluated
    - cv: Number of folds for cross-validation (default: 5)

    Returns:
    - best_k: Best value of k (the smallest such k when several tie)
    - best_accuracy: Mean cross-validation score obtained with the best k
    - best_model: KNN model with the best k, fitted on the 80% portion

    Raises:
    - ValueError: if max_k is smaller than 1 (there would be no k to evaluate)
    """
    # Fail early with a clear message instead of the opaque
    # "max() arg is an empty sequence" raised after the search loop.
    if max_k < 1:
        raise ValueError("max_k must be at least 1, got {}".format(max_k))

    # Only the training portion of the split is needed below.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Mean cross-validation score for each candidate k, in ascending k order.
    cv_scores = {
        k: np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k),
                                   X_train, y_train, cv=cv))
        for k in range(1, max_k + 1)
    }

    # max() returns the first maximal key, so ties resolve to the smallest k.
    best_k = max(cv_scores, key=cv_scores.get)
    best_accuracy = cv_scores[best_k]

    # Refit a fresh model with the winning k on the whole training portion.
    best_model = KNeighborsClassifier(n_neighbors=best_k)
    best_model.fit(X_train, y_train)

    return best_k, best_accuracy, best_model

In [3]:
# Fit the scaler on the training split only, then apply the same
# transformation to the test split. X_train / X_test are rebound to the
# scaled data, which the cells below rely on.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def report_test_scores(model_name, y_true, y_pred):
    """Print the test accuracy followed by the macro-averaged F1 score.

    Parameters:
    - model_name: Label inserted into the printed message
    - y_true: True target values
    - y_pred: Predicted target values
    """
    print("The test accuracy score of " + model_name + " is ",
          accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average='macro'))


dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
preds_dt = dt_model.predict(X_test)
report_test_scores("Decision Tree", y_test, preds_dt)

lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
preds_lr = lr_model.predict(X_test)
report_test_scores("logistic regression", y_test, preds_lr)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
preds_rf = rf_model.predict(X_test)
# This result was previously printed under the label "Decision Tree".
report_test_scores("Random Forest", y_test, preds_rf)

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=1)
xgb_model.fit(X_train, y_train)
preds_xgb = xgb_model.predict(X_test)
report_test_scores("XGBoost", y_test, preds_xgb)

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
preds_knn = knn_model.predict(X_test)
report_test_scores("knn", y_test, preds_knn)

# instantiating the object and fitting
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# predicting the values
y_pred_svm = svm_model.predict(X_test)

# displaying the test accuracy
report_test_scores("SVM", y_test, y_pred_svm)



The test accuracy score of Decision Tree is  0.9364212787343463 0.8952662102923742
The test accuracy score of logistic regression is  0.717486041596387 0.4908429384284287
The test accuracy score of Decision Tree is  0.9568408225647663 0.9293206264164974
The test accuracy score of XGBoost is  0.8709011173607429 0.8548544114249294


In [1]:
# This cell needs the import cell and the model-training cell to have been run
# in the current kernel: it uses permutation_importance, the fitted models and
# the scaled X_test / y_test.

# Permutation importance on the test split for the two models whose built-in
# attributes are not used here (SVM and KNN). Each feature is shuffled
# n_repeats times, so this is by far the most expensive step of the cell.
perm_importance = permutation_importance(
    svm_model, X_test, y_test, n_repeats=50, random_state=42, n_jobs=-1)
knn_model_perm_importance = permutation_importance(
    knn_model, X_test, y_test, n_repeats=50, random_state=42, n_jobs=-1)

# Get feature importances
rf_importances = rf_model.feature_importances_
xgb_importances = xgb_model.feature_importances_
svm_importances = perm_importance
# coef_ holds one row of coefficients per class for a multiclass fit (a single
# row for a binary one). Average the absolute values over all rows so every
# class contributes, instead of reading only the first row.
logireg_importances = np.abs(lr_model.coef_).mean(axis=0)

# Create a DataFrame with feature importances, one row per feature in X.
# Column labels are kept exactly as before in case other cells index by them.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest': rf_importances,
    'DecissionTree': dt_model.feature_importances_,
    'XGBoost': xgb_importances,
    'Logistice Regression': logireg_importances,
    'Support Vector Mcahine': svm_importances.importances_mean,
    'K Nearest Neighbor': knn_model_perm_importance.importances_mean,
})

display(feature_importances)

NameError: name 'permutation_importance' is not defined