In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np


In [5]:
def selectkbest(indep_X, dep_Y, n):
    """Keep the ``n`` highest-scoring features according to the chi-squared test.

    Parameters
    ----------
    indep_X : feature matrix handed to ``SelectKBest``.
    dep_Y : target values the features are scored against.
    n : number of features to keep (passed as ``k``).

    Returns
    -------
    The transformed feature matrix containing only the selected columns.
    Column names are not returned.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    # Fit and reduce in a single step; equivalent to fit() followed by transform().
    return selector.fit_transform(indep_X, dep_Y)

def split_scalar(indep_X, dep_Y):
    """Split the data 75/25 into train/test sets and standardise the features.

    The split uses ``random_state=0``. The scaler is fitted on the training
    partition only and then applied to both partitions, so no statistics from
    the test partition are used for scaling.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature sets scaled
        and the targets left untouched.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    # Learn the scaling parameters from the training partition alone.
    scaler = StandardScaler().fit(features_train)

    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        target_train,
        target_test,
    )

def train_and_get_feature_importance(classifier, X_train, y_train):
    """Fit ``classifier`` and return one importance score per feature.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)``. After fitting it may expose
        ``feature_importances_`` or ``coef_``.
    X_train, y_train : training features and targets passed straight to ``fit``.

    Returns
    -------
    numpy.ndarray or None
        * ``classifier.feature_importances_`` when that attribute exists;
        * otherwise the absolute value of ``coef_``, averaged over its rows
          (one row per class), so the result always has one entry per feature;
        * ``None`` when the fitted estimator exposes neither attribute.

    Notes
    -----
    For a single-row ``coef_`` (the binary case) the result equals
    ``np.abs(coef_[0])``. Multi-row coefficients are averaged instead of
    silently using only the first class, and a 1-D ``coef_`` yields a vector
    rather than a scalar.
    """
    classifier.fit(X_train, y_train)

    if hasattr(classifier, 'feature_importances_'):
        return classifier.feature_importances_

    if hasattr(classifier, 'coef_'):
        coefficients = classifier.coef_
        # Some estimators may hand back a sparse matrix; densify anything
        # that offers toarray() before doing element-wise maths.
        if hasattr(coefficients, 'toarray'):
            coefficients = coefficients.toarray()
        # Normalise to (n_rows, n_features) so 1-D and 2-D coef_ share one path.
        coefficients = np.atleast_2d(np.asarray(coefficients))
        return np.abs(coefficients).mean(axis=0)

    return None

def calculate_feature_importance(X_train, y_train, X_test, selected_features):
    """Train several classifiers and report their per-feature importances.

    Each classifier is fitted through ``train_and_get_feature_importance``.
    Classifiers for which that helper returns ``None`` are skipped; in the
    recorded run only Random Forest, Logistic Regression, Decision Tree and
    the linear SVM produced a ranking.

    Parameters
    ----------
    X_train, y_train : training data used to fit every classifier.
    X_test : unused; kept so existing callers keep working.
    selected_features : unused; kept so existing callers keep working. The
        ranking is driven by the importance vector itself.

    Returns
    -------
    dict
        Maps classifier name to its importance array. Feature positions refer
        to the columns of ``X_train``.

    Side effects
    ------------
    Prints a descending feature ranking for every classifier that yields
    importances.
    """
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=0),
        "Logistic Regression": LogisticRegression(),
        # Seeded so the tree, and therefore its ranking, is reproducible.
        "Decision Tree": DecisionTreeClassifier(random_state=0),
        "SVM Linear": SVC(kernel='linear'),
        "SVM Non-linear": SVC(kernel='rbf'),
        "KNN": KNeighborsClassifier()
    }

    feature_importances_dict = {}

    for clf_name, clf in classifiers.items():
        feature_importances = train_and_get_feature_importance(clf, X_train, y_train)
        if feature_importances is None:
            # This model exposes no importance measure; nothing to rank.
            continue

        feature_importances_dict[clf_name] = feature_importances

        # Feature positions ordered from most to least important.
        indices = np.argsort(feature_importances)[::-1]

        print(f"\nFeature ranking for {clf_name}:")
        # Iterate over the vector being ranked so the loop bound can never
        # disagree with the number of importances available.
        for rank, feature_idx in enumerate(indices, start=1):
            print("%d. feature %d (%f)" % (rank, feature_idx, feature_importances[feature_idx]))

    return feature_importances_dict



In [9]:
# Load dataset and one-hot encode categorical columns
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)
indep_X = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

# Perform SelectKBest feature selection
N_SELECTED_FEATURES = 4
selected_features = selectkbest(indep_X, dep_Y, N_SELECTED_FEATURES)

# selectkbest() returns only the reduced value matrix, so fit an identically
# configured selector to recover WHICH original columns were kept. The mask
# preserves the original column order, matching the column order of
# selected_features ("feature 0" is selected_columns[0], and so on).
selected_mask = (
    SelectKBest(score_func=chi2, k=N_SELECTED_FEATURES)
    .fit(indep_X, dep_Y)
    .get_support()
)
selected_columns = indep_X.columns[selected_mask]

# Split and scale dataset
X_train, X_test, y_train, y_test = split_scalar(selected_features, dep_Y)

# Calculate and print feature importance for various classifiers
feature_importances_dict = calculate_feature_importance(X_train, y_train, X_test, selected_features)

# One row per selected feature, one column per classifier. The index must be
# the selected column names only: indexing by every column of indep_X does
# not match the number of importances and mislabels the rows.
feature_importances_df = pd.DataFrame(feature_importances_dict, index=selected_columns)

# Display feature importances DataFrame
print("\nFeature importances DataFrame:")
print(feature_importances_df)


Feature ranking for Random Forest:
1. feature 2 (0.489312)
2. feature 0 (0.216364)
3. feature 1 (0.176700)
4. feature 3 (0.117624)

Feature ranking for Logistic Regression:
1. feature 2 (3.517234)
2. feature 0 (1.858775)
3. feature 1 (1.007374)
4. feature 3 (0.443108)

Feature ranking for Decision Tree:
1. feature 2 (0.640780)
2. feature 3 (0.138889)
3. feature 0 (0.134131)
4. feature 1 (0.086200)

Feature ranking for SVM Linear:
1. feature 2 (4.598870)
2. feature 0 (1.317196)
3. feature 1 (0.866651)
4. feature 3 (0.230976)


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
