In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np




In [2]:

# Function to split and scale the dataset
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# Function to train various classifiers and calculate feature importance
def train_and_get_feature_importance(classifier, X_train, y_train):
    classifier.fit(X_train, y_train)
    if hasattr(classifier, 'feature_importances_'):
        return classifier.feature_importances_
    elif hasattr(classifier, 'coef_'):
        return np.abs(classifier.coef_[0])
    else:
        return None

def calculate_feature_importance(X_train, y_train, X_test, selected_features):
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=0),
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "SVM Linear": SVC(kernel='linear'),
        "SVM Non-linear": SVC(kernel='rbf'),
        "KNN": KNeighborsClassifier()
    }
    
    feature_importances_dict = {}
    
    for clf_name, clf in classifiers.items():
        feature_importances = train_and_get_feature_importance(clf, X_train, y_train)
        if feature_importances is not None:
            feature_importances_dict[clf_name] = feature_importances
            
            # Sort feature importances in descending order
            indices = np.argsort(feature_importances)[::-1]
    
            # Print feature ranking
            print(f"\nFeature ranking for {clf_name}:")
            for f in range(selected_features.shape[1]):
                print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importances[indices[f]]))
    
    # Return feature importances
    return feature_importances_dict

In [3]:

# Load dataset
dataset1 = pd.read_csv("preprocessedloan_data.csv", index_col=None)
df2 = dataset1
df2 = pd.get_dummies(df2, drop_first=True)
indep_X = df2.drop(' loan_status_ Rejected', axis=1)
dep_Y = df2[' loan_status_ Rejected']


In [4]:
dataset1

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_ Not Graduate,self_employed_ Yes,loan_status_ Rejected
0,1,2,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0,0,0
1,2,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1,1,1
2,3,3,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0,0,1
3,4,3,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0,0,1
4,5,5,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,1000000,2300000,12,317,2800000,500000,3300000,800000,0,1,1
4265,4266,0,3300000,11300000,20,559,4200000,2900000,11000000,1900000,1,1,0
4266,4267,2,6500000,23900000,18,457,1200000,12400000,18100000,7300000,1,0,1
4267,4268,1,4100000,12800000,8,780,8200000,700000,14100000,5800000,1,0,0


In [5]:
from sklearn.feature_selection import SelectKBest

# Assuming indep_X has 4241 rows and dep_Y has 4269 rows
indep_X = indep_X[:len(dep_Y)]  # Slice indep_X to have the same number of rows as dep_Y
selected_features = SelectKBest(k=3).fit_transform(indep_X, dep_Y)

acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []
# Perform SelectKBest feature selection
#selected_features = selectkbest(indep_X, dep_Y, 4)

# Split and scale dataset
X_train, X_test, y_train, y_test = split_scalar(selected_features, dep_Y)

# Calculate and print feature importance for various classifiers
feature_importances_dict = calculate_feature_importance(X_train, y_train, X_test, selected_features)

# Convert feature importances into DataFrame
feature_importances_df = pd.DataFrame(feature_importances_dict).T  # Transpose DataFrame
#feature_importances_df.index = indep_X.columns  # Set index to column names

feature_importances_df


Feature ranking for Random Forest:
1. feature 2 (0.929471)
2. feature 1 (0.060036)
3. feature 0 (0.010493)

Feature ranking for Logistic Regression:
1. feature 2 (3.967347)
2. feature 1 (0.817187)
3. feature 0 (0.024690)

Feature ranking for Decision Tree:
1. feature 2 (0.894300)
2. feature 1 (0.084815)
3. feature 0 (0.020885)

Feature ranking for SVM Linear:
1. feature 2 (2.817291)
2. feature 1 (0.373135)
3. feature 0 (0.035904)


Unnamed: 0,0,1,2
Random Forest,0.010493,0.060036,0.929471
Logistic Regression,0.02469,0.817187,3.967347
Decision Tree,0.020885,0.084815,0.8943
SVM Linear,0.035904,0.373135,2.817291


In [6]:
feature_importances_dict 

{'Random Forest': array([0.01049285, 0.06003624, 0.92947091]),
 'Logistic Regression': array([0.02468974, 0.81718667, 3.96734721]),
 'Decision Tree': array([0.02088476, 0.08481481, 0.89430042]),
 'SVM Linear': array([0.03590396, 0.37313542, 2.81729069])}

In [7]:
selected_features

array([[  2,  12, 778],
       [  0,   8, 417],
       [  3,  20, 506],
       ...,
       [  2,  18, 457],
       [  1,   8, 780],
       [  1,  10, 607]], dtype=int64)