# RFE on raw data

In [1]:
import pandas as pd

## Load Data Set

In [None]:
# Read the sample CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs on a machine with this exact directory layout.
df = pd.read_csv(
    "c:\\Users\\kiera\\OneDrive\\Documents\\GitHub\\dsif-git-main-project\\elvtr_main_project\\data\\1-raw\\lending-club-2007-2020Q3\\Loan_status_2007-2020Q3-100ksample.csv"
)

# Normalise the headers step by step: trim surrounding whitespace,
# lower-case, then swap spaces for underscores.
trimmed_headers = df.columns.str.strip()
lowered_headers = trimmed_headers.str.lower()
df.columns = lowered_headers.str.replace(" ", "_")

# Show the cleaned header names
header_names = df.columns.tolist()
print("Cleaned headers:", header_names)

# Last expression of the cell: displays (n_rows, n_columns)
df.shape

In [None]:
# Essential libraries for data manipulation, statistics, and visualization
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import normaltest, shapiro, anderson, kstest, skew

# Encoding and scaling libraries
import category_encoders as ce

# Visualization libraries
import matplotlib.pyplot as plt  # For standard plotting
import seaborn as sns  # For static plots with themes
import plotly.express as px  # For interactive plots
import missingno as msno  # For missing data visualization

# Machine Learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split, cross_val_score  # Data splitting and cross-validation
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor
)
from sklearn.svm import SVC, SVR  # Support Vector Machines for classification and regression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB  # Naive Bayes Classifier
from sklearn.cluster import KMeans  # K-Means clustering
from sklearn.decomposition import PCA  # Dimensionality reduction
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.feature_selection import RFE  # Recursive Feature Elimination

# Additional machine learning models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve, average_precision_score
)

# Utility libraries
from tqdm import tqdm
import joblib

# Pandas display settings: remove the column and row display limits
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Shared figure size for consistent plots (A4 landscape, top half)
FIG_WIDTH, FIG_HEIGHT = 11.69, 4.14  # width, height

# Seaborn theme applied to all subsequent plots
sns.set_theme(style='whitegrid')

In [None]:
def check_infinity(df):
    """Print a report of the columns in ``df`` that contain infinite values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan for ``-inf`` / ``inf`` entries.

    Returns
    -------
    None
        The report is written to standard output: either a single
        "no infinite values" line, or a header followed by the per-column
        counts sorted from highest to lowest.
    """
    # Per-column count of cells that are exactly -inf or +inf
    infinite_list = df.isin([-np.inf, np.inf]).sum()

    if infinite_list.sum() == 0:
        print("No column has infinite values")
    else:
        print("Columns with infinite values:")
        # Sort BEFORE printing: print() returns None, so chaining
        # .sort_values() onto the print call raised AttributeError.
        print(infinite_list[infinite_list > 0].sort_values(ascending=False))

# Report which columns of the loaded DataFrame contain -inf / +inf values
check_infinity(df)

In [None]:
# Count of NaN cells per column
nan_list = df.isna().sum()

# Keep only the columns that actually have gaps
columns_with_nan = nan_list[nan_list > 0]

if columns_with_nan.empty:
    print("No column has NaN values")
else:
    print("Columns with NaN values (sorted high to low):")
    ranked_nan_counts = columns_with_nan.sort_values(ascending=False)
    print(ranked_nan_counts)

In [None]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate the predictors from the target. `drop` returns a new frame, so
# the imputation below never mutates `df` and the cell gives the same
# result every time it is re-run.
features = df.drop(columns=['loan_status'])

# Identify categorical predictor columns
categorical_columns = list(features.select_dtypes(include=['object', 'category']).columns)

# Record missingness BEFORE any imputation. Computing the indicators after
# the fills (as this cell used to) always produced all-zero columns, because
# every NaN had already been replaced. Only columns that actually contain
# missing values get an indicator.
missing_mask = features.isna()
columns_with_missing = missing_mask.columns[missing_mask.any()]
missing_indicators = missing_mask[columns_with_missing].astype(int).add_suffix("_missing")

# Fill NaN in categorical columns with "Other"
features[categorical_columns] = features[categorical_columns].fillna("Other")

# Initialize and apply BinaryEncoder to categorical columns
# NOTE(review): the encoder is fitted on the full data set, i.e. before the
# train/test split below.
binary_encoder = ce.BinaryEncoder(cols=categorical_columns, drop_invariant=True)
X_encoded = binary_encoder.fit_transform(features)

# Fill NaN in numerical columns with 0
numerical_columns = X_encoded.select_dtypes(include=['number']).columns
X_encoded[numerical_columns] = X_encoded[numerical_columns].fillna(0)

# Concatenate the encoded data with the missing-value indicators
# (both frames share the index of `features`, so rows stay aligned)
X_encoded = pd.concat([X_encoded, missing_indicators], axis=1)

# Define X and y
X = X_encoded
y = df['loan_status']  # Target column

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the target variable: fit on the training labels only, then apply
# the same mapping to the test labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Restrict the training matrix to its numeric-dtype columns
X_train_numerical = X_train.select_dtypes(include=['number'])
numerical_columns = X_train_numerical.columns

# Base estimator for the elimination (random_state=42 for reproducibility)
rf = RandomForestClassifier(random_state=42, n_estimators=150)

# Recursive Feature Elimination: keep 48 features, dropping 18 per step.
# `fit` returns the fitted selector itself.
rfe = RFE(
    estimator=rf,
    n_features_to_select=48,
    step=18,
    verbose=3,
).fit(X_train_numerical, y_train)

# Column labels of the features RFE kept (boolean mask over the columns)
selected_features = numerical_columns[rfe.support_]

We've opted for a random forest classifier as the RFE estimator, but will evaluate `SelectKBest` (`from sklearn.feature_selection import SelectKBest, f_classif`) at a later date.

In [None]:
# Turn the selected-feature Index into a plain Python list of names.
# Author's note, kept for reference: the alternative
#   selected_features = X_train.columns[rfe.support_]
# was deliberately left in as a reminder.
selected_features_names = [*selected_features]

# Report the selection both as the pandas Index and as the list of names
print(
    "Selected Features by RFE:",
    f"Index: {selected_features}",
    f"Column names: {selected_features_names}",
    sep="\n",
)