In [23]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import warnings

warnings.filterwarnings('ignore')

In [24]:
data = pd.read_csv('finaltransformed.csv')


In [25]:
le = LabelEncoder()
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# Split the dataset into features and target
X = data.drop('Credit_Score', axis=1)
y = data['Credit_Score']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create DataFrames for train and test sets
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

def get_odds_ratio(model, alpha=0.05):
    params = model.params
    conf = model.conf_int(alpha)
    conf_int = pd.DataFrame(np.exp(conf), columns=['2.5%', '97.5%'], index=params.index)
    conf_int['OR'] = np.exp(params)
    conf_int['p_value'] = model.pvalues
    return conf_int

In [27]:
# Fit a logistic regression model with all available predictors
X_train = df_train.drop('Credit_Score', axis=1)
y_train = df_train['Credit_Score']
fit_logit = sm.Logit(y_train, X_train).fit()

# Define cutoff values
cut_offs = [0.6, 0.7, 0.8]

# Loop over different variable selection methods
methods = ["forward", "backward", "both"]
result_log = pd.DataFrame()

for method in methods:
    if method == "forward":
        selector = SequentialFeatureSelector(LogisticRegression(max_iter=1000), direction='forward', n_features_to_select='auto')
    else:
        selector = SequentialFeatureSelector(LogisticRegression(max_iter=1000), direction='backward', n_features_to_select='auto')

    selector.fit(X_train, y_train)
    selected_cols = np.where(selector.get_support())[0]
    X_train_selected = X_train.iloc[:, selected_cols]
    X_test_selected = df_test.drop('Credit_Score', axis=1).iloc[:, selected_cols]

    for cutoff in cut_offs:
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train_selected, y_train)
        pred_probs = model.predict_proba(X_test_selected)[:, 1]
        pred = np.where(pred_probs < cutoff, 0, 1)

        tn, fp, fn, tp = confusion_matrix(df_test['Credit_Score'], pred).ravel()
        accuracy = accuracy_score(df_test['Credit_Score'], pred)
        precision = precision_score(df_test['Credit_Score'], pred)
        recall = recall_score(df_test['Credit_Score'], pred)
        f1 = f1_score(df_test['Credit_Score'], pred)
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)

        performance_result = [accuracy, precision, recall, f1, specificity, sensitivity]
        result_row = pd.DataFrame([performance_result], columns=['Accuracy', 'Precision', 'Recall', 'F1', 'Specificity', 'Sensitivity'])
        result_log = pd.concat([result_log, result_row], ignore_index=True)

        print(f"Results for {method} at Cutoff: {cutoff}")
        print(result_row)
        print(confusion_matrix(df_test['Credit_Score'], pred))
        print(f"Selected Formula: {' + '.join(X_train_selected.columns)}")
        print()
        print(get_odds_ratio(sm.Logit(y_train, X_train_selected).fit()))
        print()

print(result_log)

Optimization terminated successfully.
         Current function value: 0.546420
         Iterations 6
Results for forward at Cutoff: 0.6
   Accuracy  Precision    Recall        F1  Specificity  Sensitivity
0  0.752135   0.824072  0.648359  0.725731     0.858343     0.648359
[[5314  877]
 [2228 4108]]
Selected Formula: Month + Age + Occupation + Num_Bank_Accounts + Num_Credit_Card + Interest_Rate + Outstanding_Debt + Credit_History_Age + Amount_invested_monthly + Monthly_Balance

Optimization terminated successfully.
         Current function value: 0.571620
         Iterations 5
                         2.5%  97.5%        OR        p_value
Month                     NaN    NaN  0.995126   3.899108e-01
Age                       NaN    NaN  0.993678   9.147486e-08
Occupation                NaN    NaN  0.997052   3.266574e-01
Num_Bank_Accounts         NaN    NaN  1.086056   2.117908e-07
Num_Credit_Card           NaN    NaN  0.788204   8.598501e-37
Interest_Rate             NaN    NaN  1.06

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Initialize and train the Classfication Tree model

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict probabilities
probas = decision_tree.predict_proba(X_test)[:, 1]

# Define function for calculating metrics
def calculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = accuracy_score(y_true, y_pred)
    error_rate = 1 - accuracy
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    return accuracy, error_rate, specificity, sensitivity

# Evaluate model at different cutoffs
cutoffs = [0.6, 0.7, 0.8]
results = []
for cutoff in cutoffs:
    y_pred = (probas > cutoff).astype(int)
    results.append((cutoff,) + calculate_metrics(y_test, y_pred))

# Print results
results_df = pd.DataFrame(results, columns=['Cutoff', 'Accuracy', 'Error Rate', 'Specificity', 'Sensitivity'])
print("Decision Tree Metrics:")
print(results_df)

Decision Tree Metrics:
   Cutoff  Accuracy  Error Rate  Specificity  Sensitivity
0     0.6  0.792768    0.207232     0.782749     0.802557
1     0.7  0.792768    0.207232     0.782749     0.802557
2     0.8  0.792768    0.207232     0.782749     0.802557


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Define a function to calculate performance metrics
def calculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = accuracy_score(y_true, y_pred)
    error_rate = 1 - accuracy
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    return accuracy, error_rate, specificity, sensitivity

# Initialize lists to store results
results = []

# Range of k values and cutoffs
k_values = range(2, 100)
cutoffs = [0.6, 0.7, 0.8]


# Evaluate each k-value
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    probas = knn.predict_proba(X_test)[:, 1]

    # Calculate metrics for each cutoff
    for cutoff in cutoffs:
        y_pred_cutoff = (probas > cutoff).astype(int)
        accuracy, error_rate, specificity, sensitivity = calculate_metrics(y_test, y_pred_cutoff)
        results.append((k, cutoff, accuracy, error_rate, specificity, sensitivity))


# Convert results to a DataFrame for easier analysis
import pandas as pd
results_df = pd.DataFrame(results, columns=['k', 'Cutoff', 'Accuracy', 'Error Rate', 'Specificity', 'Sensitivity'])
print(results_df)



      k  Cutoff  Accuracy  Error Rate  Specificity  Sensitivity
0     1     0.6  0.820388    0.179612     0.800517     0.839804
1     1     0.7  0.820388    0.179612     0.800517     0.839804
2     1     0.8  0.820388    0.179612     0.800517     0.839804
3     2     0.6  0.782630    0.217370     0.872072     0.695234
4     2     0.7  0.782630    0.217370     0.872072     0.695234
..   ..     ...       ...         ...          ...          ...
292  98     0.7  0.710226    0.289774     0.871749     0.552399
293  98     0.8  0.637423    0.362577     0.933613     0.348011
294  99     0.6  0.741518    0.258482     0.823615     0.661301
295  99     0.7  0.707751    0.292249     0.872718     0.546559
296  99     0.8  0.636066    0.363934     0.935552     0.343434

[297 rows x 6 columns]


In [None]:
results_df[results_df['Accuracy'] >= 0.75]

Unnamed: 0,k,Cutoff,Accuracy,Error Rate,Specificity,Sensitivity
0,1,0.6,0.820388,0.179612,0.800517,0.839804
1,1,0.7,0.820388,0.179612,0.800517,0.839804
2,1,0.8,0.820388,0.179612,0.800517,0.839804
3,2,0.6,0.78263,0.21737,0.872072,0.695234
4,2,0.7,0.78263,0.21737,0.872072,0.695234
5,2,0.8,0.78263,0.21737,0.872072,0.695234
6,3,0.6,0.804263,0.195737,0.75949,0.848011
9,4,0.6,0.788138,0.211862,0.829914,0.747317
10,4,0.7,0.788138,0.211862,0.829914,0.747317
12,5,0.6,0.764349,0.235651,0.874011,0.657197
