In [None]:
#install required libraries
!pip install pandas
pip install statsmodels
!pip install numpy
pip install matplotlib

# import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Read the personal-loan data set from the current working directory.
loan = pd.read_csv("Bank_Personal_Loan_Modelling.csv")

# Add 0/1 indicator columns for Family values 2-4 and Education values 2-3.
# Rows holding any other value get 0 in every indicator of that group
# (presumably level 1 is the reference category -- confirm against the data).
for level in (2, 3, 4):
    loan[f'FamSize{level}'] = np.where(loan['Family'] == level, 1, 0)
for level in (2, 3):
    loan[f'Educ{level}'] = np.where(loan['Education'] == level, 1, 0)

# Model inputs: the raw columns plus the Family/Education indicator columns.
predictors = [
    'Age', 'Experience', 'Income',
    'FamSize2', 'FamSize3', 'FamSize4',
    'CCAvg', 'Mortgage',
    'Securities.Account', 'CD.Account', 'Online', 'CreditCard',
    'Educ2', 'Educ3',
]
# Outcome column, and the design matrix (without intercept) for the full data.
response = loan['Personal.Loan']
X = loan[predictors]

# Split the data into training (60% of rows) and test (remaining rows) sets.
n = len(loan)
n1 = int(n * 0.6)   # number of training rows
n2 = n - n1         # number of test rows

# Draw the training rows from a seeded generator so the split, and every
# error rate computed from it, is reproducible from run to run.
rng = np.random.default_rng(12345)
train_idx = rng.choice(n, n1, replace=False)
train = loan.iloc[train_idx]

# train_idx holds row *positions*, so select the test rows by position too.
# Dropping by label (loan.drop(train_idx)) is only equivalent when the index
# happens to be the default 0..n-1 range. setdiff1d returns the complement
# in ascending order, so test keeps the original row order.
test_idx = np.setdiff1d(np.arange(n), train_idx)
test = loan.iloc[test_idx]

# Logistic regression of the response on all predictors (plus an intercept),
# estimated on every row of the data set.
full_design = sm.add_constant(X)
m1 = sm.Logit(response, full_design).fit()
print(m1.summary())

# The same specification, estimated on the training rows only.
xtrain = train[predictors]
ytrain = train['Personal.Loan']
train_design = sm.add_constant(xtrain)
m2 = sm.Logit(ytrain, train_design).fit()
print(m2.summary())

# Predict probabilities for the test data.
# has_constant='add' forces the intercept column to be appended even if some
# test column happens to be constant; the default ('skip') would silently omit
# it and leave the design matrix one column short of what m2 expects.
xnew = test[predictors]
ptest = m2.predict(sm.add_constant(xnew, has_constant='add'))

# Observed outcomes for the test rows, used by both thresholds below.
actual = test['Personal.Loan']

# Calculate error for threshold 0.5.
# The confusion table is kept for inspection, but the error rate is counted
# directly from mismatches: indexing fixed cells of the table raises
# IndexError whenever all predictions fall in a single class (the crosstab
# then has only one column).
gg1 = (ptest >= 0.5).astype(int)
ttt = pd.crosstab(actual, gg1)
error = (gg1 != actual).sum() / len(actual)
print("Error (threshold 0.5):", error)

# Calculate error for threshold 0.3 (same procedure, lower cut-off).
gg2 = (ptest >= 0.3).astype(int)
ttt = pd.crosstab(actual, gg2)
error = (gg2 != actual).sum() / len(actual)
print("Error (threshold 0.3):", error)

# L1-regularised logistic regression on the training data.
# NOTE: despite the `_aic` suffix and the "AIC" wording in the printed label,
# this is an L1-penalised fit (method='l1', alpha=0.01), not stepwise/AIC
# variable selection. The names and label are kept unchanged so existing
# references and output stay the same.
m2_aic = sm.Logit(ytrain, sm.add_constant(xtrain)).fit_regularized(method='l1', alpha=0.01)
print(m2_aic.summary())

# Predict probabilities for the test data using the regularised model.
# has_constant='add' guarantees the intercept column is present even if a
# test column happens to be constant (the default would silently skip it).
ptest1 = m2_aic.predict(sm.add_constant(xnew, has_constant='add'))

# Classify at threshold 0.5 and tabulate against the observed outcomes.
actual1 = test['Personal.Loan']
gg1_1 = (ptest1 >= 0.5).astype(int)
ttt1 = pd.crosstab(actual1, gg1_1)

# Misclassification rate, counted directly from mismatches so it does not
# depend on the confusion table having both predicted classes as columns
# (fixed-cell indexing raises IndexError when one class is never predicted).
error1 = (gg1_1 != actual1).sum() / len(actual1)
print("Error (AIC threshold 0.5):", error1)

# Calculate specificity and sensitivity.
# Specificity: share of actual 0s predicted as 0.
# Sensitivity: share of actual 1s predicted as 1.
# Computed from boolean masks for the same reason as error1 above.
is_negative = actual1 == 0
is_positive = actual1 == 1
specificity = ((gg1_1 == 0) & is_negative).sum() / is_negative.sum()
sensitivity = ((gg1_1 == 1) & is_positive).sum() / is_positive.sum()
print("Specificity:", specificity)
print("Sensitivity:", sensitivity)

# Lift chart: cumulative number of successes when the test cases are ordered
# by predicted probability (highest first), plotted against the cumulative
# count expected if every case had the average success rate.
case_counts = np.arange(1, n2 + 1)
sorted_idx = np.argsort(ptest1)[::-1]
sorted_y = test['Personal.Loan'].values[sorted_idx]
cumulative_success_sorted = sorted_y.cumsum()
cumulative_success_average = case_counts * sorted_y.mean()

for curve, label in (
    (cumulative_success_sorted, 'Cumulative Success (Sorted by Pred Val)'),
    (cumulative_success_average, 'Cumulative Success (Avg Success Prob)'),
):
    plt.plot(case_counts, curve, label=label)

plt.title('Lift: Cumulative Successes Sorted by Pred Val / Success Prob')
plt.xlabel('Number of Cases')
plt.ylabel('Number of Successes')
plt.legend()
plt.show()

