# Bank Recovery Amount Prediction

### Data cleaning

In [None]:
# importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset. The path is relative, so bank_data.csv must be in the
# notebook's working directory. The bare `df` shows the frame as the cell output.
df=pd.read_csv("bank_data.csv")
df

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# First rows of the frame (head() defaults to 5)
df.head()

In [None]:
# Last rows of the frame (tail() defaults to 5)
df.tail()

In [None]:
# Column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

In [None]:
# dtype of every column
df.dtypes

In [None]:
# Number of rows per recovery strategy label
df["recovery_strategy"].value_counts()

In [None]:
# Number of distinct ages (nunique() ignores missing values by default)
df["age"].nunique()

In [None]:
# Number of rows per sex label
df["sex"].value_counts()

## Graphical exploratory data analysis

In [None]:
# Kernel density estimate of the expected recovery amount over all rows
expected_amounts = df["expected_recovery_amount"]
sns.kdeplot(expected_amounts, color="teal")
plt.show()

In [None]:
# Kernel density estimate of the actual recovery amount over all rows
actual_amounts = df["actual_recovery_amount"]
sns.kdeplot(actual_amounts, color="teal")
plt.show()

## Statistical test: age vs. expected recovery amount

In [None]:
# Window 1: rows whose expected recovery amount lies in [$900, $1100] (both ends
# inclusive), split at $1000. A value of exactly $1000 lands in the "below" group.

in_window1 = df["expected_recovery_amount"].between(900, 1100)
df_range1 = df[in_window1]
df_below1 = df_range1[df_range1["expected_recovery_amount"].le(1000)]
df_above1 = df_range1[df_range1["expected_recovery_amount"].gt(1000)]

In [None]:
# Mean age on each side of the $1000 threshold inside window 1
mean_age_above1 = df_above1["age"].mean()
mean_age_below1 = df_below1["age"].mean()
print("Average age above $1000 expected recovery", mean_age_above1)
print("Average age below $1000 expected recovery", mean_age_below1)

In [None]:
# Age against expected recovery amount for the rows in window 1
sns.scatterplot(
    data=df_range1,
    x="age",
    y="expected_recovery_amount",
    color="teal",
)
plt.show()

* Within the \$900–\$1100 window, the average age of customers just above and just below the \$1000 expected recovery amount is similar.
* The scatter plot shows no visible jump in age at \$1000.
* This suggests that any difference in the actual recovery amount across the threshold is due to the higher recovery strategy and not to a difference in age.

## Statistical test: sex vs. expected recovery amount

In [None]:
# Rows per sex label above the $1000 threshold (window 1)
df_above1["sex"].value_counts()

In [None]:
# Rows per sex label at or below the $1000 threshold (window 1)
df_below1["sex"].value_counts()

In [None]:
# Share of each sex that falls above / below the $1000 threshold inside window 1.
# The counts are read from the frames instead of being typed in by hand, so the
# percentages stay correct if the dataset or the window changes.
# assumes the "sex" labels are "Male" / "Female" (lower-cased for display) — TODO confirm
sex_totals1 = df_range1["sex"].value_counts()  # most frequent label first
sex_above1 = df_above1["sex"].value_counts()
sex_below1 = df_below1["sex"].value_counts()

for position, (label, total) in enumerate(sex_totals1.items()):
    if position:
        print()  # blank line between groups
    name = str(label).lower()
    # .get(label, 0): a label may be absent on one side of the threshold
    pct_above = sex_above1.get(label, 0) / total * 100
    pct_below = sex_below1.get(label, 0) / total * 100
    print("percentage of", name, "above $1000 is ", "{:.2f}".format(pct_above), "%")
    print("percentage of", name, "below $1000 is ", "{:.2f}".format(pct_below), "%")

In [None]:
# Side-by-side bar charts of the sex counts above and below the threshold
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (df_above1, "sex ratio in df_above"),
    (df_below1, "sex ratio in df_below"),
)
for panel_ax, (frame, panel_title) in zip(axes, panels):
    sex_counts = frame["sex"].value_counts()
    sex_counts.plot(kind="bar", ax=panel_ax, color="teal").set(title=panel_title)
plt.show()

In [None]:
# Age against expected recovery amount in window 1, coloured by sex
sns.scatterplot(
    data=df_range1,
    x="age",
    y="expected_recovery_amount",
    hue="sex",
    palette=("teal", "coral"),
)
plt.show()

* The percentage of customers that are male does not jump across the \$1000 threshold.
* This suggests that any difference in the actual recovery amount across the threshold is due to the higher recovery strategy and not to gender.

## Exploratory graphical analysis: recovery amount

In [None]:
# Expected (y) against actual (x) recovery amount for the rows in window 1.
ax = sns.scatterplot(y="expected_recovery_amount", x="actual_recovery_amount", data=df_range1, color="teal")
ax.set_ylim(900, 1100)
ax.set_xlim(left=250)  # only the left limit is fixed; the right limit stays automatic
# Style the grid on this axes only. The previous plt.rc('grid', ...) call changed
# the global rcParams, a side effect that persists for every later figure in the
# kernel and was applied after this axes had already been created.
ax.grid(True, linestyle=":", color="gray")
plt.show()

## Statistical analysis: recovery amount

In [None]:
from scipy import stats 

* Null hypothesis: below and above the \$1000 threshold, the actual recovery amount behaves the same.
* Alternative hypothesis: the actual recovery amount has a discontinuity above the \$1000 threshold.

In [None]:
def value(pvalue, alpha=0.05):
    """Print the conclusion of the threshold hypothesis test for a p-value.

    Parameters
    ----------
    pvalue : float
        p-value returned by the statistical test.
    alpha : float, optional
        Significance level the p-value is compared against (default 0.05).

    Returns
    -------
    None
        The conclusion is printed, not returned.
    """
    # NaN is the only value that is not equal to itself; without this guard a
    # NaN p-value would fall through to the "not significant" branch.
    if pvalue != pvalue:
        print("P Value is NaN,so the test is inconclusive and no hypothesis can be rejected")
        return

    if pvalue < alpha:
        print("Since P Value is less than {},we reject the null hypothesis. That means The actual recovery amount has a discontinuity above the $1000 threshold".format(alpha))
    else:
        # A non-significant result never "rejects the alternative": it only means
        # the data give no grounds to reject the null hypothesis.
        print("Since P Value is not less than {},we fail to reject the null hypothesis. That means there is no evidence that the actual recovery amount behaves differently below and above the $1000 threshold".format(alpha))

In [None]:
# Window 1: expected recovery amount between $900 and $1100, i.e. df_range1

# Actual recovery amounts on each side of the $1000 threshold
df1_window1 = df_above1.loc[:, "actual_recovery_amount"]
df2_window1 = df_below1.loc[:, "actual_recovery_amount"]

# Kruskal-Wallis H-test comparing the two samples
kruskal_window1 = stats.kruskal(df1_window1, df2_window1)
statistics, pvalue = kruskal_window1

print("Statistics: ", statistics)
print("P Value: ", pvalue)
value(pvalue)

In [None]:
# Window 2: expected recovery amount between $950 and $1050, i.e. df_range2

# Both ends of the window are inclusive; exactly $1000 lands in the "below" group.
in_window2 = df["expected_recovery_amount"].between(950, 1050)
df_range2 = df[in_window2]
df_below2 = df_range2[df_range2["expected_recovery_amount"].le(1000)]
df_above2 = df_range2[df_range2["expected_recovery_amount"].gt(1000)]

# Actual recovery amounts on each side of the $1000 threshold
df1_window2 = df_above2.loc[:, "actual_recovery_amount"]
df2_window2 = df_below2.loc[:, "actual_recovery_amount"]

# Kruskal-Wallis H-test comparing the two samples
kruskal_window2 = stats.kruskal(df1_window2, df2_window2)
statistics, pvalue = kruskal_window2

print("Statistics: ", statistics)
print("P Value: ", pvalue)
value(pvalue)

* In both windows (\$900–\$1100 and \$950–\$1050) the p-value is less than 0.05.
* This means the actual recovery amount has a discontinuity above the \$1000 threshold.

## Regression modeling: no threshold

In [None]:
# pip install statsmodels

In [None]:
import statsmodels.api as sm

In [None]:
# Response: actual recovery amount. Predictor: expected recovery amount plus
# the constant column that add_constant() inserts.
y = df_range1["actual_recovery_amount"]
x = sm.add_constant(df_range1["expected_recovery_amount"])

In [None]:
# Ordinary least squares fit of y on x, followed by the full summary table
model = sm.OLS(endog=y, exog=x)
results = model.fit()
print(results.summary())

## Regression modeling: adding true threshold

In [None]:
# 0/1 indicator for the $1000 threshold: 0 when expected_recovery_amount < 1000, else 1.
# This adds a column to df in place, so frames sliced from df before this cell lack it.
# NOTE(review): the earlier below/above split treats exactly 1000 as "below" (<= 1000),
# but here exactly 1000 gets threshold = 1 — confirm which boundary is intended.
df["threshold"]=np.where(df['expected_recovery_amount']<1000, 0, 1)

In [None]:
# Display the frame to check the newly added "threshold" column
df

In [None]:
# Re-slice the $900-$1100 window from df so that it includes the "threshold" column.
# NOTE(review): the upper bound is exclusive here (< 1100) but inclusive (<= 1100)
# where df_range1 is built — confirm which is intended.
new_df_range1=df[(df["expected_recovery_amount"]>=900) & (df["expected_recovery_amount"]<1100)]

In [None]:
# Response: actual recovery amount. Predictors: expected recovery amount and the
# 0/1 threshold indicator, plus the constant column that add_constant() inserts.
predictors_window1 = ["expected_recovery_amount", "threshold"]
y = new_df_range1["actual_recovery_amount"]
x = sm.add_constant(new_df_range1[predictors_window1])

In [None]:
# Ordinary least squares fit with the threshold indicator, window 1
new_model1 = sm.OLS(endog=y, exog=x)
new_results1 = new_model1.fit()
print(new_results1.summary())

* The regression coefficient for the true threshold was statistically significant, with an estimated impact of around \$277 and a 95 percent confidence interval of \$131 to \$423. This is much larger than the incremental cost of running the higher recovery strategy, which was \$50 per customer.
* At this point, we are reasonably confident that the higher recovery strategy is worth the additional cost of the program for customers just above and just below the threshold.

## Regression modeling: adjusting the window

In [None]:
# Re-slice the narrower $950-$1050 window from df so that it includes the "threshold" column.
# NOTE(review): the upper bound is exclusive here (< 1050) but inclusive (<= 1050)
# where df_range2 is built — confirm which is intended.
new_df_range2=df[(df["expected_recovery_amount"]>=950) & (df["expected_recovery_amount"]<1050)]

In [None]:
# Response: actual recovery amount. Predictors: expected recovery amount and the
# 0/1 threshold indicator, plus the constant column that add_constant() inserts.
predictors_window2 = ["expected_recovery_amount", "threshold"]
y = new_df_range2["actual_recovery_amount"]
x = sm.add_constant(new_df_range2[predictors_window2])

In [None]:
# Ordinary least squares fit with the threshold indicator, window 2
new_model2 = sm.OLS(endog=y, exog=x)
new_results2 = new_model2.fit()
print(new_results2.summary())