# **Regression discontinuity: banking recovery** <br>
Loading the Dataset

In [None]:
import pandas as pd
import numpy as py

In [None]:
# Load the bank recovery dataset and report its (rows, columns) shape as a
# quick sanity check that the file was read.
# NOTE(review): this is an absolute path on one user's machine; the notebook
# will not run elsewhere until it points at a shared/relative data location.
csv_path = 'C:/Users/Banumathi/Downloads/bank_data.csv'
df = pd.read_csv(csv_path)
print(df.shape)

# **Age vs. Expected Recovery Amount**

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.scatter(x=df['expected_recovery_amount'], y=df['age'], c="g", s=2)
plt.xlim(0, 2000)
plt.ylim(0, 60)
plt.xlabel("Expected Recovery Amount")
plt.ylabel("Age")
plt.legend(['Age'],loc=2)
plt.show()

In [None]:
# Preview the first rows of the loaded frame to eyeball column names and values.
df.head()

In [None]:
from scipy import stats

# Keep only the accounts whose expected recovery amount lies in [900, 1100).
Thres_900_1100 = df.loc[(df['expected_recovery_amount'] < 1100) &
                        (df['expected_recovery_amount'] >= 900)]
by_recovery_strategy = Thres_900_1100.groupby(['recovery_strategy'])

# Per-strategy summary statistics of age. Printed explicitly: a bare
# expression in the middle of a cell is evaluated but never displayed.
print(by_recovery_strategy['age'].describe().unstack())

# Build each mask from the windowed frame itself rather than from the full
# `df`, so the selection does not depend on implicit index alignment between
# two frames of different length.
is_level_0 = Thres_900_1100['recovery_strategy'] == 'Level 0 Recovery'
is_level_1 = Thres_900_1100['recovery_strategy'] == 'Level 1 Recovery'
Level_0_age = Thres_900_1100.loc[is_level_0, 'age']
Level_1_age = Thres_900_1100.loc[is_level_1, 'age']

# Kruskal-Wallis H-test comparing the age distributions of the two groups;
# left as the last expression so the notebook displays the result.
stats.kruskal(Level_0_age, Level_1_age)

# **Sex vs. Expected Recovery Amount**

In [None]:
# Contingency table of recovery strategy (rows) by sex (columns) for accounts
# whose expected recovery amount lies in [900, 1100).
amount = df['expected_recovery_amount']
in_window = (amount >= 900) & (amount < 1100)
Customer_M_F = pd.crosstab(df.loc[in_window, 'recovery_strategy'],
                           df.loc[in_window, 'sex'])
print(Customer_M_F)

# Chi-square test of independence on that table; only the p-value is reported.
chi2_stat, p_val, dof, ex = stats.chi2_contingency(Customer_M_F)
print(p_val)

# **Actual Recovery Amount vs. Expected Recovery Amount**

In [None]:
# Scatter of actual recovery amount (y) against expected recovery amount (x),
# zoomed to expected amounts between 900 and 1100.
fig, ax = plt.subplots()
ax.scatter(df['expected_recovery_amount'],
           df['actual_recovery_amount'],
           c="g", s=2)

ax.set_xlim(900, 1100)
ax.set_ylim(0, 2000)

ax.set_xlabel("Expected Recovery Amount")
ax.set_ylabel("Actual Recovery Amount")
ax.legend(['actual recovery Amount'], loc=2)  # loc=2 is the upper-left corner
plt.show()

# **Statistical analysis: recovery amount**

In [None]:
# Per-strategy summary statistics of the actual recovery amount inside the
# [900, 1100) window. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print(by_recovery_strategy['actual_recovery_amount'].describe().unstack())

# Use Thres_900_1100, the [900, 1100) window built earlier in the notebook.
# `era_900_1100` is only created in a later cell, so referencing it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
# The masks are taken from the windowed frame itself so no cross-frame index
# alignment is involved.
is_level_0 = Thres_900_1100['recovery_strategy'] == 'Level 0 Recovery'
is_level_1 = Thres_900_1100['recovery_strategy'] == 'Level 1 Recovery'
Level_0_actual = Thres_900_1100.loc[is_level_0, 'actual_recovery_amount']
Level_1_actual = Thres_900_1100.loc[is_level_1, 'actual_recovery_amount']

# Kruskal-Wallis H-test comparing actual recovery amounts of the two groups.
stats.kruskal(Level_0_actual, Level_1_actual)

For a smaller range of \$950 to \$1050:

In [None]:
# Narrower window: accounts whose expected recovery amount lies in [950, 1050).
Thres_950_1050 = df.loc[(df['expected_recovery_amount'] < 1050) &
                        (df['expected_recovery_amount'] >= 950)]

# Split by strategy using the windowed frame's own column, so the selection
# does not rely on aligning a full-length mask from `df` onto the subset.
is_level_0 = Thres_950_1050['recovery_strategy'] == 'Level 0 Recovery'
is_level_1 = Thres_950_1050['recovery_strategy'] == 'Level 1 Recovery'
Level_0_actual = Thres_950_1050.loc[is_level_0, 'actual_recovery_amount']
Level_1_actual = Thres_950_1050.loc[is_level_1, 'actual_recovery_amount']

# Kruskal-Wallis H-test on actual recovery amounts within the narrower window.
stats.kruskal(Level_0_actual, Level_1_actual)

**Regression modeling: no threshold**

In [None]:
import statsmodels.api as sm

# Baseline OLS on the [900, 1100) window: actual recovery amount regressed on
# expected recovery amount plus an intercept, with no threshold term.
y = Thres_900_1100['actual_recovery_amount']
X = sm.add_constant(Thres_900_1100['expected_recovery_amount'])

model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # in-sample fitted values

# Last expression, so the notebook renders the regression summary table.
model.summary()

**adding true threshold**

In [None]:
# Threshold indicator: 0 where the expected recovery amount is below 1000,
# 1 otherwise. Stored on `df` so later windows can select it as a regressor.
threshold_flag = py.where(df['expected_recovery_amount'] < 1000, 0, 1)
df['indicator_1000'] = threshold_flag

# Re-slice the [900, 1100) window so it carries the new indicator column.
amount = df['expected_recovery_amount']
era_900_1100 = df.loc[(amount < 1100) & (amount >= 900)]

# OLS of actual recovery amount on expected amount, the threshold indicator
# and an intercept.
y = era_900_1100['actual_recovery_amount']
X = sm.add_constant(era_900_1100[['expected_recovery_amount','indicator_1000']])
model = sm.OLS(y, X).fit()
print(model.summary())

**adjusting the window**

In [None]:
# Narrower window for the robustness check: expected recovery amount in
# [950, 1050). The lower bound is 950 to match the window name; with 900 the
# model would be fitted on the asymmetric range [900, 1050) instead.
era_950_1050 = df.loc[(df['expected_recovery_amount'] < 1050) &
                      (df['expected_recovery_amount'] >= 950)]

# Same specification as the wider window: expected amount, threshold
# indicator and an intercept.
X = era_950_1050[['expected_recovery_amount','indicator_1000']]
y = era_950_1050['actual_recovery_amount']
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()

# Last expression, so the notebook renders the regression summary table.
model.summary()