In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
# Restore `stats.chisqprob` as a thin wrapper around the chi-square survival
# function. The original line assigned to the misspelled name `chisqrob`, which
# no caller looks up, so the patch had no effect.
# NOTE(review): presumably needed by the statsmodels summary output on SciPy
# versions that no longer ship `chisqprob` — confirm against the installed versions.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression

In [None]:
raw_data = pd.read_csv('500_B.csv')     # load the source CSV (path is relative to the working directory)
DNB = raw_data.copy()                   # work on a copy so raw_data keeps the data exactly as loaded
print(list(DNB.columns))                # show the original column names

In [None]:
pd.set_option('display.max_columns', 999)      # let pandas display up to 999 columns instead of truncating wide frames
DNB.head()                                     # dataset overview: first rows of the frame

In [None]:
def format_col(col):
    """Normalise a column header.

    Surrounding whitespace is removed, every remaining space becomes an
    underscore, and the result is lower-cased.
    """
    return col.strip().replace(" ", "_").lower()

# Apply format_col to every header and install the cleaned names on DNB.
new_columns = [format_col(header) for header in DNB.columns]
DNB.columns = new_columns

In [None]:
# Check the column names after the clean-up above.
print(DNB.columns)

In [None]:
# Convert every column from '*1-15' through 'uccfilng' (label slice, both ends
# inclusive) to float.
# Two fixes compared with the original statement:
#   * the 'uccfilng' string literal was split across two lines, which does not parse;
#   * the result is assigned through DNB[...] rather than DNB.loc[:, ...], because a
#     full-column .loc assignment may write the values into the existing columns and
#     keep their old dtype on recent pandas versions.
float_cols = DNB.loc[:, '*1-15':'uccfilng'].columns
DNB[float_cols] = DNB[float_cols].astype(float)

In [None]:
DNB.head(5)   # dataset overview: check the frame after the float conversion

In [None]:
def find_outliers(dataslice, rows=20):
    """Blank out IQR outliers in the given columns of the global ``DNB`` frame.

    For every column of ``dataslice`` the quartiles Q1 and Q3 are computed and
    each value that is ``<= Q1 - 1.5*IQR`` or ``>= Q3 + 1.5*IQR`` is replaced by
    a missing value. The cleaned column is then stored in ``DNB`` under the same
    name.

    Parameters
    ----------
    dataslice : DataFrame holding the columns to clean, e.g.
        ``DNB.loc[:, 'bd_ind':'uccfilng']``. It is only read; unlike the
        original implementation the slice itself is not written to.
    rows : number of leading rows of ``DNB`` to return for inspection.

    Returns
    -------
    ``DNB.head(rows)`` after the replacement.

    Notes
    -----
    * The fences are inclusive. When a column has ``IQR == 0`` both fences equal
      Q1, so every non-missing value of that column is treated as an outlier.
      NOTE(review): a strict comparison would avoid this, but it would also
      change which rows survive the later ``dropna`` — confirm before changing.
    * A column that already contains NaN yields NaN fences, so nothing in it is
      flagged.
    * Also sets the pandas option ``display.max_columns`` to 999.
    """

    def _outlier_mask(values):
        # Boolean mask of the values lying on or beyond the 1.5*IQR fences.
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        low_fence = q1 - 1.5 * iqr
        high_fence = q3 + 1.5 * iqr
        return (values <= low_fence) | (values >= high_fence)

    for name in list(dataslice):
        column = dataslice[name]
        # Series.mask puts NaN wherever the mask is True, in a single vectorised
        # pass (the original rebuilt the column with a per-value list lookup).
        DNB[name] = column.mask(_outlier_mask(column))

    pd.set_option('display.max_columns', 999)

    return DNB.head(rows)
    

In [None]:
# Blank out the outliers in every column from 'bd_ind' through 'uccfilng' and show the first rows of DNB.
find_outliers(DNB.loc[:,'bd_ind':'uccfilng'])

In [None]:
# Frame dimensions before dropping anything: printed as "rows columns".
count_row, count_col = DNB.shape
print(count_row, count_col)

In [None]:
# Remove four indicator columns entirely, then discard every row that still
# contains a missing value (this includes the values blanked out as outliers).
DNB.drop(columns=["bd_ind", "bnkrpt", "ob_ind", "stmt_ind"], inplace=True)
DNB.dropna(inplace=True)

In [None]:
# Frame dimensions after the column drop and dropna: printed as "rows columns".
count_row, count_col = DNB.shape
print(count_row, count_col)

In [None]:
# Export the cleaned frame to a CSV file (the index is written as the first column).
# NOTE(review): the target is an absolute, machine-specific Windows path, so this
# cell fails on any machine where that folder does not exist — consider a path
# relative to the notebook.
DNB.to_csv(r'C:\Users\Tom\Desktop\UF Life\ISOM\JB Hunt\JBHunt - Fall 2019\JBHunt - Fall 2019\WORK\CSV\export_dataframe_1.csv')

In [None]:
# Response variable.
y = DNB['w_pastdue_2']

# Candidate regressors, one group per line for readability.
cols = [
    'cpct', 'crating_composite', 'd_neg', 'fpct', 'fscore', 'fspoints',
    'hicdtmax', 'liens', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm',
    'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng',
]

# Design matrix: the selected columns plus the intercept column added by statsmodels.
x = sm.add_constant(DNB[cols])

In [None]:
# Pairwise correlation matrix of the regressors (quick multicollinearity check).
x.corr()

In [None]:
model = sm.OLS(y, x).fit()              # fit an ordinary least squares regression of y on all columns of x
#predictions = model.predict(x) 
print_model = model.summary2()          # build the summary report of the fit
print(print_model)                      

In [None]:
print(model.pvalues)       # other result attributes: params, bse (Std.Err.), tvalues, pvalues, predict(), rsquared_adj, rsquared
                           
    # author's reading of the output — significant (p < 0.05): cpct, fpct, fspoints, liens

In [None]:
# Manually reorder the columns so that the significant variables come first;
# the printed list shows the x-variable candidates in their new order.
x = x[['const', 'cpct', 'fpct', 'fspoints', 'liens', 'crating_composite', 'd_neg', 'fscore','hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
print(list(x))

In [None]:
def test_variables(y,x,show=False):                            # test x variable one at a time, using the adj.rsquared     
    col = x          
    clmn = list(col)
    xlist = [clmn[0],clmn[1]]
    for a in clmn[2:3]:
        model = sm.OLS(y, col[xlist]).fit()
        adj_r2 = model.rsquared_adj
        xlist.append(a)
        new_model = sm.OLS(y, col[xlist]).fit()
        new_adj_r2 = new_model.rsquared_adj
        print("old_adj_r2: ", adj_r2)
        print("new_adj_r2: ", new_adj_r2)
        print("pvalues: ")
        print(new_model.pvalues)
        print('\n')
        if new_adj_r2 >= adj_r2:
            print("KEEP variable: ", clmn[2])
        else: 
            print("DROP variable: ", clmn[2])  
        if show:
            print('\n')
            print(model.summary()) 
            print(new_model.summary()) 

In [None]:
# Run the adjusted R-squared test on the reordered regressors.
test_variables(y,x)

# Fit a logistic regression of y on the same regressors and print its summary.
logit_model=sm.Logit(y,x)
result=logit_model.fit()
print(result.summary())             #none

# Exponentiate the fitted coefficients (odds ratios for a logit model).
np.exp(result.params)

In [None]:
# Manually drop 'fpct', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'fspoints', 'liens', 'crating_composite', 'd_neg', 'fscore','hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'fspoints', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'crating_composite', 'd_neg', 'fscore','hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
def test_variables(y,x,show=False):                            # after keeping 'liens', modify the function    
    col = x          
    clmn = list(col)
    xlist = [clmn[0],clmn[1],clmn[2]]
    for a in clmn[3:4]:
        model = sm.OLS(y, col[xlist]).fit()
        adj_r2 = model.rsquared_adj
        xlist.append(a)
        new_model = sm.OLS(y, col[xlist]).fit()
        new_adj_r2 = new_model.rsquared_adj
        print("old_adj_r2: ", adj_r2)
        print("new_adj_r2: ", new_adj_r2)
        print("pvalues: ")
        print(new_model.pvalues)
        print('\n')
        if new_adj_r2 >= adj_r2:
            print("KEEP variable: ", clmn[3])
        else: 
            print("DROP variable: ", clmn[3])  
        if show:
            print('\n')
            print(model.summary()) 
            print(new_model.summary()) 

In [None]:
# Rerun the test with the redefined function on the current column set.
test_variables(y,x) 

In [None]:
# Manually drop 'crating_composite', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'd_neg', 'fscore','hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'd_neg', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'fscore','hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'fscore', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'hicdtmax', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# the pvalue for hicdtmax is above 0.05, drop variable

In [None]:
# Manually drop 'hicdtmax', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'npayexpp', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'npayexpp', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'orating_composite', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'orating_composite', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'payexp_n', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'payexp_n', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'paynorm', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'paynorm', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'avg_pydx_12', 'avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# the pvalue for avg_pydx_12 is above 0.05, drop variable

In [None]:
# Manually drop 'avg_pydx_12', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens','avg_pydx_24', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# the pvalue for avg_pydx_24 is above 0.05, drop variable

In [None]:
# Manually drop 'avg_pydx_24', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'sub_ind', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'sub_ind', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'suits', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'suits', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'totdoll', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'totdoll', then rerun the adjusted R-squared test on the reduced column set.
x = x[['const', 'cpct', 'liens', 'uccfilng']]
test_variables(y,x)

In [None]:
# Manually drop 'uccfilng'; the remaining regressors are the intercept, 'cpct' and 'liens'.
x = x[['const', 'cpct', 'liens']]
# Fit the final OLS model on the reduced regressor set and print its summary.
model = sm.OLS(y, x).fit()              
print_model = model.summary()
print(print_model)     