### Data understanding (data source: Tian)
- `AnnualManufactoring.csv` contains no duplicate (`cusip`, `mdate`) pairs.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

def print_null_freq(df):
    """
    Tabulate, per column of ``df``, how many entries are null and how many
    are not.

    The frame is reshaped to long form (one row per cell, with the column
    label in 'variable' and the cell content in 'value'), then the column
    label is cross-tabulated against the null flag.

    Despite the name, nothing is printed: the resulting crosstab is returned.
    """
    long_form = df.melt()
    is_missing = long_form['value'].isnull()
    return pd.crosstab(long_form['variable'], is_missing)

def check_duplicate_row(df,column_names):
    """
    Report which combinations of ``column_names`` occur more than once in ``df``.

    input:
    df: pandas dataframe
    column_names: list of column names that together form the key to check
    output:
    None when every key combination appears exactly once; otherwise a
    dataframe restricted to ``column_names`` holding one row per key
    combination that is duplicated (original index labels of the first
    occurrence are kept).
    """
    # keep=False flags every member of a duplicated group, not just the repeats.
    is_repeated = df.duplicated(subset=column_names, keep=False)
    if not is_repeated.any():
        return None
    # Collapse each duplicated group to a single representative key row.
    return df.loc[is_repeated, column_names].drop_duplicates()

In [2]:
# Load AnnualManufactoring.csv (path is relative to the notebook's working directory).
final_variable = pd.read_csv('data/tian_data/AnnualManufactoring.csv')

In [3]:
# Inspect the column labels of the loaded frame.
final_variable.columns

Index(['cusip', 'mdate', 'sigma', 'exrcamp', 'NIMTA', 'LTMTA', 'CASHMTA',
       'rsize', 'MBE', 'prc2', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8',
       'X9', 'X12', 'X11', 'X10', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18',
       'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28',
       'LTAT', 'FMTA', 'LCTMTA', 'LCTAT', 'FAT', 'Default'],
      dtype='object')

In [4]:
# Preview the first few rows.
final_variable.head()

Unnamed: 0,cusip,mdate,sigma,exrcamp,NIMTA,LTMTA,CASHMTA,rsize,MBE,prc2,...,X25,X26,X27,X28,LTAT,FMTA,LCTMTA,LCTAT,FAT,Default
0,370106,198012,0.706719,0.120725,0.015309,0.194483,0.008908,-10.562553,4.2957,2.70805,...,-0.992452,-0.124699,0.001148,0.405362,0.62069,74.006334,0.135921,0.433791,236.190682,0
1,370106,198112,0.574407,0.108155,0.035766,0.33333,0.035457,-10.806133,2.181356,2.505526,...,-0.992452,-0.124699,0.012837,-0.474171,0.556631,43.471963,0.224138,0.374291,72.594169,0
2,775106,198412,0.776508,-0.025235,-0.29594,0.25481,0.16051,-13.194523,2.414253,-0.09844,...,-0.992452,-0.124699,0.089323,0.372078,0.494805,19.562151,0.23508,0.456494,37.987013,0
3,775106,198512,0.793844,-0.196564,-0.175986,0.190895,0.070654,-13.293126,4.194006,-0.724896,...,-0.992452,-0.124699,0.017173,0.218911,0.605344,32.409986,0.139687,0.44296,102.774923,0
4,775106,198612,0.509576,-0.195245,-0.131457,0.05447,0.069536,-12.588314,4.994316,-0.693147,...,-0.992452,-0.124699,0.653698,-0.013472,0.340933,3.97351,0.050828,0.318135,24.870466,0


In [5]:
# (rows, columns) of the full dataset.
final_variable.shape

(47189, 44)

In [6]:
# Sum of the Default column (presumably a 0/1 flag, so this counts default events — TODO confirm).
final_variable['Default'].sum()

311

In [7]:
# Distinct mdate values present in the data (they look like YYYYMM integers).
final_variable['mdate'].unique()

array([198012, 198112, 198412, 198512, 198612, 198712, 198812, 198912,
       199912, 200012, 198312, 199012, 199112, 199212, 199312, 199412,
       198212, 199512, 199612, 199712, 199812, 200112, 200212, 200312,
       200412, 200512, 200612, 200712, 200812])

In [8]:
#final_variable = final_variable.replace([np.inf,-np.inf],0)
# Build the one-year-ahead modelling table: every feature row has its mdate
# moved forward by 100 and is then matched, on (mdate, cusip), with the
# Default value recorded under that later mdate. Rows without a match are
# discarded by the inner join.
dat_tmp = (
    final_variable
    .drop('Default', axis=1)
    .assign(mdate=lambda frame: frame['mdate'] + 100)
)
Ys = final_variable[['mdate', 'cusip', 'Default']]
one_year = dat_tmp.merge(Ys, how='inner', on=['mdate', 'cusip'])

In [9]:
# (rows, columns) left after the inner join with next-period labels.
one_year.shape

(41003, 44)

In [10]:
# Sum of the Default column in the merged table (presumably the number of default events — TODO confirm).
one_year['Default'].sum()

254

In [11]:
# Identifier columns plus X14 and X28 are removed before modelling.
# X14 and X28 are not in Tian's SAS code; dropping them leaves 39 variables,
# just like in the paper.
drop_list = ['cusip','mdate','X14','X28']

# Split on mdate: rows before 200312 are used for fitting, the rest for testing.
train = one_year[one_year['mdate']<200312]
test = one_year[one_year['mdate']>=200312]
train = train.drop(drop_list,axis = 1)
test = test.drop(drop_list,axis = 1)

# 'Default' is the last column of one_year (it is appended by the merge), so
# the features are every column but the last and the target is the last one.
# .iloc replaces the removed .ix indexer; the selection is purely positional.
x_train = train.iloc[:,0:-1]
y_train = train.iloc[:,-1]
x_test = test.iloc[:,0:-1]
y_test = test.iloc[:,-1]

In [12]:
# Keep the feature column labels so fitted coefficients can be mapped back to names.
feature_name = x_train.columns

In [13]:
# functions to report result
def get_prob_auc(clf,x,y):
    """
    Score ``x`` with a fitted classifier and summarise its ranking quality.

    input:
    clf: fitted classifier exposing ``predict_proba``
    x: feature matrix to score
    y: true binary labels aligned with ``x``
    output:
    (probas_, accuracy_ratio) where ``probas_`` is the second column of
    ``clf.predict_proba(x)`` and ``accuracy_ratio`` is ``2 * (AUC - 0.5)``.
    """
    # Imported here so the function does not depend on a later cell having
    # already brought roc_auc_score into the notebook namespace.
    from sklearn.metrics import roc_auc_score

    probas_ = clf.predict_proba(x)[:,1]
    roc_auc = roc_auc_score(y,probas_)
    accuracy_ratio = (roc_auc-0.5)*2
    return probas_,accuracy_ratio
def tencile_table(test,p):
    """
    Build a ten-group capture table for a set of predicted probabilities.

    Observations are ranked from highest to lowest predicted probability and
    cut into ten consecutive groups of ``round(n / 10)`` rows each; the tenth
    group additionally absorbs whatever rows remain after the ninth. For each
    group the share of all positive outcomes that fall in it is reported.

    input:
    test: true outcomes (numeric, positives summing to the event count)
    p: predicted probabilities, same length and order as ``test``
    output:
    dataframe with columns 'Group' (1..10) and 'Rate' (fraction of the total
    of ``test`` captured by that group). If ``test`` sums to zero the rates
    are NaN.
    """
    # sort_values replaces the removed DataFrame.sort; a fresh frame is
    # returned, so the caller's inputs are never reordered.
    ranked = (
        pd.DataFrame({'y_true':test,'probability':p})
        .sort_values('probability',ascending=False)
        .reset_index(drop=True)
    )
    outcomes = ranked['y_true'].to_numpy()

    point = int(round(float(len(outcomes))/10))

    # Groups 1-9 are fixed-width slices; group 10 runs to the end so that no
    # observation is left out when the length is not a multiple of ten.
    num_of_bkr = [outcomes[i*point:(i+1)*point].sum() for i in range(0,9)]
    num_of_bkr.append(outcomes[9*point:].sum())

    total = outcomes.sum()
    rate = np.array(num_of_bkr).astype(float)/total
    tencile_result = pd.DataFrame({'Group':range(1,11),'Rate':rate})
    return tencile_result

In [14]:
from sklearn.linear_model import LogisticRegression
# L1-penalised logistic regression; a small C means strong regularisation, so
# many coefficients are driven to exactly zero.
# The solver is named explicitly: newer scikit-learn releases default to
# 'lbfgs', which rejects penalty='l1', whereas 'liblinear' supports it.
lg = LogisticRegression(C = 0.01,penalty='l1',solver='liblinear')
lg.fit(x_train,y_train)
# coef_ has one row per fitted decision function; the first row is taken.
model_coef = lg.coef_[0]

In [15]:
# Map each feature label to its fitted coefficient, keeping only the
# features whose coefficient is non-zero.
selected_features = {
    label: weight
    for label, weight in zip(feature_name, model_coef)
    if weight != 0
}
print (selected_features)
print (selected_features.keys())

{'prc2': -0.3102885779653683, 'FMTA': 0.0038469050144866552, 'rsize': 0.28281195968228795, 'FAT': 0.00019295137397865107, 'X11': -0.027388960800913888, 'X24': -0.43930668671275369}
dict_keys(['prc2', 'FMTA', 'rsize', 'FAT', 'X11', 'X24'])


In [16]:
# Both metric helpers are imported once; get_prob_auc looks them up in the
# notebook namespace when it is called.
from sklearn.metrics import roc_auc_score, roc_curve

# in-sample logistic regression result
in_prob_, in_accuracy_ratio = get_prob_auc(lg, x_train, y_train)
print (tencile_table(y_train, in_prob_))
print ('in-sample logistic regression accuracy ratio is %f, auc is %f'
       % (in_accuracy_ratio, in_accuracy_ratio/2+0.5))

# out-sample logistic regression result
out_prob_, out_accuracy_ratio = get_prob_auc(lg, x_test, y_test)
print (tencile_table(y_test, out_prob_))
print ('out-sample logistic regression accuracy ratio is %f, auc is %f'
       % (out_accuracy_ratio, out_accuracy_ratio/2+0.5))

   Group      Rate
0      1  0.377510
1      2  0.200803
2      3  0.112450
3      4  0.112450
4      5  0.072289
5      6  0.056225
6      7  0.032129
7      8  0.020080
8      9  0.008032
9     10  0.008032
in-sample logistic regression accuracy ratio is 0.547341, auc is 0.773671
   Group  Rate
0      1   0.2
1      2   0.0
2      3   0.2
3      4   0.0
4      5   0.0
5      6   0.4
6      7   0.0
7      8   0.2
8      9   0.0
9     10   0.0
out-sample logistic regression accuracy ratio is 0.106446, auc is 0.553223
