In [222]:
import numpy as np
import pandas as pd
#pip install seaborn 
import seaborn as sns
%matplotlib inline
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn import cross_validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [223]:
df = pd.read_csv("Data/LoanStats3a_securev1.csv",skiprows=1)
#if more than 50% values in an observation is NAN drop that observation
half_count = len(df.columns) / 2
df=df.dropna(axis='columns', how='all')
df = df.dropna(thresh=half_count)


  interactivity=interactivity, compiler=compiler, result=result)


In [224]:
df.ix[:5,:7]

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%


In [225]:
#Note: Refer to the report for detail description of the techniques
print("Feature engineering using Technique 1 started")
#work on every feature slice-by-slice which one's are informative. We would drop some useless attributes and clean-up/modify others.
# .ix[row slice, column slice]
#df.ix[:4,:7]

# We won't need id or member_id as it has no real predictive power so we can drop them from this table
df=df.drop(['id','member_id'],1)

# drop the record if loan_amnt, funded_amnt is missing
df.loan_amnt=df.loan_amnt.dropna()
df.funded_amnt=df.funded_amnt.dropna()

# if the funded_amnt_inv is missing replace it with 
df.funded_amnt_inv=df.funded_amnt_inv.fillna(0)


#int_rate was loaded as an object data type instead of float due to the '%' character. Let's strip that out and convert the column type.
df.int_rate = pd.Series(df.int_rate).str.replace('%', '').astype(float)

#replace missing values for Interest Rate with mean value
df.int_rate=df.int_rate.fillna(float(df.int_rate.mean()))

#term was loaded as an object data type instead of int due to the ' months' character. Let's strip that out and convert the column type.
df.term=pd.Series(df.term).str.replace(' months', '')

#replace missing values for Term with max value
df.term=df.term.fillna(int(df['term'].value_counts().idxmax()))


Feature engineering using Technique 1 started


In [226]:
df.ix[:5,8:15]

Unnamed: 0,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status
1,Ryder,< 1 year,RENT,30000.0,Source Verified,Dec-2011,Charged Off
3,AIR RESOURCES BOARD,10+ years,RENT,49200.0,Source Verified,Dec-2011,Fully Paid
4,University Medical Group,1 year,RENT,80000.0,Source Verified,Dec-2011,Fully Paid


In [227]:
#get the total number of value and unique values, total values are 42538 and unique values are 30660
df.emp_title.shape
df.emp_title.unique().shape

#replace missing values for emp_title with Not available
df.emp_title=df.emp_title.fillna("Not available")

#replacing missing values with 0
df.emp_length.replace('n/a', np.nan,inplace=True)
df.emp_length.fillna(value=0,inplace=True)

#convert categorical value into numerical value
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'] = df['emp_length'].astype(int)

#replace missing values for verification_status with Not verified
df.verification_status=df.verification_status.fillna("Not verified")

#replace missing values for home_ownership with max value
df.home_ownership=df.home_ownership.fillna("OTHER")

# drop the record if the annual_inc value is missing 
df.annual_inc=df.annual_inc.dropna()

#replace missing values for issue_d with Not available
df.issue_d=df.issue_d.fillna("Not available")

#replace missing values for loan_status with Not available
df.loan_status=df.loan_status.fillna("Not available")


In [228]:
df.ix[:5,15:21]

Unnamed: 0,pymnt_plan,url,desc,purpose,title,zip_code
1,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/22/11 > I plan to use t...,car,bike,309xx
3,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/21/11 > to pay for prop...,other,personel,917xx
4,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/21/11 > I plan on combi...,other,Personal,972xx


In [229]:
#these four fields would not provide any important informaation thus we are dropping them
df.drop(['pymnt_plan','url','desc','title','zip_code' ],1, inplace=True)
#replace missing values for loan_status with Not available
df.purpose=df.purpose.fillna("Not available")

In [230]:
df.ix[:5,17:25]

Unnamed: 0,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record
1,1.0,0.0,Apr-1999,740.0,744.0,5.0,,
3,20.0,0.0,Feb-1996,690.0,694.0,1.0,35.0,
4,17.94,0.0,Jan-1996,695.0,699.0,0.0,38.0,


In [231]:
#replace missing values for loan_status with Not available
df.addr_state=df.addr_state.fillna("Not available")
#replace missing values for loan_status with Not available
df.delinq_2yrs=df.delinq_2yrs.fillna(0)

# drop the record if the fico_range_high and fico_range_low value is missing 
df.fico_range_low=df.fico_range_low.dropna()
df.fico_range_high=df.fico_range_high.dropna()


#FICO fico_range_low & fico_range_high scores on their own aren't as useful as a range thus we are considering its average
df['fico_range'] = df.fico_range_low.astype('str') + '-' + df.fico_range_high.astype('str')
df['meanfico'] = (df.fico_range_low + df.fico_range_high)/2
# drop the features that are not relevant
df.drop(['fico_range_low','fico_range_high','initial_list_status', 'mths_since_last_delinq','mths_since_last_record','pub_rec','open_acc'],1, inplace=True)

#replace missing values for inq_last_6mths with 0
df.inq_last_6mths=df.inq_last_6mths.fillna(0)

In [None]:


#from datetime import datetime

#df.earliest_cr_line = pd.to_datetime(df.earliest_cr_line)

#dttoday = datetime.now().strftime('%Y-%m-%d')
# There is a better way to do this :) 
#df.earliest_cr_line = df.earliest_cr_line.apply(lambda x: (
 #       np.timedelta64((x - pd.Timestamp(dttoday)),'D').astype(int))/-365)

#df.earliest_cr_line"""



In [None]:
#dti and open_acc is yet to be taken care @@@@

In [232]:
df.ix[:10,24:35]

Unnamed: 0,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt
1,0.0,0.0,1014.53,1014.53,456.46,435.17,0.0,122.9,1.11,Apr-2013,119.66
3,0.0,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,Jan-2015,357.48
4,0.0,0.0,4066.908161,4066.91,3000.0,1066.91,0.0,0.0,0.0,Jan-2017,67.3
6,0.0,0.0,10137.840008,10137.84,7000.0,3137.84,0.0,0.0,0.0,May-2016,1313.76
7,0.0,0.0,3939.135294,3939.14,3000.0,939.14,0.0,0.0,0.0,Jan-2015,111.34
9,0.0,0.0,1484.59,1477.7,673.48,533.42,0.0,277.69,2.52,Nov-2012,121.45
10,0.0,0.0,7678.017673,7678.02,6500.0,1178.02,0.0,0.0,0.0,Jun-2013,1655.54


In [233]:
#replace missing values for revol_bal with 0
df.revol_bal=df.revol_bal.fillna(0)

#replace missing values for revol_util with 0
df.revol_util=df.revol_util.fillna(0)

#replace missing values for total_acc with 0
df.total_acc=df.total_acc.fillna(0)


#revol_util was loaded as an object data type instead of float due to the '%' character. Let's strip that out and convert the column type.
df.revol_util = pd.Series(df.revol_util).str.replace('%', '').astype(float)



In [234]:
#on checking the value count we see that majority portion of data is inclined towards one value thus these columns do not provide any relevant information, thus we are dropping the columns

print(df.out_prncp_inv.value_counts())
print(df.out_prncp.value_counts())

df.drop(['out_prncp_inv','out_prncp'],1, inplace=True)

0.0       33040
1382.7        1
Name: out_prncp_inv, dtype: int64
0.00       33040
1384.03        1
Name: out_prncp, dtype: int64


In [235]:
#total_pymnt, total_pymnt_inv, total_rec_prncp, total_rec_int, total_rec_late_fee are not relevant in calculating the interest rate of the user, thus dropping them

df.drop(['total_pymnt','total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee','recoveries','collection_recovery_fee','next_pymnt_d','last_credit_pull_d'],1, inplace=True)


In [236]:
df.ix[:5,26:45]


Unnamed: 0,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens,fico_range,meanfico
1,499.0,0.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,740.0-744.0,742.0
3,604.0,600.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,690.0-694.0,692.0
4,694.0,690.0,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0,695.0-699.0,697.0


In [237]:
#calculating the last mean fico score
df['last_fico_range'] = df.last_fico_range_low.astype('str') + '-' + df.last_fico_range_high.astype('str')
df['last_meanfico'] = (df.last_fico_range_low + df.last_fico_range_high)/2
df.drop(['last_fico_range_high','last_fico_range_low','policy_code'],1, inplace=True)

In [238]:
#since the value count indicate majority of the data has just one value, we are dropping the column
print(df.collections_12_mths_ex_med.value_counts())
print(df.application_type.value_counts())
print(df.acc_now_delinq.value_counts())
print(df.chargeoff_within_12_mths.value_counts())
print(df.delinq_amnt.value_counts())
print(df.pub_rec_bankruptcies.value_counts())
print(df.tax_liens.value_counts())

df.drop(['acc_now_delinq','chargeoff_within_12_mths','delinq_amnt','pub_rec_bankruptcies','tax_liens','application_type','collections_12_mths_ex_med', 'grade'],1, inplace=True)
#since the highest and lowest fico score is already considered, we can drop this field
df.drop(['fico_range', 'last_fico_range'],1, inplace=True)


0.0    33019
Name: collections_12_mths_ex_med, dtype: int64
INDIVIDUAL    33041
Name: application_type, dtype: int64
0.0    33038
1.0        3
Name: acc_now_delinq, dtype: int64
0.0    33019
Name: chargeoff_within_12_mths, dtype: int64
0.0     33040
27.0        1
Name: delinq_amnt, dtype: int64
0.0    29989
1.0     1803
2.0        8
Name: pub_rec_bankruptcies, dtype: int64
0.0    33040
1.0        1
Name: tax_liens, dtype: int64


In [239]:
#alter the dtypes of the column
df.loan_amnt  =df.loan_amnt.astype(int)
df.funded_amnt  =df.funded_amnt.astype(int)
df.annual_inc  =df.annual_inc.astype(int)
df.delinq_2yrs  =df.delinq_2yrs.astype(int)
df.inq_last_6mths  =df.inq_last_6mths.astype(int)
df.revol_bal  =df.revol_bal.astype(int)
df.total_acc  =df.total_acc.astype(int)
df.meanfico  =df.meanfico.astype(int)
df.last_meanfico  =df.last_meanfico.astype(int)
  
#math.ceil(i*100)/100
ceil_function= lambda x: math.ceil(x*100)/100
df['funded_amnt_inv']=df['funded_amnt_inv'].apply(ceil_function)

#after carefully examining each field we have shortlisted 28 features listed below
tech1_df=df
print(tech1_df.columns)
print(tech1_df.head(5))
print("Feature engineering using Technique 1 finished")

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'loan_status',
       'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'revol_bal', 'revol_util', 'total_acc',
       'last_pymnt_d', 'last_pymnt_amnt', 'meanfico', 'last_meanfico'],
      dtype='object')
   loan_amnt  funded_amnt  funded_amnt_inv term  int_rate  installment  \
1       2500         2500           2500.0   60     15.27        59.83   
3      10000        10000          10000.0   36     13.49       339.31   
4       3000         3000           3000.0   60     12.69        67.79   
6       7000         7000           7000.0   60     15.96       170.08   
7       3000         3000           3000.0   36     18.64       109.43   

  sub_grade                  emp_title  emp_length home_ownership  \
1        C4                  

In [240]:


#using label encoder to convert categorical columns into numeric values
def dummyEncode(data):
        print(data.shape)
        columnsToEncode = list(data.select_dtypes(include=['category','object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                data[feature] = le.fit_transform(data[feature])
            except:
                print('Error encoding '+feature)
        return data

In [243]:
#validating the performance of the model using Technique 1 extracted features

print(df_orginal.id)
print(type(df_orginal.emp_title))
print(type(tech1_df.last_pymnt_d))

#encode categorical fetaures to numeric using Label Encoding technique
tech1_df=dummyEncode(tech1_df)
df_orginal = pd.read_csv("Data/LoanStats3a_securev1.csv",skiprows=1)
#if more than 50% values in an observation is NAN drop that observation
half_count = len(df_orginal.columns) / 2
df_orginal=df_orginal.dropna(axis='columns', how='all')
df_orginal = df_orginal.dropna(thresh=half_count)
df_orginal=dummyEncode(df_orginal)

AttributeError: 'Series' object has no attribute 'types'

AttributeError: 'DataFrame' object has no attribute 'id'

In [218]:
#Compute the corelation between the features to determine the relationship between all the features
correlations_technique1 = tech1_df.corr(method='pearson')
correlations_technique1.ix[4:5,10:20]


Unnamed: 0,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths
int_rate,0.057697,0.203557,0.004723,-0.223102,-0.009468,-0.00105,0.122752,0.15693,-0.006197,0.178261


In [146]:
# correlations_technique2 = df_orginal.corr(method='pearson')
# correlations_technique2.ix[5:6,10:20]

Unnamed: 0,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,purpose,zip_code,addr_state
int_rate,0.03068,0.059134,0.211095,0.010701,-0.221493,,0.002916,-0.00857,0.004878,-0.001141


In [194]:
def random_split_method(data):
    X_train, X_test, Y_train, Y_test = train_test_split(data.ix[:, data.columns != 'int_rate'], data.int_rate, test_size=0.2, random_state=0)
    return X_train, X_test, Y_train, Y_test

In [195]:
def linear_regression_model(x_train, x_test, y_train, y_test):
    print("Starting Linear Regression algorithm")
    linear_reg = LinearRegression()
    fit=linear_reg.fit(x_train, y_train)

    print ("Intercept is ",linear_reg.intercept_)
    print("Coefficient is ",linear_reg.coef_)
    #print(lm.predict([18,3,0,4]))
    print("Training score is ",linear_reg.score(x_train, y_train))

    #np.mean((linear_reg.predict(X_test)-Y_test)**2)
    print("Testing score is ",linear_reg.score(x_test, y_test))


In [196]:
#by checking the correaltion between these fetaures we can choose the features
tech1_list_of_variables=['loan_amnt','funded_amnt_inv','term','grade','sub_grade','verification_status','revol_util']
tech1_df=tech1_df[tech1_list_of_variables]

#random split method for creating the training and test splits
X_train, X_test, Y_train, Y_test = random_split_method(tech1_df)
linear_regression_model(X_train, X_test, Y_train, Y_test)


KeyError: "['grade'] not in index"

In [None]:
#Note: Refer to the report for detail description of the techniques
print("Feature engineering using Technique 2 started")

cols = df.columns
# #getting the list of features that are numeric
# num_cols_list = df._get_numeric_data().columns

# #getting the list of features that are categorical
# cat_cols_list=list(set(cols) - set(num_cols_list))
# #print(df[cat_cols_list])

# #print("Next one")
# #print(df[num_cols_list])
# #X=df[num_cols_list].ix[:, df[num_cols_list].columns != 'int_rate']


# df[cat_cols_list]=dummyEncode(df[cat_cols_list])



# X=df.ix[:, df.columns != 'int_rate']
# Y=df.int_rate

# X=X.as_matrix()
# Y=Y.as_matrix()

# #Describe each features distribution  @@@ uncomment this later
# #print(df.describe())

# #Compute the corelation between the features to determine the relationship between all the features
# correlations = df.corr(method='pearson')
# #print(correlations.ix[4:5,25:35])

# new_df= pd.read_csv("Data/LoanStats3a_securev1.csv",skiprows=1)
# new_df=new_df.dropna(axis='columns', how='all')
# new_df = new_df.dropna()
# new_df=dummyEncode(new_df)

# correlations2 = new_df.corr(method='pearson')
# correlations2.ix[6:7,59:65]



In [None]:
# #@@@not working

# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# feature_test = SelectKBest(score_func=chi2, k=4)

# fit = feature_test.fit(X, Y)

# print("Selected Features: %s") % fit.support_

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
knn = KNeighborsClassifier(n_neighbors=4)
lr = LinearRegression()
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

lr = LinearRegression()

sfs = SFS(lr, 
          k_features=13, 
          forward=True, 
          floating=False, 
          scoring='neg_mean_squared_error',
          cv=10)

sfs = sfs.fit(X, Y)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#Funded_amt_inv
#Term
#Grade
#Subgrade
#Dti
#Delinq_2_yrs
#Total_payment_inv
#Total_rec_int


# new_df= pd.read_csv("Data/LoanStats3a_securev1.csv",skiprows=1)
# new_df=new_df.dropna(axis='columns', how='all')
# new_df = new_df.dropna()
# new_df=dummyEncode(new_df)

#Y_train=df.int_rate
my_list=['loan_amnt','funded_amnt_inv','term','grade','sub_grade','verification_status','revol_util']

#X_train=df
#random split method for creating the training and test splits
X_train, X_test, Y_train, Y_test = train_test_split(df.ix[:, df.columns != 'int_rate'], df.int_rate, test_size=0.2, random_state=0)

print(len(df))
print(len(Y_train))

print(len(X_test))
print(len(Y_test))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import cross_validation

print("Starting Linear Regression algorithm")
linear_reg = LinearRegression()
fit=linear_reg.fit(X_train, Y_train)

print ("Intercept is ",linear_reg.intercept_)
print("Coefficient is ",linear_reg.coef_)
#print(lm.predict([18,3,0,4]))
print("Training score is ",linear_reg.score(X_train, Y_train))

#np.mean((linear_reg.predict(X_test)-Y_test)**2)
print("Testing score is ",linear_reg.score(X_test, Y_test))

train_scores = cross_val_score(fit, df.ix[:, df.columns != 'int_rate'], df.int_rate, cv=10)
print(train_scores)

#Simple K-Fold cross validation. 10 folds.
print(X_train.shape)
cv = cross_validation.KFold(len(X_train), n_folds=10)

results = []


# X_train=X_train.as_matrix()
# Y_train=Y_train.as_matrix()

# "Error_function" can be replaced by the error function of your analysis
for traincv, testcv in cv:
        print(X_train[800])
        probas = linear_reg.fit(X_train[traincv], Y_train[traincv])
        #results.append( Error_function )
        
print ("Results: " + str( np.array(probas).mean() ))

