# Import Library

In [1]:
# data visualization and utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# classifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVR

In [3]:
# evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score

In [4]:
# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [5]:
def regression_report(y_test, y_pred):
    
    space_tab = 3
    
    if len(y_test) != len(y_pred):
        return 'length of true labels and predicted labels are not equal.'
    
    report = []
    
    # labels
    labels = set(y_test)
    for label in labels:
        test = [1 if each==label else 0 for each in y_test]
        pred = [1 if each==label else 0 for each in y_pred]
        report.append([
            label, 
            round(r2_score(test,pred), 3), 
            round(mean_squared_error(test, pred),3), 
            round(explained_variance_score(test,pred),3), 
            y_test.count(label)
        ])
    
    # macro
    macro = [
        'macro avg', 
        round(sum([row[1] for row in report]) / len(labels),3), 
        round(sum([row[2] for row in report]) / len(labels),3), 
        round(sum([row[3] for row in report]) / len(labels),3), 
        sum([row[4] for row in report])
    ]           
    
    # micro    
    diff = [1 if y_test[i]==y_pred[i] else 0 for i in range(len(y_test))]
    same = [1] * len(y_test)
    micro = [
        'micro avg', 
        round(r2_score(diff,same),3), 
        round(mean_squared_error(diff,same), 3),
        round(explained_variance_score(diff,same), 3),
        len(y_test)
    ]
    
    #formatting
    space = ['    ', '    ', '    ', '    ', '    ']    
    header = ['    ', 'r2_score', 'mean_squared_error', 'explained_variance_score', 'support']
    
    # add all the things    
    report.insert(0,space)
    report.insert(0,header)
    report.append(space)
    report.append(micro)
    report.append(macro)        
    
    result = ''
    
    col = []
    for i in range(len(report[0])):
        col.append(max([len(str(row[i])) for row in report])+space_tab)
    
    for row in report:
        for i in range(len(row)):
            result += str(row[i]).rjust(col[i], ' ')
        result += '\n'
    return result   

In [6]:
def randomUniform(row):
    prob = np.random.uniform(0, sum(row))
    progress = 0
    for i in range(len(row)):
        if progress + row[i] > prob:
#             print(i)
            return int(i)
#             break
        progress += row[i]

# Import Dataset

In [7]:
loan_raw = pd.read_csv('kiva_loans.csv')

In [None]:
loan_cod = pd.read_csv('kiva_loans_dummied.csv')

In [None]:
loan_std = pd.read_csv('kiva_loans_standardized.csv')

# Model Selection

## Standardized and One-hot Encoded Dataset

In [None]:
label_features = ['repayment_interval_irregular', 'repayment_interval_monthly', 'repayment_interval_weekly', 'repayment_interval_bullet']
predict_features = list(loan_std.columns)
for each in label_features:
    predict_features.remove(each)

In [None]:
y = loan_std[label_features]
X_std = loan_std[selected_features]

In [None]:
X_std.head()

In [None]:
y.head()

In [308]:
model = LinearRegression()
X = X_std
# y = y
ALL_TRUE_LABEL = []
ALL_PRED_LABEL = []
kf = KFold(n_splits=10)
i = 0
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index].values.tolist(), y.iloc[test_index].values.tolist()

    model.fit(X_train,y_train)
    
    ALL_PRED_LABEL.extend(model.predict(X_test))
    ALL_TRUE_LABEL.extend(y_test)

    # Screen Output for tracking the progress, sometimes I wait too long......
    print('Finish Test Iteration ',i)
    i += 1
#     break

# print(classification_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
# print(regression_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
# print(confusion_matrix(ALL_TRUE_LABEL, ALL_PRED_LABEL))

Finish Test Iteration  0
Finish Test Iteration  1
Finish Test Iteration  2
Finish Test Iteration  3
Finish Test Iteration  4
Finish Test Iteration  5
Finish Test Iteration  6
Finish Test Iteration  7
Finish Test Iteration  8
Finish Test Iteration  9


In [309]:
choices = ['repayment_interval_irregular','repayment_interval_monthly','repayment_interval_weekly','repayment_interval_bullet']

reg_label = [np.where(row == np.amax(row))[0] for row in ALL_PRED_LABEL]
reg_label = [choices[each[0]] for each in reg_label]

tru_label = [np.where(row == np.amax(row))[0] for row in ALL_TRUE_LABEL]
tru_label = [choices[each[0]] for each in tru_label]

In [310]:
print(classification_report(tru_label,reg_label))
print(confusion_matrix(tru_label,reg_label))

  'precision', 'predicted', average, warn_for)


                              precision    recall  f1-score   support

   repayment_interval_bullet       0.72      0.60      0.65     70728
repayment_interval_irregular       0.86      0.75      0.80    257158
  repayment_interval_monthly       0.78      0.87      0.82    342717
   repayment_interval_weekly       0.00      0.00      0.00       602

                   micro avg       0.80      0.80      0.80    671205
                   macro avg       0.59      0.56      0.57    671205
                weighted avg       0.80      0.80      0.80    671205

[[ 42714   4231  23783      0]
 [  1666 194122  61370      0]
 [ 15340  27960 299417      0]
 [     0     17    585      0]]


In [311]:
bayes_filter = confusion_matrix(tru_label,reg_label)

In [316]:
bayes_label = [np.dot(bayes_filter, row) for row in ALL_PRED_LABEL]
bayes_label = [randomUniform(row) for row in bayes_label]

In [317]:
bayes_label[0:5]

[1, 2, 1, 1, 1]

In [318]:
# bayes_labels = [choices[i] for i in bayes_label]
bayes_labels = []
step = 0
for i in bayes_label:
    if i == 0:
        bayes_labels.append(choices[0])
    elif i == 1:
        bayes_labels.append(choices[1])
    elif i == 2:
        bayes_labels.append(choices[2])
    else:
        bayes_labels.append(choices[3])
    step += 1
bayes_label = bayes_labels
print(step)

671205


In [319]:
print(classification_report(tru_label,bayes_labels))
print(confusion_matrix(tru_label,bayes_label))

                              precision    recall  f1-score   support

   repayment_interval_bullet       0.98      0.02      0.04     70728
repayment_interval_irregular       0.73      0.42      0.53    257158
  repayment_interval_monthly       0.64      0.78      0.71    342717
   repayment_interval_weekly       0.00      0.16      0.00       602

                   micro avg       0.56      0.56      0.56    671205
                   macro avg       0.59      0.35      0.32    671205
                weighted avg       0.71      0.56      0.57    671205



array([[  1538,  12138,  48234,   8818],
       [    12, 107768,  99600,  49778],
       [    25,  27521, 268474,  46697],
       [     0,     80,    424,     98]], dtype=int64)

In [114]:
bayes_filter

array([[ 1389,   681,  3034,     0],
       [  277, 16876,  9766,     0],
       [ 2669,  2267, 30109,     0],
       [    0,     2,    51,     0]], dtype=int64)

In [115]:
ALL_TRUE_LABEL[0]

[1, 0, 0, 0]

In [116]:
ALL_PRED_LABEL[0]

array([ 0.258955  ,  0.76438522, -0.00121742, -0.02210999])

In [119]:
temp2 = np.dot(bayes_filter, ALL_PRED_LABEL[0])
temp2

array([8.76541168e+02, 1.29596062e+04, 2.38735676e+03, 1.46668178e+00])

In [132]:
prob = np.random.uniform(0, sum(temp2))
progress = 0
for i in range(len(temp2)):
    if progress + temp2[i] > prob:
        print(i)
        break
    progress += temp2[i]

1
