In [1]:
# data visualization and utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# classifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVR

In [3]:
# evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score

In [4]:
# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [5]:
def draw(row, labels):
    # draw a label from a row with the corrsponding weight
    if len(row) != len(labels):
        print('length of row does not match length of labels')
        return 
    else:
        count_positive = sum([1 if each >= 0 else 0 for each in row])
        if count_positive == len(row):
            # all positive            
            sample = np.random.uniform(0, sum(row))
            pointer = 0
            for i in range(len(row)):
                if pointer + row[i] > sample:            
                    return y_labels[i]
                else:
                    pointer += row[i]
        elif count_positive > 0:
            temp_row = [each if each > 0 else 0 for each in row]
            sample = np.random.uniform(0, sum(temp_row))
            pointer = 0
            for i in range(len(temp_row)):
                if pointer + temp_row[i] > sample:
                    return y_labels[i]
                else:
                    pointer += temp_row[i]
        else:
            # all negative
            base = min(row)
            temp_row = [each - base for each in row]
            sample = np.random.uniform(0, sum(temp_row))
            pointer = 0
            for i in range(len(temp_row)):
                if pointer + temp_row[i] > sample:
                    return y_labels[i]
                else:
                    pointer += temp_row[i]

In [6]:
def convert(row,labels):
    if len(row) != len(labels):
        print('length of row does not match length of labels')
        return 
    else:
        max_value = row[0]
        max_index = 0
        for i in range(len(row)):
            if row[i] > max_value:
                max_value = row[i]
                max_index = i
        return labels[max_index]

In [7]:
loan_raw = pd.read_csv('kiva_loans.csv')

In [8]:
loan_std = pd.read_csv('kiva_loans_standardized.csv')

In [9]:
dataframe_features = list(loan_std.columns)

In [10]:
dataframe_score = ['repayment_interval_bullet','repayment_interval_monthly','repayment_interval_weekly','repayment_interval_irregular']

In [11]:
for i in range(len(dataframe_score)):
    dataframe_features.remove(dataframe_score[i])

In [12]:
X = loan_std[dataframe_features]
y = loan_std[dataframe_score]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [14]:
dr = DecisionTreeRegressor()

In [15]:
dr.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
y_revise = dr.predict(X_train)

In [18]:
y_labels = ['bullet','monthly','weekly','irregular']

In [19]:
y_raw_label = [convert(row, y_labels) for row in y_revise]   

In [20]:
y_true_label = [convert(row, y_labels) for row in y_train.values.tolist()]

In [30]:
y_corr = confusion_matrix(y_true_label, y_raw_label)
y_corr = y_corr.T
y_corr = [row/sum(row) for row in y_corr]
y_corr = np.array(y_corr)
y_corr_new = []
for i in range(len(y_corr)):
    temp = y_corr[i][i] / sum(y_corr[i])
    y_corr_new.append(temp) 

In [33]:
print(y_corr_new)

[0.9679821524732672, 0.9738090923994167, 0.9812189427829875, 0.8881469115191987]


In [34]:
print(classification_report(y_true_label, y_raw_label))

              precision    recall  f1-score   support

      bullet       0.97      0.99      0.98     63732
   irregular       0.97      0.98      0.97    231399
     monthly       0.98      0.98      0.98    308406
      weekly       0.89      0.97      0.93       547

   micro avg       0.98      0.98      0.98    604084
   macro avg       0.95      0.98      0.96    604084
weighted avg       0.98      0.98      0.98    604084



In [35]:
print(confusion_matrix(y_true_label, y_raw_label))

[[ 62914    185    633      0]
 [   482 225727   5125     65]
 [  1599   5873 300932      2]
 [     0     13      2    532]]


In [37]:
y_bayes_train = []
for row in y_revise:
    y_bayes_train.append([y_corr_new[i] * row[i] for i in range(len(row))])

In [38]:
y_bayes_train[0:5]

[[0.9679821524732672, 0.0, 0.0, 0.0],
 [0.0, 0.9738090923994167, 0.0, 0.0],
 [0.9679821524732672, 0.0, 0.0, 0.0],
 [0.0, 0.9738090923994167, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.8881469115191987]]

In [39]:
# y_bayes_label_train = [draw(row, y_labels) for row in y_bayes_train]
y_bayes_label_train = [convert(row, y_labels) for row in y_bayes_train]

In [40]:
y_bayes_label_train[0:5]

['bullet', 'monthly', 'bullet', 'monthly', 'irregular']

In [41]:
print(classification_report(y_true_label, y_bayes_label_train))

              precision    recall  f1-score   support

      bullet       0.98      0.97      0.98     63732
   irregular       0.97      0.98      0.97    231399
     monthly       0.98      0.98      0.98    308406
      weekly       0.89      0.98      0.93       547

   micro avg       0.98      0.98      0.98    604084
   macro avg       0.96      0.98      0.96    604084
weighted avg       0.98      0.98      0.98    604084



In [42]:
print(confusion_matrix(y_true_label, y_bayes_label_train))

[[ 61980    185   1567      0]
 [   428 225663   5243     65]
 [   665   5813 301924      4]
 [     0     13      0    534]]


In [45]:
y_pred = dr.predict(X_test)

In [46]:
y_pred_label = [convert(row, y_labels) for row in y_pred]   

In [47]:
y_test_label = [convert(row, y_labels) for row in y_test.values.tolist()]

In [48]:
print(classification_report(y_test_label, y_pred_label))

              precision    recall  f1-score   support

      bullet       0.86      0.88      0.87      6996
   irregular       0.91      0.91      0.91     25759
     monthly       0.92      0.92      0.92     34311
      weekly       0.72      0.84      0.77        55

   micro avg       0.91      0.91      0.91     67121
   macro avg       0.86      0.89      0.87     67121
weighted avg       0.91      0.91      0.91     67121



In [49]:
print(confusion_matrix(y_test_label, y_pred_label))

[[ 6164   172   659     1]
 [  234 23568  1942    15]
 [  741  2036 31532     2]
 [    0     8     1    46]]


In [53]:
y_bayes_test = []
for row in y_pred:
    y_bayes_test.append([y_corr_new[i] * row[i] for i in range(len(row))])

In [54]:
y_bayes_label_test = [convert(row, y_labels) for row in y_bayes_test]

In [55]:
print(classification_report(y_test_label, y_bayes_label_test))

              precision    recall  f1-score   support

      bullet       0.87      0.87      0.87      6996
   irregular       0.91      0.91      0.91     25759
     monthly       0.92      0.92      0.92     34311
      weekly       0.72      0.84      0.77        55

   micro avg       0.91      0.91      0.91     67121
   macro avg       0.86      0.89      0.87     67121
weighted avg       0.91      0.91      0.91     67121



In [56]:
print(confusion_matrix(y_test_label, y_bayes_label_test))

[[ 6090   172   733     1]
 [  219 23563  1962    15]
 [  666  2034 31609     2]
 [    0     8     1    46]]


In [60]:
precision_score(y_test_label, y_pred_label,average = 'weighted')

0.9135846811889121

In [61]:
precision_score(y_test_label, y_bayes_label_test,average = 'weighted')

0.913409234494649

In [156]:
vector = y_revise[3]

In [157]:
vector

array([0.        , 0.15151515, 0.        , 0.84848485])

In [158]:
y_corr

array([[ 62657,    171,    635,      0],
       [   486, 225489,   5162,     65],
       [  1594,   5852, 301441,      2],
       [     0,     12,      1,    517]], dtype=int64)

In [165]:
result = np.transpose(np.dot(y_corr, np.transpose(vector)))

In [166]:
result

array([2.59090909e+01, 3.42201515e+04, 8.88363636e+02, 4.40484848e+02])

array([0., 1., 0., 0.])

In [152]:
y_revise[4]

array([0., 1., 0., 0.])

In [155]:
y_revise[3]

array([0.        , 0.15151515, 0.        , 0.84848485])

In [162]:
y_train.iloc[3]

repayment_interval_bullet       0
repayment_interval_monthly      0
repayment_interval_weekly       0
repayment_interval_irregular    1
Name: 621599, dtype: int64

In [169]:
y_corr_norm = [row/sum(row) for row in y_corr]

In [170]:
y_corr_norm

[array([0.98729969, 0.00269448, 0.01000583, 0.        ]),
 array([2.10205794e-03, 9.75290006e-01, 2.23267965e-02, 2.81139437e-04]),
 array([5.16042980e-03, 1.89453169e-02, 9.75887778e-01, 6.47481781e-06]),
 array([0.        , 0.02264151, 0.00188679, 0.9754717 ])]

In [171]:
np.dot(y_corr_norm, vector)

array([4.08255061e-04, 1.48009756e-01, 2.87599635e-03, 8.31103488e-01])

In [182]:
8.31103488e-01 / 1.48009756e-01

5.615193960592706

In [183]:
0.84848485 / 0.15151515

5.600000066000001

In [185]:
np.var(np.dot(y_corr_norm, vector))

0.1178430309129251

In [186]:
np.var(vector)

0.12322084481175391