In [34]:
# data visualization and utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import graphviz
%matplotlib inline

In [35]:
# classifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN

In [36]:
# evaluation
from sklearn.metrics import classification_report, confusion_matrix, precision_score
from sklearn.model_selection import KFold, GridSearchCV

In [37]:
# data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [38]:
# Raw loans table; its 'repayment_interval' column is used as the
# classification target further down in this notebook.
LOAN_CSV = 'kiva_loans.csv'
loan = pd.read_csv(LOAN_CSV)

In [39]:
# Companion table that the feature matrix is built from.
# NOTE(review): presumably a standardized version of kiva_loans.csv with the
# same row order -- confirm against whatever produced this file.
LOAN_STD_CSV = 'kiva_loans_standardized.csv'
loan_std = pd.read_csv(LOAN_STD_CSV)

In [40]:
# Start from every column of loan_std and drop the repayment_interval_*
# columns. They look like one-hot encodings of the target
# ('repayment_interval'), so keeping them would leak the label into X.
selected_features = list(loan_std.columns)
for target_dummy in (
    'repayment_interval_irregular',
    'repayment_interval_monthly',
    'repayment_interval_weekly',
    'repayment_interval_bullet',
):
    # list.remove raises ValueError if the column is absent.
    selected_features.remove(target_dummy)

In [41]:
# Target labels come from the raw loans table.
y = loan['repayment_interval']
# Feature matrix: the columns of loan_std listed in selected_features.
X = loan_std[selected_features]

# X and y are read from two different CSV files and are paired purely by row
# position (the cross-validation cells index both with .iloc). Fail loudly if
# the row counts differ, since that would silently pair features with the
# wrong labels.
assert len(X) == len(y), (
    'kiva_loans.csv and kiva_loans_standardized.csv must have the same number of rows'
)

In [42]:
# Re-wrap X as a DataFrame.
# NOTE(review): X was built by selecting a list of columns from loan_std, which
# already yields a DataFrame, so this looks redundant -- confirm before removing.
X = pd.DataFrame(data=X)

In [45]:
# Perceptron baseline evaluated with 10-fold cross-validation.
# random_state is pinned because Perceptron shuffles the training data between
# epochs; without a seed the reported metrics change from run to run.
model = Perceptron(max_iter=1000, tol=1e-3, random_state=42)

# Out-of-fold labels pooled over all folds, so the final report scores every
# row exactly once.
ALL_TRUE_LABEL = []
ALL_PRED_LABEL = []
# NOTE(review): KFold is unshuffled and unstratified here; with a class as rare
# as 'weekly', StratifiedKFold may give more stable per-class scores.
kf = KFold(n_splits=10)

for i, (train_index, test_index) in enumerate(kf.split(X)):

    # Positional indexing: X and y are aligned by row order.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # fit() re-trains from scratch, so reusing `model` across folds is safe.
    model.fit(X_train, y_train)
    ALL_PRED_LABEL.extend(model.predict(X_test))
    ALL_TRUE_LABEL.extend(y_test)

    # Progress marker; each fold can take a while on the full data set.
    print('Finish Test Iteration ', i)

print(classification_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
print(confusion_matrix(ALL_TRUE_LABEL, ALL_PRED_LABEL))

Finish Test Iteration  0
Finish Test Iteration  1
Finish Test Iteration  2
Finish Test Iteration  3
Finish Test Iteration  4
Finish Test Iteration  5
Finish Test Iteration  6
Finish Test Iteration  7
Finish Test Iteration  8
Finish Test Iteration  9


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      bullet       0.67      0.65      0.66     70728
   irregular       0.82      0.73      0.77    257158
     monthly       0.77      0.84      0.81    342717
      weekly       0.00      0.00      0.00       602

   micro avg       0.78      0.78      0.78    671205
   macro avg       0.57      0.56      0.56    671205
weighted avg       0.78      0.78      0.78    671205

[[ 45943   5301  19484      0]
 [  4827 188435  63896      0]
 [ 17581  36624 288512      0]
 [     0     16    586      0]]


In [46]:
# One-vs-rest logistic regression (L-BFGS solver), 10-fold cross-validation.
model = LogisticRegression(multi_class='ovr', solver='lbfgs')
kf = KFold(n_splits=10)

# Pooled out-of-fold labels: ground truth and the matching predictions.
ALL_TRUE_LABEL, ALL_PRED_LABEL = [], []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Slice features and labels by position for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    fold_predictions = model.predict(X_test)

    ALL_TRUE_LABEL.extend(y_test)
    ALL_PRED_LABEL.extend(fold_predictions)

    # Progress marker; each fold can take a while on the full data set.
    print('Finish Test Iteration ', i)

# Per-class precision/recall/F1, followed by the raw confusion matrix.
print(classification_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
print(confusion_matrix(ALL_TRUE_LABEL, ALL_PRED_LABEL))



Finish Test Iteration  0




Finish Test Iteration  1




Finish Test Iteration  2




Finish Test Iteration  3




Finish Test Iteration  4




Finish Test Iteration  5




Finish Test Iteration  6




Finish Test Iteration  7




Finish Test Iteration  8




Finish Test Iteration  9


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      bullet       0.75      0.70      0.72     70728
   irregular       0.85      0.77      0.81    257158
     monthly       0.80      0.87      0.83    342717
      weekly       0.00      0.00      0.00       602

   micro avg       0.81      0.81      0.81    671205
   macro avg       0.60      0.58      0.59    671205
weighted avg       0.81      0.81      0.81    671205

[[ 49194   3840  17694      0]
 [  2678 197671  56809      0]
 [ 13387  30532 298798      0]
 [     0     31    571      0]]


In [47]:
# Decision tree with default hyper-parameters, 10-fold cross-validation.
# random_state is pinned because the tree builder permutes features at each
# split, so an unseeded tree (and its scores) can differ between runs.
model = DTC(random_state=42)

# Out-of-fold labels pooled over all folds, so the final report scores every
# row exactly once.
ALL_TRUE_LABEL = []
ALL_PRED_LABEL = []
kf = KFold(n_splits=10)

for i, (train_index, test_index) in enumerate(kf.split(X)):

    # Positional indexing: X and y are aligned by row order.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # fit() rebuilds the tree from scratch for every fold.
    model.fit(X_train, y_train)
    ALL_PRED_LABEL.extend(model.predict(X_test))
    ALL_TRUE_LABEL.extend(y_test)

    # Progress marker; each fold can take a while on the full data set.
    print('Finish Test Iteration ', i)

print(classification_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
print(confusion_matrix(ALL_TRUE_LABEL, ALL_PRED_LABEL))

Finish Test Iteration  0
Finish Test Iteration  1
Finish Test Iteration  2
Finish Test Iteration  3
Finish Test Iteration  4
Finish Test Iteration  5
Finish Test Iteration  6
Finish Test Iteration  7
Finish Test Iteration  8
Finish Test Iteration  9
              precision    recall  f1-score   support

      bullet       0.84      0.85      0.85     70728
   irregular       0.89      0.91      0.90    257158
     monthly       0.92      0.90      0.91    342717
      weekly       0.73      0.70      0.72       602

   micro avg       0.90      0.90      0.90    671205
   macro avg       0.85      0.84      0.84    671205
weighted avg       0.90      0.90      0.90    671205

[[ 60014   2584   8120     10]
 [  2453 234912  19677    116]
 [  8749  24933 309006     29]
 [     3    149     26    424]]


In [48]:
# Small, shallow random forest (10 trees, depth 3), 10-fold cross-validation.
# random_state is pinned because bootstrap sampling and per-split feature
# sampling are random; without a seed the metrics change from run to run.
model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)

# Out-of-fold labels pooled over all folds, so the final report scores every
# row exactly once.
ALL_TRUE_LABEL = []
ALL_PRED_LABEL = []
kf = KFold(n_splits=10)

for i, (train_index, test_index) in enumerate(kf.split(X)):

    # Positional indexing: X and y are aligned by row order.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # fit() regrows the whole forest for every fold.
    model.fit(X_train, y_train)
    ALL_PRED_LABEL.extend(model.predict(X_test))
    ALL_TRUE_LABEL.extend(y_test)

    # Progress marker; each fold can take a while on the full data set.
    print('Finish Test Iteration ', i)

print(classification_report(ALL_TRUE_LABEL, ALL_PRED_LABEL))
print(confusion_matrix(ALL_TRUE_LABEL, ALL_PRED_LABEL))

Finish Test Iteration  0
Finish Test Iteration  1
Finish Test Iteration  2
Finish Test Iteration  3
Finish Test Iteration  4
Finish Test Iteration  5
Finish Test Iteration  6
Finish Test Iteration  7
Finish Test Iteration  8
Finish Test Iteration  9


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      bullet       1.00      0.03      0.07     70728
   irregular       0.86      0.60      0.70    257158
     monthly       0.65      0.94      0.77    342717
      weekly       0.00      0.00      0.00       602

   micro avg       0.71      0.71      0.71    671205
   macro avg       0.63      0.39      0.38    671205
weighted avg       0.77      0.71      0.67    671205

[[  2428   2635  65665      0]
 [     0 153012 104146      0]
 [     0  22259 320458      0]
 [     0    183    419      0]]
