In [44]:
# Import library

import pandas as pd 
import numpy as np

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import   MinMaxScaler

In [32]:
df = pd.read_csv('processed_data.csv')
df.head()

Unnamed: 0,duration,goal_usd,status,usd_pledged,US_based,comics,crafts,dance,design,fashion,...,3,4,5,6,7,8,9,10,11,12
0,16.0,2000.0,1,6061.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,30.0,3870.99771,1,3914.50512,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,30.0,1100.0,1,1110.0,1,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,45.0,3500.0,1,4807.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,60.0,30000.0,1,40368.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# drop 'usd_pledged' column 
df.drop ('usd_pledged', axis=1, inplace = True)

# train-test-split
y = df['status']
X = df.drop ('status', axis=1)
x_train, x_test, y_train, y_test = train_test_split(X,y,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

In [34]:
# Min-max normalization scaling for independent variable as model like logistic regression 
# use a weighted sum of input variables will be affected

class_scl = MinMaxScaler()
x_train = class_scl.fit_transform(x_train)
x_test = class_scl.transform(x_test)

In [35]:
## Define function for performance result

# Function to print KFold Cross validation performance on train set 
def KFoldresult_5fold(model, x_train, y_train, is_logreg):
    accuracy = cross_val_score (model, x_train,y_train, cv=5)
    print (model)
    print (f'KFolds cross validation: \n {accuracy} \n')
    print (f'Mean accuracy: \n {accuracy.mean()}\n')
    print ('Coefficient of feature: \n' )
    if is_logreg:
        for index, co in enumerate(model.coef_[0]):
            print (f'Feature {index}: {co:.5f}')
    else:
        for index, co in enumerate(model.feature_importances_):
            print (f'Feature {index}: {co:.5f}')
    return accuracy

# Function to return prediction and print prediction result on test set 
def predictionresult(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print (f'Confusion_matrix: \n {confusion_matrix(y_test, y_pred)} \n')
    print (f'Classification report: \n {classification_report(y_test,y_pred)} \n')
    return y_pred

# Function to print out Grid Search parameters: 
def gridsearch(model, parameters, X_train, y_train):
    search = GridSearchCV(model, parameters, n_jobs=-1)
    search.fit(x_train,y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

def performace(y_ture, y_pred):
    return [accuracy_score(y_ture, y_pred), recall_score(y_ture, y_pred), precision_score(y_ture, y_pred), f1_score(y_ture, y_pred)]

# Logistic Regression 

In [36]:
# Create model -log
log = LogisticRegression()
log.fit (x_train,y_train)

KFoldresult_5fold(log, x_train, y_train, True)

LogisticRegression()
KFolds cross validation: 
 [0.64930263 0.65278032 0.65709997 0.65585533 0.65284083] 

Mean accuracy: 
 0.6535758160912685

Coefficient of feature: 

Feature 0: -2.35682
Feature 1: -7.41670
Feature 2: 0.12108
Feature 3: 1.12745
Feature 4: -0.58436
Feature 5: 1.13096
Feature 6: 0.28714
Feature 7: -0.03949
Feature 8: 0.11382
Feature 9: -1.09049
Feature 10: 0.13950
Feature 11: -1.07439
Feature 12: 0.48318
Feature 13: -0.53232
Feature 14: 0.58867
Feature 15: -0.77669
Feature 16: 0.30255
Feature 17: 0.03061
Feature 18: 0.15187
Feature 19: 0.09966
Feature 20: 0.04089
Feature 21: 0.02668
Feature 22: -0.12479
Feature 23: -0.08045
Feature 24: 0.06201
Feature 25: 0.12457
Feature 26: 0.09394
Feature 27: -0.03656


array([0.64930263, 0.65278032, 0.65709997, 0.65585533, 0.65284083])

In [37]:
log_y_predict = predictionresult(log, x_test, y_test)

Confusion_matrix: 
 [[ 7045  7857]
 [ 3897 15347]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.64      0.47      0.55     14902
           1       0.66      0.80      0.72     19244

    accuracy                           0.66     34146
   macro avg       0.65      0.64      0.63     34146
weighted avg       0.65      0.66      0.65     34146
 



In [38]:
# Optimizing hyperparameters
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train, y_train)

        nan 0.65345867        nan 0.65351724        nan 0.65350992
        nan 0.65352457        nan 0.65355385        nan 0.65356117
        nan 0.65357582]


Parameter tested: {'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'penalty': ['l1', 'l2']}
Best Score : 0.6535758160912685
Best parameters: {'C': 1.0, 'penalty': 'l2'}


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'penalty': ['l1', 'l2']})

#### Apply the best parameters {'C': 1.0, 'penalty': 'l2'} 
 - best parameters is same as default parameters for model 'log'

## Random Forest Classifier

In [39]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)

RandomForestClassifier(n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.63810082 0.64059011 0.64187136 0.64454369 0.64072339] 

Mean accuracy: 
 0.6411658727048033

Coefficient of feature: 

Feature 0: 0.25277
Feature 1: 0.53021
Feature 2: 0.01631
Feature 3: 0.00955
Feature 4: 0.00642
Feature 5: 0.00506
Feature 6: 0.00517
Feature 7: 0.00474
Feature 8: 0.00745
Feature 9: 0.02483
Feature 10: 0.00573
Feature 11: 0.00902
Feature 12: 0.00889
Feature 13: 0.00667
Feature 14: 0.00868
Feature 15: 0.01842
Feature 16: 0.00371
Feature 17: 0.00698
Feature 18: 0.00647
Feature 19: 0.00693
Feature 20: 0.00754
Feature 21: 0.00739
Feature 22: 0.00662
Feature 23: 0.00646
Feature 24: 0.00723
Feature 25: 0.00712
Feature 26: 0.00730
Feature 27: 0.00634


array([0.63810082, 0.64059011, 0.64187136, 0.64454369, 0.64072339])

In [40]:
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

Confusion_matrix: 
 [[ 8264  6638]
 [ 5480 13764]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.60      0.55      0.58     14902
           1       0.67      0.72      0.69     19244

    accuracy                           0.65     34146
   macro avg       0.64      0.63      0.64     34146
weighted avg       0.64      0.65      0.64     34146
 



In [41]:
param = {'max_depth': list(range(1,10))}
gridsearch (ranforest, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
Best Score : 0.6731535057613895
Best parameters: {'max_depth': 9}


GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1, param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

In [42]:
ranforest = RandomForestClassifier(max_depth=9, random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)
ranforest_y_predict = predictionresult(ranforest, x_test, y_test) 

RandomForestClassifier(max_depth=9, n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.67126698 0.67196251 0.67467145 0.67628217 0.67158442] 

Mean accuracy: 
 0.6731535057613895

Coefficient of feature: 

Feature 0: 0.17911
Feature 1: 0.36724
Feature 2: 0.01338
Feature 3: 0.04094
Feature 4: 0.01569
Feature 5: 0.01333
Feature 6: 0.00413
Feature 7: 0.00092
Feature 8: 0.00425
Feature 9: 0.13688
Feature 10: 0.00529
Feature 11: 0.03308
Feature 12: 0.02948
Feature 13: 0.01475
Feature 14: 0.03413
Feature 15: 0.09151
Feature 16: 0.00128
Feature 17: 0.00102
Feature 18: 0.00144
Feature 19: 0.00097
Feature 20: 0.00103
Feature 21: 0.00078
Feature 22: 0.00310
Feature 23: 0.00125
Feature 24: 0.00093
Feature 25: 0.00129
Feature 26: 0.00109
Feature 27: 0.00172
Confusion_matrix: 
 [[ 6944  7958]
 [ 3192 16052]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.47      0.55     14902
           1       0.67      0.83      0.74     19

## XGBoost 

In [45]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.68572684 0.68429915 0.68810631 0.68953399 0.68520281] 

Mean accuracy: 
 0.6865738199861737

Coefficient of feature: 

Feature 0: 0.02391
Feature 1: 0.02619
Feature 2: 0.01533
Feature 3: 0.10807
Feature 4: 0.08634
Feature 5: 0.07641
Feature 6: 0.02733
Feature 7: 0.01246
Feature 8: 0.01849
Feature 9: 0.11794
Feature 10: 0.0

array([0.68572684, 0.68429915, 0.68810631, 0.68953399, 0.68520281])

In [46]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[ 8001  6901]
 [ 3662 15582]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.54      0.60     14902
           1       0.69      0.81      0.75     19244

    accuracy                           0.69     34146
   macro avg       0.69      0.67      0.67     34146
weighted avg       0.69      0.69      0.68     34146
 





In [48]:
# optimizing hyperparameter 
param = {'max_depth': list(range(1,10))}
gridsearch (xgmodel, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
Best Score : 0.6878477581345602
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eval_metric='mlogloss',
                                     gamma=0, gpu_id=-1, importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact',
                          

In [49]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1, max_depth = 4)
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.68664202 0.68730095 0.68828934 0.69096167 0.68604481] 

Mean accuracy: 
 0.6878477581345602

Coefficient of feature: 

Feature 0: 0.03512
Feature 1: 0.03663
Feature 2: 0.01710
Feature 3: 0.08387
Feature 4: 0.12428
Feature 5: 0.04718
Feature 6: 0.02220
Feature 7: 0.01111
Feature 8: 0.01455
Feature 9: 0.14623
Feature 10: 0.0

array([0.68664202, 0.68730095, 0.68828934, 0.69096167, 0.68604481])

In [50]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[ 7811  7091]
 [ 3515 15729]] 

Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.52      0.60     14902
           1       0.69      0.82      0.75     19244

    accuracy                           0.69     34146
   macro avg       0.69      0.67      0.67     34146
weighted avg       0.69      0.69      0.68     34146
 





In [51]:
log_score = performace(y_test, log_y_predict)
rf_score = performace(y_test, ranforest_y_predict)
xg_score = performace(y_test, xgmodel_y_predict)

models_scores_table = pd.DataFrame({'Logistic Regression': log_score, 'Random Forest Classifier': rf_score, 'XGBoost':xg_score},
                                    index=['Accuracy', 'Recall', 'Precision', 'F1 Score'])

models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

models_scores_table

Unnamed: 0,Logistic Regression,Random Forest Classifier,XGBoost,Best Score
Accuracy,0.655772,0.673461,0.689393,XGBoost
Recall,0.797495,0.83413,0.817346,Random Forest Classifier
Precision,0.661395,0.668555,0.689264,XGBoost
F1 Score,0.723096,0.74222,0.74786,XGBoost
