In [1]:
# Import library

import pandas as pd 
import numpy as np
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data\processed_data(by_quarter).csv')
df.drop(['Unnamed: 0', 'goal_usd'], axis = 1, inplace = True)
df.head()

Unnamed: 0,duration,blurb_length,name_length,usd_pledged,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,main_category_games,main_category_journalism,main_category_music,main_category_photography,main_category_publishing,main_category_technology,main_category_theater,start_Q_Q2,start_Q_Q3,start_Q_Q4
0,0.163043,0.382353,0.230769,0.000705,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.315217,0.676471,0.269231,0.000455,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.315217,0.588235,0.230769,0.000129,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.478261,0.411765,0.192308,0.000559,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641304,0.411765,0.115385,0.004696,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [11]:
## Define function for performance result

# Function to print KFold Cross validation performance on train set 
def KFoldresult_5fold(model, x_train, y_train, is_logreg):
    accuracy = cross_val_score (model, x_train,y_train, cv=5)
    print (model)
    print (f'KFolds cross validation: \n {accuracy} \n')
    print (f'Mean accuracy: \n {accuracy.mean()}\n')
    print ('Coefficient of feature: \n' )
    if is_logreg:
        for index, co in enumerate(model.coef_[0]):
            print (f'Feature {index}: {co:.5f}')
    else:
        for index, co in enumerate(model.feature_importances_):
            print (f'Feature {index}: {co:.5f}')
    return accuracy

# Function to return prediction and print prediction result on test set 
def predictionresult(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print (f'Confusion_matrix: \n {confusion_matrix(y_test, y_pred)} \n')
    print (f'Classification report: \n {classification_report(y_test,y_pred)} \n')
    return y_pred

# Function to print out Grid Search parameters: 
def gridsearch(model, parameters, X_train, y_train):
    search = GridSearchCV(model, parameters, n_jobs=-1)
    search.fit(x_train,y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

def performace(y_ture, y_pred):
    return [accuracy_score(y_ture, y_pred), recall_score(y_ture, y_pred), precision_score(y_ture, y_pred), f1_score(y_ture, y_pred)]

In [4]:
# Split train set and test set 
y = df['success']
x = df.drop('success',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

## Logistic Regression (log)
#### -Develop model

In [12]:
# Create model -log
log = LogisticRegression()
log.fit (x_train,y_train)

KFoldresult_5fold(log, x_train, y_train, True)

LogisticRegression()
KFolds cross validation: 
 [0.66830179 0.67141341 0.67459824 0.6784786  0.6715112 ] 

Mean accuracy: 
 0.6728606472980359

Coefficient of feature: 

Feature 0: -2.39444
Feature 1: -0.67378
Feature 2: 2.93481
Feature 3: 43.04130
Feature 4: 0.10109
Feature 5: 1.12956
Feature 6: -0.56457
Feature 7: 1.15761
Feature 8: 0.11070
Feature 9: -0.14227
Feature 10: 0.17703
Feature 11: -1.10396
Feature 12: 0.07818
Feature 13: -1.04814
Feature 14: 0.44949
Feature 15: -0.53562
Feature 16: 0.54856
Feature 17: -0.98039
Feature 18: 0.30545
Feature 19: -0.02547
Feature 20: -0.12572
Feature 21: -0.00833


array([0.66830179, 0.67141341, 0.67459824, 0.6784786 , 0.6715112 ])

In [6]:
log_y_predict = predictionresult(log, x_test, y_test)

Confusion_matrix: 
 [[ 7652  7250]
 [ 3792 15452]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.67      0.51      0.58     14902
         1.0       0.68      0.80      0.74     19244

    accuracy                           0.68     34146
   macro avg       0.67      0.66      0.66     34146
weighted avg       0.68      0.68      0.67     34146
 



#### -Optimizing hyperparameters

In [7]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train, y_train)

Parameter tested: {'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'penalty': ['l1', 'l2']}
Best Score : 0.6728606472980359
Best parameters: {'C': 1.0, 'penalty': 'l2'}


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'penalty': ['l1', 'l2']})

#### Apply the best parameters {'C': 1.0, 'penalty': 'l2'} 
 - best parameters is same as default parameters for model 'log'

## Random Forest Classifier (ranforest)
#### -Develop model 

In [13]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)

RandomForestClassifier(n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.8791961  0.87886664 0.87974521 0.87674342 0.87897203] 

Mean accuracy: 
 0.8787046818613394

Coefficient of feature: 

Feature 0: 0.07421
Feature 1: 0.07898
Feature 2: 0.05877
Feature 3: 0.68954
Feature 4: 0.00834
Feature 5: 0.00399
Feature 6: 0.00226
Feature 7: 0.00191
Feature 8: 0.00225
Feature 9: 0.00281
Feature 10: 0.00382
Feature 11: 0.01615
Feature 12: 0.00319
Feature 13: 0.00291
Feature 14: 0.00461
Feature 15: 0.00245
Feature 16: 0.00452
Feature 17: 0.01819
Feature 18: 0.00193
Feature 19: 0.00654
Feature 20: 0.00630
Feature 21: 0.00634


array([0.8791961 , 0.87886664, 0.87974521, 0.87674342, 0.87897203])

In [14]:
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

Confusion_matrix: 
 [[12331  2571]
 [ 1600 17644]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.89      0.83      0.86     14902
         1.0       0.87      0.92      0.89     19244

    accuracy                           0.88     34146
   macro avg       0.88      0.87      0.87     34146
weighted avg       0.88      0.88      0.88     34146
 



#### -Optimizing Hyperparameters

In [15]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.8648231155969773
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1, param_grid={'max_depth': [1, 2, 3, 4]})

#### Apply the best parameters {'max_depth': 4}

In [16]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.86660321 0.86685946 0.86462642 0.86078266 0.86524381] 

Mean accuracy: 
 0.8648231155969773

Coefficient of feature: 

Feature 0: 0.06871
Feature 1: 0.00287
Feature 2: 0.04939
Feature 3: 0.68332
Feature 4: 0.00169
Feature 5: 0.01605
Feature 6: 0.00208
Feature 7: 0.00658
Feature 8: 0.00008
Feature 9: 0.00001
Feature 10: 0.00065
Feature 11: 0.07265
Feature 12: 0.00034
Feature 13: 0.00667
Feature 14: 0.01246
Feature 15: 0.00147
Feature 16: 0.01278
Feature 17: 0.06159
Feature 18: 0.00023
Feature 19: 0.00007
Feature 20: 0.00024
Feature 21: 0.00009
Confusion_matrix: 
 [[11446  3456]
 [ 1148 18096]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.91      0.77      0.83     14902
         1.0       0.84      0.94      0.89     19244

    accuracy                           0.87     34146
   macro avg       0.87      0.85      0.86     34146
weighte

## XG Boost
#### -Develop model

In [18]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.8852363  0.88417469 0.8852363  0.88358897 0.88431688] 

Mean accuracy: 
 0.8845106294020857

Coefficient of feature: 

Feature 0: 0.03108
Feature 1: 0.00756
Feature 2: 0.00699
Feature 3: 0.26823
Feature 4: 0.01556
Feature 5: 0.03867
Feature 6: 0.01115
Feature 7: 0.04227
Feature 8: 0.01742
Feature 9: 0.01967
Feature 10: 0.0

array([0.8852363 , 0.88417469, 0.8852363 , 0.88358897, 0.88431688])

In [19]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[12297  2605]
 [ 1334 17910]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.90      0.83      0.86     14902
         1.0       0.87      0.93      0.90     19244

    accuracy                           0.88     34146
   macro avg       0.89      0.88      0.88     34146
weighted avg       0.89      0.88      0.88     34146
 



#### -Optimizing Hyperparameters

In [20]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.8846936661427026
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eval_metric='mlogloss',
                                     gamma=0, gpu_id=-1, importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact',
                          

#### Apply the best parameters {'max_depth': 4}

In [21]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1, max_depth = 4)
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.88611487 0.88530951 0.88501666 0.88263719 0.8843901 ] 

Mean accuracy: 
 0.8846936661427026

Coefficient of feature: 

Feature 0: 0.04288
Feature 1: 0.00569
Feature 2: 0.00476
Feature 3: 0.36035
Feature 4: 0.01319
Feature 5: 0.04188
Feature 6: 0.01294
Feature 7: 0.03399
Feature 8: 0.00997
Feature 9: 0.01678
Feature 10: 0.0

array([0.88611487, 0.88530951, 0.88501666, 0.88263719, 0.8843901 ])

In [22]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[12231  2671]
 [ 1298 17946]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.90      0.82      0.86     14902
         1.0       0.87      0.93      0.90     19244

    accuracy                           0.88     34146
   macro avg       0.89      0.88      0.88     34146
weighted avg       0.89      0.88      0.88     34146
 



In [24]:
log_score = performace(y_test, log_y_predict)
rf_score = performace(y_test, ranforest_y_predict)
xg_score = performace(y_test, xgmodel_y_predict)

models_scores_table = pd.DataFrame({'Logistic Regression': log_score, 'Random Forest Classifier': rf_score, 'XGBoost':xg_score},
                                    index=['Accuracy', 'Recall', 'Precision', 'F1 Score'])

models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

models_scores_table

Unnamed: 0,Logistic Regression,Random Forest Classifier,XGBoost,Best Score
Accuracy,0.676624,0.865167,0.883764,XGBoost
Recall,0.802952,0.940345,0.93255,Random Forest Classifier
Precision,0.680645,0.839644,0.870447,XGBoost
F1 Score,0.736757,0.887146,0.900429,XGBoost
