In [1]:
# Import library

import pandas as pd 
import numpy as np
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

## Below cell: read the cleaned up dataframe by Stephy and Sanjay

In [2]:
df = pd.read_csv('data\processed_data(by_quarter).csv')
df.drop(['Unnamed: 0', 'goal_usd'], axis = 1, inplace = True)
df.head()

Unnamed: 0,duration,blurb_length,name_length,usd_pledged,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,main_category_games,main_category_journalism,main_category_music,main_category_photography,main_category_publishing,main_category_technology,main_category_theater,start_Q_Q2,start_Q_Q3,start_Q_Q4
0,0.163043,0.382353,0.230769,0.000705,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.315217,0.676471,0.269231,0.000455,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.315217,0.588235,0.230769,0.000129,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.478261,0.411765,0.192308,0.000559,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641304,0.411765,0.115385,0.004696,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [5]:
df.shape

(170730, 23)

## Below cell: read the raw dataframe so that there's columns for Main and Sub category

In [36]:
df_cliff = pd.read_csv('data\Kickstarter_projects_Feb19.csv')

### Below cell: Dropping duplicates from raw dataframe so the rows from cleanup dataframe and raw dataframe match up

In [37]:
df_cliff.drop_duplicates(keep='first', inplace = True)

## Below cell:
### 1. create a new column called main_sub_category that is a merge of main_category and sub_category
### 2. OneHotEncode the "main_sub_category" column
### 3. Name the newly encoded columns (169 of them) by OHE.get_feature_names()
### 4. Merge the encoded columns with the cleanup dateframe from Stephy and Sanjay

In [40]:
# Making a new column from main_category and sub_category,  this way, we preserve the uniqueness of each sub_category, since 'Comedy' sub_category appears in 2 main_category(s)
df_cliff['main_sub_category'] = df_cliff['main_category'] + ' ' + df_cliff['sub_category']

from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder(sparse = False)   # add drop = 'first'  if you wanna drop first encoded column
x = OHE.fit_transform(df1[['main_sub_category']])

# Below, I am naming the new encoded columns using the OHE.get_feature_names() method since in the documentation example, the appearance of the encoded columns are sorted, and get_feature_names is also sorted, and the dimension matches up.

df_one_hot_cat = pd.DataFrame(x, columns = OHE.get_feature_names())
df_final = pd.concat([df,df_one_hot_cat], axis = 1)

In [41]:
df_final

Unnamed: 0,duration,blurb_length,name_length,usd_pledged,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,x0_technology Wearables,x0_technology Web,x0_theater Comedy,x0_theater Experimental,x0_theater Festivals,x0_theater Immersive,x0_theater Musical,x0_theater Plays,x0_theater Spaces,x0_theater Theater
0,0.163043,0.382353,0.230769,7.050565e-04,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.315217,0.676471,0.269231,4.553617e-04,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.315217,0.588235,0.230769,1.291227e-04,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.478261,0.411765,0.192308,5.591827e-04,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.641304,0.411765,0.115385,4.695878e-03,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170725,0.641304,0.588235,0.076923,1.424143e-07,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170726,0.500000,0.500000,0.038462,1.829517e-05,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170727,0.315217,0.352941,0.076923,0.000000e+00,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170728,0.641304,0.058824,0.076923,1.663473e-05,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df.columns

Index(['duration', 'blurb_length', 'name_length', 'usd_pledged', 'success',
       'US based', 'main_category_comics', 'main_category_crafts',
       'main_category_dance', 'main_category_design', 'main_category_fashion',
       'main_category_film & video', 'main_category_food',
       'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater', 'start_Q_Q2', 'start_Q_Q3', 'start_Q_Q4'],
      dtype='object')

## Below cell: Dropping the columns prefixed by "main_category_" so that there's only main_sub_category columns left for analysis

In [43]:
df_final = df_final.drop(['main_category_comics', 'main_category_crafts',
       'main_category_dance', 'main_category_design', 'main_category_fashion',
       'main_category_film & video', 'main_category_food',
       'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater'], axis = 1)

## Below cell: No changes

In [44]:
## Define function for performance result

# Function to print KFold Cross validation performance on train set 
def KFoldresult_5fold(model, x_train, y_train, is_logreg):
    accuracy = cross_val_score (model, x_train,y_train, cv=5)
    print (model)
    print (f'KFolds cross validation: \n {accuracy} \n')
    print (f'Mean accuracy: \n {accuracy.mean()}\n')
    print ('Coefficient of feature: \n' )
    if is_logreg:
        for index, co in enumerate(model.coef_[0]):
            print (f'Feature {index}: {co:.5f}')
    else:
        for index, co in enumerate(model.feature_importances_):
            print (f'Feature {index}: {co:.5f}')
    return accuracy

# Function to return prediction and print prediction result on test set 
def predictionresult(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print (f'Confusion_matrix: \n {confusion_matrix(y_test, y_pred)} \n')
    print (f'Classification report: \n {classification_report(y_test,y_pred)} \n')
    return y_pred

# Function to print out Grid Search parameters: 
def gridsearch(model, parameters, X_train, y_train):
    search = GridSearchCV(model, parameters, n_jobs=-1)
    search.fit(x_train,y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

def performace(y_ture, y_pred):
    return [accuracy_score(y_ture, y_pred), recall_score(y_ture, y_pred), precision_score(y_ture, y_pred), f1_score(y_ture, y_pred)]

## Below cell: splitting the "new" dataframe into train and test sets

In [47]:
# Split train set and test set 
y_sub = df_final['success']
x_sub = df_final.drop('success',axis=1)
x_train_sub, x_test_sub, y_train_sub, y_test_sub = train_test_split(x_sub,y_sub,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

In [48]:
# Split train set and test set 
y = df['success']
x = df.drop('success',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

## Logistic Regression (log)
#### -Develop model

## Below cell: Log Regression with main_sub_category

In [49]:
# Create model -log
log = LogisticRegression()
log.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(log, x_train_sub, y_train_sub, True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression()
KFolds cross validation: 
 [0.75535381 0.76146722 0.7553172  0.75820917 0.75973788] 

Mean accuracy: 
 0.7580170570551858

Coefficient of feature: 

Feature 0: -2.15312
Feature 1: -0.39134
Feature 2: 2.48987
Feature 3: 27.63620
Feature 4: 0.17335
Feature 5: -0.03469
Feature 6: -0.13815
Feature 7: -0.07063
Feature 8: 4.87572
Feature 9: -0.55883
Feature 10: -0.92711
Feature 11: -1.05971
Feature 12: 2.53458
Feature 13: -0.44769
Feature 14: -0.71395
Feature 15: -0.49055
Feature 16: -0.61347
Feature 17: 0.21606
Feature 18: -0.83355
Feature 19: -1.03108
Feature 20: -1.29527
Feature 21: 0.70158
Feature 22: 4.95603
Feature 23: 5.01829
Feature 24: -0.85168
Feature 25: 0.02257
Feature 26: -0.08616
Feature 27: -2.16046
Feature 28: 4.84460
Feature 29: -1.46840
Feature 30: -1.77637
Feature 31: -1.54220
Feature 32: -1.09518
Feature 33: -0.43270
Feature 34: -0.24174
Feature 35: -1.67983
Feature 36: -2.47096
Feature 37: -0.97163
Feature 38: -0.61173
Feature 39: -1.03584
Feature 40

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.75535381, 0.76146722, 0.7553172 , 0.75820917, 0.75973788])

## Below cell: Log Regression with main_categories only

In [12]:
# Create model -log
log = LogisticRegression()
log.fit (x_train,y_train)

KFoldresult_5fold(log, x_train, y_train, True)

LogisticRegression()
KFolds cross validation: 
 [0.66830179 0.67141341 0.67459824 0.6784786  0.6715112 ] 

Mean accuracy: 
 0.6728606472980359

Coefficient of feature: 

Feature 0: -2.39444
Feature 1: -0.67378
Feature 2: 2.93481
Feature 3: 43.04130
Feature 4: 0.10109
Feature 5: 1.12956
Feature 6: -0.56457
Feature 7: 1.15761
Feature 8: 0.11070
Feature 9: -0.14227
Feature 10: 0.17703
Feature 11: -1.10396
Feature 12: 0.07818
Feature 13: -1.04814
Feature 14: 0.44949
Feature 15: -0.53562
Feature 16: 0.54856
Feature 17: -0.98039
Feature 18: 0.30545
Feature 19: -0.02547
Feature 20: -0.12572
Feature 21: -0.00833


array([0.66830179, 0.67141341, 0.67459824, 0.6784786 , 0.6715112 ])

## Below cell: Results with sub_category

In [50]:
log_y_predict_sub = predictionresult(log, x_test_sub, y_test_sub)

Confusion_matrix: 
 [[11671  3231]
 [ 4934 14310]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.70      0.78      0.74     14902
         1.0       0.82      0.74      0.78     19244

    accuracy                           0.76     34146
   macro avg       0.76      0.76      0.76     34146
weighted avg       0.77      0.76      0.76     34146
 



## Below cell: Results with main_category only

In [6]:
log_y_predict = predictionresult(log, x_test, y_test)

Confusion_matrix: 
 [[ 7652  7250]
 [ 3792 15452]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.67      0.51      0.58     14902
         1.0       0.68      0.80      0.74     19244

    accuracy                           0.68     34146
   macro avg       0.67      0.66      0.66     34146
weighted avg       0.68      0.68      0.67     34146
 



#### -Optimizing hyperparameters

## Below cell: optimizing the regressor trained by the sub_category dataframe

In [51]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train_sub, y_train_sub)

Parameter tested: {'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'penalty': ['l1', 'l2']}
Best Score : 0.6728606472980359
Best parameters: {'C': 1.0, 'penalty': 'l2'}


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'penalty': ['l1', 'l2']})

## Below cell: no changes

In [7]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train, y_train)

Parameter tested: {'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'penalty': ['l1', 'l2']}
Best Score : 0.6728606472980359
Best parameters: {'C': 1.0, 'penalty': 'l2'}


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'penalty': ['l1', 'l2']})

#### Apply the best parameters {'C': 1.0, 'penalty': 'l2'} 
 - best parameters is same as default parameters for model 'log'

## Random Forest Classifier (ranforest)
#### -Develop model 

## Below cell: Random Forest with the sub_categories

In [52]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(ranforest, x_train_sub, y_train_sub, False)

RandomForestClassifier(n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.90079438 0.90225867 0.90233188 0.89852473 0.90133987] 

Mean accuracy: 
 0.9010499054791821

Coefficient of feature: 

Feature 0: 0.05250
Feature 1: 0.05095
Feature 2: 0.04344
Feature 3: 0.57038
Feature 4: 0.00881
Feature 5: 0.00778
Feature 6: 0.00764
Feature 7: 0.00751
Feature 8: 0.00304
Feature 9: 0.00043
Feature 10: 0.00107
Feature 11: 0.00126
Feature 12: 0.00844
Feature 13: 0.00065
Feature 14: 0.00178
Feature 15: 0.00171
Feature 16: 0.00152
Feature 17: 0.00159
Feature 18: 0.00148
Feature 19: 0.00044
Feature 20: 0.00022
Feature 21: 0.00059
Feature 22: 0.00745
Feature 23: 0.00094
Feature 24: 0.00026
Feature 25: 0.00152
Feature 26: 0.00083
Feature 27: 0.00036
Feature 28: 0.00518
Feature 29: 0.00026
Feature 30: 0.00123
Feature 31: 0.00019
Feature 32: 0.00017
Feature 33: 0.00024
Feature 34: 0.00012
Feature 35: 0.00028
Feature 36: 0.00012
Feature 37: 0.00036
Feature 38: 0.00001
Feature 39: 0.00012
Feature 40

array([0.90079438, 0.90225867, 0.90233188, 0.89852473, 0.90133987])

## Below cell: Random Forest with the main_categories

In [13]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)

RandomForestClassifier(n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.8791961  0.87886664 0.87974521 0.87674342 0.87897203] 

Mean accuracy: 
 0.8787046818613394

Coefficient of feature: 

Feature 0: 0.07421
Feature 1: 0.07898
Feature 2: 0.05877
Feature 3: 0.68954
Feature 4: 0.00834
Feature 5: 0.00399
Feature 6: 0.00226
Feature 7: 0.00191
Feature 8: 0.00225
Feature 9: 0.00281
Feature 10: 0.00382
Feature 11: 0.01615
Feature 12: 0.00319
Feature 13: 0.00291
Feature 14: 0.00461
Feature 15: 0.00245
Feature 16: 0.00452
Feature 17: 0.01819
Feature 18: 0.00193
Feature 19: 0.00654
Feature 20: 0.00630
Feature 21: 0.00634


array([0.8791961 , 0.87886664, 0.87974521, 0.87674342, 0.87897203])

## Below cell: Random Forest result with sub_categories

In [53]:
ranforest_y_predict = predictionresult(ranforest, x_test_sub, y_test_sub)

Confusion_matrix: 
 [[12896  2006]
 [ 1355 17889]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.90      0.87      0.88     14902
         1.0       0.90      0.93      0.91     19244

    accuracy                           0.90     34146
   macro avg       0.90      0.90      0.90     34146
weighted avg       0.90      0.90      0.90     34146
 



## Below cell: Random Forest result with main_categories

In [14]:
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

Confusion_matrix: 
 [[12331  2571]
 [ 1600 17644]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.89      0.83      0.86     14902
         1.0       0.87      0.92      0.89     19244

    accuracy                           0.88     34146
   macro avg       0.88      0.87      0.87     34146
weighted avg       0.88      0.88      0.88     34146
 



#### -Optimizing Hyperparameters

## Below cell: Random Forest optimizing with sub_categories

In [54]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train_sub, y_train_sub)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.8648231155969773
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1, param_grid={'max_depth': [1, 2, 3, 4]})

## Below cell: Random Forest optimizing with main_categories

In [15]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.8648231155969773
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1, param_grid={'max_depth': [1, 2, 3, 4]})

#### Apply the best parameters {'max_depth': 4}

## Below cell: Applying to sub_categories

In [55]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(ranforest, x_train_sub, y_train_sub, False)
ranforest_y_predict = predictionresult(ranforest, x_test_sub, y_test_sub)

RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.8680675  0.87019072 0.86740857 0.86455321 0.86971006] 

Mean accuracy: 
 0.8679860139111911

Coefficient of feature: 

Feature 0: 0.07940
Feature 1: 0.00140
Feature 2: 0.05670
Feature 3: 0.31137
Feature 4: 0.00017
Feature 5: 0.00006
Feature 6: 0.00013
Feature 7: 0.00007
Feature 8: 0.00272
Feature 9: 0.00000
Feature 10: 0.00000
Feature 11: 0.00003
Feature 12: 0.01810
Feature 13: 0.00000
Feature 14: 0.00000
Feature 15: 0.00000
Feature 16: 0.00000
Feature 17: 0.00024
Feature 18: 0.00000
Feature 19: 0.00000
Feature 20: 0.00000
Feature 21: 0.00000
Feature 22: 0.03006
Feature 23: 0.00020
Feature 24: 0.00000
Feature 25: 0.00000
Feature 26: 0.00000
Feature 27: 0.00000
Feature 28: 0.00461
Feature 29: 0.00000
Feature 30: 0.00117
Feature 31: 0.00000
Feature 32: 0.00000
Feature 33: 0.00000
Feature 34: 0.00000
Feature 35: 0.00000
Feature 36: 0.00000
Feature 37: 0.00000
Feature 38: 0.00000
Feature 39: 0.000

## Below cell: Applying to main_categories

In [16]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.86660321 0.86685946 0.86462642 0.86078266 0.86524381] 

Mean accuracy: 
 0.8648231155969773

Coefficient of feature: 

Feature 0: 0.06871
Feature 1: 0.00287
Feature 2: 0.04939
Feature 3: 0.68332
Feature 4: 0.00169
Feature 5: 0.01605
Feature 6: 0.00208
Feature 7: 0.00658
Feature 8: 0.00008
Feature 9: 0.00001
Feature 10: 0.00065
Feature 11: 0.07265
Feature 12: 0.00034
Feature 13: 0.00667
Feature 14: 0.01246
Feature 15: 0.00147
Feature 16: 0.01278
Feature 17: 0.06159
Feature 18: 0.00023
Feature 19: 0.00007
Feature 20: 0.00024
Feature 21: 0.00009
Confusion_matrix: 
 [[11446  3456]
 [ 1148 18096]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.91      0.77      0.83     14902
         1.0       0.84      0.94      0.89     19244

    accuracy                           0.87     34146
   macro avg       0.87      0.85      0.86     34146
weighte

## XG Boost
#### -Develop model

## XG Boost with sub_categories

In [56]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(xgmodel, x_train_sub, y_train_sub, False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.90419885 0.90507742 0.90357653 0.90112384 0.90415873] 

Mean accuracy: 
 0.9036270758775473

Coefficient of feature: 

Feature 0: 0.00569
Feature 1: 0.00062
Feature 2: 0.00056
Feature 3: 0.04737
Feature 4: 0.00120
Feature 5: 0.00092
Feature 6: 0.00074
Feature 7: 0.00077
Feature 8: 0.01387
Feature 9: 0.00000
Feature 10: 0.0

array([0.90419885, 0.90507742, 0.90357653, 0.90112384, 0.90415873])

## XG Boost with main_categories

In [18]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.8852363  0.88417469 0.8852363  0.88358897 0.88431688] 

Mean accuracy: 
 0.8845106294020857

Coefficient of feature: 

Feature 0: 0.03108
Feature 1: 0.00756
Feature 2: 0.00699
Feature 3: 0.26823
Feature 4: 0.01556
Feature 5: 0.03867
Feature 6: 0.01115
Feature 7: 0.04227
Feature 8: 0.01742
Feature 9: 0.01967
Feature 10: 0.0

array([0.8852363 , 0.88417469, 0.8852363 , 0.88358897, 0.88431688])

## Below cell: XG Boost results with sub_categories

In [57]:
xgmodel_y_predict = predictionresult(xgmodel, x_test_sub, y_test_sub)

Confusion_matrix: 
 [[12852  2050]
 [ 1288 17956]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.91      0.86      0.89     14902
         1.0       0.90      0.93      0.91     19244

    accuracy                           0.90     34146
   macro avg       0.90      0.90      0.90     34146
weighted avg       0.90      0.90      0.90     34146
 



## Below cell: XG Boost results with main_categories

In [19]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[12297  2605]
 [ 1334 17910]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.90      0.83      0.86     14902
         1.0       0.87      0.93      0.90     19244

    accuracy                           0.88     34146
   macro avg       0.89      0.88      0.88     34146
weighted avg       0.89      0.88      0.88     34146
 



#### -Optimizing Hyperparameters

## Optimizing with sub_categories (did not run, took took long)

In [None]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train_sub, y_train_sub)

## Optimizaing with main_categories

In [20]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.8846936661427026
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eval_metric='mlogloss',
                                     gamma=0, gpu_id=-1, importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact',
                          

#### Apply the best parameters {'max_depth': 4}

In [21]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1, max_depth = 4)
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
KFolds cross validation: 
 [0.88611487 0.88530951 0.88501666 0.88263719 0.8843901 ] 

Mean accuracy: 
 0.8846936661427026

Coefficient of feature: 

Feature 0: 0.04288
Feature 1: 0.00569
Feature 2: 0.00476
Feature 3: 0.36035
Feature 4: 0.01319
Feature 5: 0.04188
Feature 6: 0.01294
Feature 7: 0.03399
Feature 8: 0.00997
Feature 9: 0.01678
Feature 10: 0.0

array([0.88611487, 0.88530951, 0.88501666, 0.88263719, 0.8843901 ])

## Below cell: Prediction results with sub_categories

In [None]:
xgmodel_y_predict = predictionresult(xgmodel, x_test_sub, y_test_sub)

## Below cell: Prediction results with main_categories

In [22]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[12231  2671]
 [ 1298 17946]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.90      0.82      0.86     14902
         1.0       0.87      0.93      0.90     19244

    accuracy                           0.88     34146
   macro avg       0.89      0.88      0.88     34146
weighted avg       0.89      0.88      0.88     34146
 



## did not touch the following codes

In [24]:
log_score = performace(y_test, log_y_predict)
rf_score = performace(y_test, ranforest_y_predict)
xg_score = performace(y_test, xgmodel_y_predict)

models_scores_table = pd.DataFrame({'Logistic Regression': log_score, 'Random Forest Classifier': rf_score, 'XGBoost':xg_score},
                                    index=['Accuracy', 'Recall', 'Precision', 'F1 Score'])

models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

models_scores_table

Unnamed: 0,Logistic Regression,Random Forest Classifier,XGBoost,Best Score
Accuracy,0.676624,0.865167,0.883764,XGBoost
Recall,0.802952,0.940345,0.93255,Random Forest Classifier
Precision,0.680645,0.839644,0.870447,XGBoost
F1 Score,0.736757,0.887146,0.900429,XGBoost
