In [2]:
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

from joblib import dump, load

In [3]:
train_df = pd.read_pickle('data/KS_train_data.pkl')

train_df = train_df.set_index('id', drop = True)

test_df = pd.read_pickle('data/KS_test_data.pkl')
test_df = test_df.set_index('id', drop = True)

In [4]:
X_train = train_df.drop(columns = ['state', 'deadline', 'launched_at', 'converted_pledged_amount', 'spotlight', 'backers_count'])
y_train = train_df.state

X_test = test_df.drop(columns = ['state', 'deadline', 'launched_at', 'converted_pledged_amount', 'spotlight', 'backers_count'])
y_test = test_df.state

In [5]:
X_train.head(3)

Unnamed: 0_level_0,country,currency,disable_communication,goal,is_starrable,staff_pick,category_parent_name,blurb_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1607951133,3,1,False,10000.0,False,True,7,54
720792194,21,13,False,5000.0,False,False,10,133
618798036,8,4,False,5000.0,False,False,4,115


In [20]:
%%time

pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier())

pipe.fit(X_train, y_train)
y_train_preds = pipe.predict(X_train)
y_test_preds = pipe.predict(X_test)
print(classification_report(y_train, y_train_preds))
print(classification_report(y_test, y_test_preds))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88     36526
           1       0.92      0.89      0.91     49844

    accuracy                           0.89     86370
   macro avg       0.89      0.89      0.89     86370
weighted avg       0.89      0.89      0.89     86370

              precision    recall  f1-score   support

           0       0.56      0.59      0.58     36261
           1       0.69      0.67      0.68     50109

    accuracy                           0.64     86370
   macro avg       0.63      0.63      0.63     86370
weighted avg       0.64      0.64      0.64     86370

Wall time: 1.58 s


In [21]:
%%time

pipe = make_pipeline(StandardScaler(), RandomForestClassifier())

pipe.fit(X_train, y_train)
y_train_preds = pipe.predict(X_train)
y_test_preds = pipe.predict(X_test)
print(classification_report(y_train, y_train_preds))
print(classification_report(y_test, y_test_preds))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87     36526
           1       0.89      0.92      0.91     49844

    accuracy                           0.89     86370
   macro avg       0.89      0.89      0.89     86370
weighted avg       0.89      0.89      0.89     86370

              precision    recall  f1-score   support

           0       0.61      0.57      0.59     36261
           1       0.70      0.73      0.72     50109

    accuracy                           0.67     86370
   macro avg       0.66      0.65      0.65     86370
weighted avg       0.66      0.67      0.66     86370

Wall time: 29.9 s
