# First model

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from pandas / scikit-learn — consider narrowing it.
warnings.filterwarnings(action="ignore")

# Seed reused wherever a random_state is passed below
RSEED = 42

In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index
df = pd.read_csv("data/data_clean_outlier.csv", index_col=0)
df.head(5)

Unnamed: 0,backers_count,country,disable_communication,id,is_starrable,name,staff_pick,state,usd_pledged,usd_type,...,phase_before,main_category,subcategory,city,region,city_type,creator_id,creator_num_projects,words_blurb,goal_usd
0,21,US,False,287514992,False,New Final Round Album,0,1,802.0,international,...,4,Music,Rock,Chicago,IL,Town,1495925645,1,26,200.0
1,97,US,False,385129759,False,Princess Pals Enamel Pin Series,0,1,2259.0,international,...,4,Art,Mixed Media,Sacramento,CA,Town,1175589980,4,9,400.0
2,88,US,False,681033598,False,Their Life Through Their Lens-the Amish and Me...,1,1,29638.0,international,...,8,Photography,Photobooks,Columbus,OH,Town,1196856269,2,25,27224.0
3,193,IT,False,1031782682,False,WAO: THE ECO EFFECT SHOES,0,1,49075.15252,international,...,3,Fashion,Footwear,Venice,Veneto,Town,1569700626,2,13,45461.0028
4,20,US,False,904085819,False,Apple Watch Development Course,0,0,549.0,domestic,...,2,Technology,Software,Redmond,WA,Town,1870845385,1,22,1000.0


In [3]:
# Print each column's non-null count and dtype to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184367 entries, 0 to 209221
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   backers_count          184367 non-null  int64  
 1   country                184367 non-null  object 
 2   disable_communication  184367 non-null  bool   
 3   id                     184367 non-null  int64  
 4   is_starrable           184367 non-null  bool   
 5   name                   184367 non-null  object 
 6   staff_pick             184367 non-null  int64  
 7   state                  184367 non-null  int64  
 8   usd_pledged            184367 non-null  float64
 9   usd_type               184330 non-null  object 
 10  yr_launched            184367 non-null  int64  
 11  mo_launched            184367 non-null  int64  
 12  wd_launched            184367 non-null  int64  
 13  yr_deadline            184367 non-null  int64  
 14  mo_deadline            184367 non-nu

In [4]:
# List the distinct main categories (NaN shows up as a value), then preview
# a few rows where the main category is missing
print(df["main_category"].unique())
df.loc[df["main_category"].isna()].head()

['Music' 'Art' 'Photography' 'Fashion' 'Technology' 'Publishing' 'Games'
 'Food' 'Theater' nan 'Dance' 'Journalism' 'Film & Video' 'Comics'
 'Crafts' 'Design']


Unnamed: 0,backers_count,country,disable_communication,id,is_starrable,name,staff_pick,state,usd_pledged,usd_type,...,phase_before,main_category,subcategory,city,region,city_type,creator_id,creator_num_projects,words_blurb,goal_usd
31,53,GB,False,351413763,False,A Girl in School Uniform (Walks Into A Bar),0,1,1532.52672,international,...,0,,Theater,Leeds,England,Town,423881543,1,10,1238.40543
51,83,AT,False,1632402245,False,Herr Michl - the fastest camera bag ever,0,1,11834.75366,international,...,9,,Photography,Vienna,Vienna,Town,2083385385,3,17,5643.65935
76,10,US,False,1469717703,False,Savor the Seasons: a Michigan cookbook,0,1,779.0,international,...,0,,Photography,Ferndale,MI,Town,1120504471,1,16,550.0
85,26,GB,False,94326304,False,Match funding for workshop of Evan Placey’s Mo...,0,1,927.279344,international,...,15,,Theater,London,England,Town,884315621,1,18,925.956549
95,328,US,False,1998657797,False,Lay under the best Piano,0,1,30055.0,international,...,0,,Music,New York,NY,Town,609549911,2,20,30000.0


In [5]:
# Rows without a main_category fall back to the value in their subcategory column.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form acts
# on an intermediate object, emits a FutureWarning in pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is enabled.
df["main_category"] = df["main_category"].fillna(df["subcategory"])

In [6]:
# Re-check the non-null counts after filling main_category from subcategory
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184367 entries, 0 to 209221
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   backers_count          184367 non-null  int64  
 1   country                184367 non-null  object 
 2   disable_communication  184367 non-null  bool   
 3   id                     184367 non-null  int64  
 4   is_starrable           184367 non-null  bool   
 5   name                   184367 non-null  object 
 6   staff_pick             184367 non-null  int64  
 7   state                  184367 non-null  int64  
 8   usd_pledged            184367 non-null  float64
 9   usd_type               184330 non-null  object 
 10  yr_launched            184367 non-null  int64  
 11  mo_launched            184367 non-null  int64  
 12  wd_launched            184367 non-null  int64  
 13  yr_deadline            184367 non-null  int64  
 14  mo_deadline            184367 non-nu

In [7]:
# Drop the project id, the project name and the three deadline date-part
# columns in place, then show which columns remain
df.drop(
    columns=["id", "name", "yr_deadline", "mo_deadline", "wd_deadline"],
    inplace=True,
)
df.columns

Index(['backers_count', 'country', 'disable_communication', 'is_starrable',
       'staff_pick', 'state', 'usd_pledged', 'usd_type', 'yr_launched',
       'mo_launched', 'wd_launched', 'founding_phase', 'phase_before',
       'main_category', 'subcategory', 'city', 'region', 'city_type',
       'creator_id', 'creator_num_projects', 'words_blurb', 'goal_usd'],
      dtype='object')

In [8]:
# Inspect the reduced frame (column count, non-null counts, dtypes) after the drop
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184367 entries, 0 to 209221
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   backers_count          184367 non-null  int64  
 1   country                184367 non-null  object 
 2   disable_communication  184367 non-null  bool   
 3   is_starrable           184367 non-null  bool   
 4   staff_pick             184367 non-null  int64  
 5   state                  184367 non-null  int64  
 6   usd_pledged            184367 non-null  float64
 7   usd_type               184330 non-null  object 
 8   yr_launched            184367 non-null  int64  
 9   mo_launched            184367 non-null  int64  
 10  wd_launched            184367 non-null  int64  
 11  founding_phase         184367 non-null  int64  
 12  phase_before           184367 non-null  int64  
 13  main_category          184367 non-null  object 
 14  subcategory            184367 non-nu

In [9]:
# Remove, in place, every row that still has a missing value in any column,
# then print the summary again to confirm the non-null counts line up
df.dropna(axis=0, how="any", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184115 entries, 0 to 209221
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   backers_count          184115 non-null  int64  
 1   country                184115 non-null  object 
 2   disable_communication  184115 non-null  bool   
 3   is_starrable           184115 non-null  bool   
 4   staff_pick             184115 non-null  int64  
 5   state                  184115 non-null  int64  
 6   usd_pledged            184115 non-null  float64
 7   usd_type               184115 non-null  object 
 8   yr_launched            184115 non-null  int64  
 9   mo_launched            184115 non-null  int64  
 10  wd_launched            184115 non-null  int64  
 11  founding_phase         184115 non-null  int64  
 12  phase_before           184115 non-null  int64  
 13  main_category          184115 non-null  object 
 14  subcategory            184115 non-nu

In [10]:
# Split the column names by dtype: object columns go to the categorical list,
# every other dtype (int, float, bool) goes to the numeric list.
# NOTE(review): creator_id looks like a numeric identifier in the preview, so
# it presumably ends up in num_features together with the bool flags —
# confirm that treating it as a scaled number is intended.
is_object_col = df.dtypes == object
cat_features = df.columns[is_object_col].tolist()
num_features = df.columns[~is_object_col].tolist()

# The target and the two outcome columns are dropped from X, so they must not
# be listed as features either
for non_feature in ("usd_pledged", "backers_count", "state"):
    num_features.remove(non_feature)

In [11]:
# Separate the target ("state") from the predictors; the pledged amount and
# the backer count are removed from the predictors as well
y = df["state"]
X = df.drop(columns=["usd_pledged", "backers_count", "state"])

print(f"We have {len(X)} observations in our dataset and {len(X.columns)} features")
print(f"Our target vector has also {len(y)} values")

We have 184115 observations in our dataset and 19 features
Our target vector has also 184115 values


In [12]:
# Hold out 20% of the rows for testing; the split is seeded for repeatability.
# NOTE(review): the confusion matrices recorded further down total 18,411 and
# 165,704 rows, which does not match an 80/20 split of 184,115 rows — the
# saved outputs look stale and the notebook should be re-run top to bottom.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RSEED,
)

In [13]:
# Numeric branch: fill gaps with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ("imputer_num", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: replace gaps with the literal "missing", then one-hot
# encode; categories not seen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ("imputer_cat", SimpleImputer(strategy="constant", fill_value="missing")),
    ("1hot", OneHotEncoder(handle_unknown="ignore")),
])

In [14]:
# Route each feature list through its matching sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [15]:
# Baseline model: preprocessing followed by logistic regression
# (iteration cap raised to 1000)
pipe_logreg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)

In [16]:
# Out-of-fold predictions for the training rows from 5-fold cross-validation
y_train_predicted = cross_val_predict(
    estimator=pipe_logreg,
    X=X_train,
    y=y_train,
    cv=5,
)

In [17]:
# Report accuracy, recall and precision of the cross-validated predictions
print("Cross validation scores:")
print("-------------------------")
print(f"Accuracy: {accuracy_score(y_train, y_train_predicted):.2f}")
print(f"Recall: {recall_score(y_train, y_train_predicted):.2f}")
print(f"Precision: {precision_score(y_train, y_train_predicted):.2f}")

Cross validation scores:
-------------------------
Accuracy: 0.81
Recall: 0.83
Precision: 0.86


In [18]:
# Confusion matrix of the cross-validated training predictions
# (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_train, y_pred=y_train_predicted)
cm

array([[5483, 1567],
       [1892, 9469]])

In [19]:
# Fit the pipeline on the full training split (fit returns the pipeline
# itself) and predict the held-out test rows
y_pred = pipe_logreg.fit(X_train, y_train).predict(X_test)

In [20]:
# Report accuracy, recall and precision of the fitted LogisticRegression
# pipeline on the held-out test split. These are computed from y_test / y_pred,
# not from cross-validation, so the heading says so.
print('Test set scores:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Recall: {:.2f}".format(recall_score(y_test, y_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, y_pred)))

Cross validation scores:
-------------------------
Accuracy: 0.82
Recall: 0.84
Precision: 0.86


In [21]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class)
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm1

array([[50463, 13859],
       [16093, 85289]])

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [51]:
# Candidate classifiers to benchmark on the preprocessed features.
# `model_name` holds the display label for each estimator, matched by position.
models = [DecisionTreeClassifier(max_depth=6, random_state=RSEED),
          RandomForestClassifier(n_estimators=100, max_depth=6, random_state=RSEED, n_jobs=-1),
          ExtraTreesClassifier(n_estimators=100, random_state=RSEED, n_jobs=-1),
          AdaBoostClassifier(random_state=RSEED),
          xgb.XGBClassifier(random_state=RSEED),  # seeded like the other tree ensembles
          RidgeClassifier(),
          KNeighborsClassifier(n_jobs=-1),
          LogisticRegression(max_iter=1000)]

# NOTE(review): "RidgeRegression" labels the RidgeClassifier entry; the label is
# left unchanged so existing references to df_scores keep working.
model_name = ["DecisionTree", "RandomForest", "ExtraTrees", "AdaBoost", "XGBoost", "RidgeRegression", "KNN", "LogisticRegression"]

k = len(models)
# Labels and estimators are paired by position only, so fail early if they drift apart
assert len(model_name) == k, "model_name and models must have the same length"

# Per-model scores, appended in the same order as `models`
accuracy = []
recall = []
precision = []
accuracy_test = []
recall_test = []
precision_test = []

# Fit the preprocessing on the training split only, then reuse it for the test split
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

for model in models:
    model.fit(X_train_prep, y_train)
    # Dedicated names for the loop's predictions: reusing `y_pred` here would
    # overwrite the test predictions of the logistic-regression pipeline that
    # earlier cells still read.
    y_pred_train = model.predict(X_train_prep)
    y_pred_test = model.predict(X_test_prep)
    accuracy.append(accuracy_score(y_train, y_pred_train))
    recall.append(recall_score(y_train, y_pred_train))
    precision.append(precision_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred_test))
    recall_test.append(recall_score(y_test, y_pred_test))
    precision_test.append(precision_score(y_test, y_pred_test))

# One row per model; the column keys (including the "precission" spelling) are
# kept as they were so downstream lookups do not break
df_scores = pd.DataFrame({"name": model_name,
                          "accuracy_train": accuracy,
                          "recall_train": recall,
                          "precission_train": precision,
                          "accuracy_test": accuracy_test,
                          "recall_test": recall_test,
                          "precission_test": precision_test,
                          })


In [52]:
# Rank the models by test accuracy, best first (at most 20 rows shown)
df_scores.sort_values(by="accuracy_test", ascending=False).head(n=20)

Unnamed: 0,name,accuracy_train,recall_train,precission_train,accuracy_test,recall_test,precission_test
4,XGBoost,0.9,0.91,0.93,0.87,0.88,0.9
3,AdaBoost,0.84,0.86,0.88,0.84,0.86,0.88
2,ExtraTrees,1.0,1.0,1.0,0.82,0.86,0.85
7,LogisticRegression,0.85,0.87,0.89,0.82,0.84,0.86
5,RidgeRegression,0.85,0.88,0.88,0.79,0.83,0.83
0,DecisionTree,0.78,0.86,0.8,0.77,0.85,0.79
6,KNN,0.84,0.9,0.85,0.76,0.84,0.78
1,RandomForest,0.62,1.0,0.62,0.61,1.0,0.61
