###### Summary
1. Load Modules and Data
2. Initial Exploration
3. Descriptive Statistics
4. Data Preparation
5. Model Development
6. Principal Component Analysis
7. Light GBM v2: Model selection (best hyperparameters) using gridsearchcv
8. Light GBM version 3
9. Model Evaluation

# **1. Load Modules and Data**

In [None]:
import pandas as pd
import numpy as np
import string
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.tools as tls
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import warnings
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgb
from lightgbm import LGBMClassifier

# Load data
# The plain Kaggle export lives at '../input/ks-projects-201801.csv'; this
# notebook reads the extended file instead (see the message printed below).
print('Use my extended dataset containing complexity analysis (NLP) of the NAME feature.')
extended_csv_path = '../input/kickstarter2018nlp/ks-projects-201801-extra.csv'
df = pd.read_csv(extended_csv_path, encoding='latin1')
print('done')


# 2. Initial Exploration

In [None]:
# First look at the raw frame. Each summary is printed explicitly because a
# notebook cell only renders its final expression, so the bare `df.head(5)` /
# `df.describe()` calls in the middle of the cell produced no output at all.
print(df.head(5))
print(df.describe())
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df.info()
print(df.nunique())
print('done')

# 3. Descriptive Statistics

In [None]:
# Share of projects in each state, as a percentage rounded to two decimals.
state_counts = df["state"].value_counts()
n_projects = len(df["state"])
percentual_success = round(state_counts / n_projects * 100, 2)

print("State Percentual in %: ")
print(percentual_success)

# Same figures again, kept under their own name for the pie chart below.
state = round(df["state"].value_counts() / len(df["state"]) * 100, 2)

labels, values = list(state.index), list(state.values)

trace1 = go.Pie(
    labels=labels,
    values=values,
    marker={'colors': ['red']},
)

layout = go.Layout(
    title='Distribuition of States',
    legend={'orientation': "h"},
)

fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
print('done')

In [None]:
# Exploring the distribution logarithm of these values


def _log_plus_one_head(amounts):
    """Return log(amounts + 1), limited to the first 100000 rows."""
    return np.log(amounts + 1).head(100000)


# NOTE: despite its name, df_failed holds every non-successful outcome
# (failed, canceled and suspended), not only the "failed" rows.
df_failed = df[df["state"].isin(["failed", "canceled", "suspended"])]
df_sucess = df[df["state"] == "successful"]

# Bottom-left panel: goal amounts over all projects.
trace0 = go.Histogram(
    x=_log_plus_one_head(df.usd_goal_real),
    histnorm='probability',
    showlegend=False,
    xbins={'start': -5.0, 'end': 19.0, 'size': 1},
    autobiny=True,
)

# Bottom-right panel: pledged amounts over all projects.
trace1 = go.Histogram(
    x=_log_plus_one_head(df.usd_pledged_real),
    histnorm='probability',
    showlegend=False,
    xbins={'start': -1.0, 'end': 17.0, 'size': 1},
)

# Top panel: goal amounts split by outcome, drawn on top of each other.
x1 = _log_plus_one_head(df_failed['usd_goal_real'])
x2 = _log_plus_one_head(df_sucess["usd_goal_real"])

overlay_style = {'opacity': 0.60, 'nbinsx': 30, 'histnorm': 'probability'}
trace3 = go.Histogram(x=x1, name='Goals Failed', **overlay_style)
trace4 = go.Histogram(x=x2, name='Goals Successful', **overlay_style)

# `data` and `layout` are not passed to the subplot figure built below; they
# are kept because they are notebook-level names.
data = [trace0, trace1, trace3, trace4]
layout = go.Layout(barmode='overlay')

# 2x2 grid whose first row is a single panel spanning both columns.
grid_specs = [[{'colspan': 2}, None], [{}, {}]]
panel_titles = ('Successful and Non-Successful Projects', 'Goal', 'Pledged')
fig = tls.make_subplots(rows=2, cols=2, specs=grid_specs,
                        subplot_titles=panel_titles)

# Traces are appended in the same order as before so the figure's trace list
# (and therefore the legend order) is unchanged.
for trace, row, col in ((trace0, 2, 1), (trace1, 2, 2),
                        (trace3, 1, 1), (trace4, 1, 1)):
    fig.append_trace(trace, row, col)

fig['layout'].update(title="Distributions",
                     height=500, width=900, barmode='overlay')
iplot(fig)
print('done')

In [None]:
# Project counts per main category: overall, and split into successful
# projects versus every other final outcome (failed, canceled, suspended).
is_non_successful = df["state"].isin(["failed", "canceled", "suspended"])
is_successful = df["state"] == "successful"

main_cats = df["main_category"].value_counts()
main_cats_failed = df.loc[is_non_successful, "main_category"].value_counts()
main_cats_sucess = df.loc[is_successful, "main_category"].value_counts()

# One bar trace per series.
trace0 = go.Bar(x=main_cats_failed.index, y=main_cats_failed.values, name="Failed")
trace1 = go.Bar(x=main_cats_sucess.index, y=main_cats_sucess.values, name="Success")
trace2 = go.Bar(x=main_cats.index, y=main_cats.values, name="Distribution")

# Two panels on the first row, one full-width panel on the second.
grid_specs = [[{}, {}], [{'colspan': 2}, None]]
panel_titles = ('Non-Successful', 'Successful', "General Category's")
fig = tls.make_subplots(rows=2, cols=2, specs=grid_specs,
                        subplot_titles=panel_titles)

for trace, row, col in ((trace0, 1, 1), (trace1, 1, 2), (trace2, 2, 1)):
    fig.append_trace(trace, row, col)

fig['layout'].update(showlegend=True, title="Main Category's Distribuition", bargap=0.05)
iplot(fig)

# Plain matplotlib frequency charts for the remaining categorical columns.
for column_name in ('main_category', 'currency', 'country', 'state'):
    df[column_name].value_counts().plot.bar()
    plt.show()
print('done')

# 4. Data Preparation

In [None]:
# Keep only the four outcomes that are mapped to a label further down; rows in
# any other state are discarded.
df = df[df["state"].isin(["failed", "canceled", "suspended", "successful"])].copy()

print(df.shape)

# The message lists exactly the columns dropped on the next line (the CSV
# column is spelled 'usd pledged', and 'backers' is dropped as well).
print("Delete columns 'ID', 'name', 'usd pledged', 'usd_pledged_real', 'backers'. name is string. usd pledged only contains N/A. usd_pledged_real and backers are only known once the outcome is known.")
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.x no longer accepts.
df = df.drop(columns=['ID', 'name', 'usd pledged', 'usd_pledged_real', 'backers'])

print(df.shape)


print("Create new column 'duration_days' = 'deadline' - 'launched'")
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])
# `.dt.days` yields the whole-day part of each timedelta; the previous
# `astype('timedelta64[D]')` cast is rejected by pandas 2.x.
df['duration_days'] = (df['deadline'] - df['launched']).dt.days

print('drop columns: launched, deadline and pledged as they are only set once the state is final.')
df = df.drop(columns=['launched', 'deadline', 'pledged'])

# Restrict to goals between 1,000 and 100,000 (both bounds inclusive).
df = df[df['goal'].between(1000, 100000)].copy()
print(df.shape)

# Binary target: 1 = successful, 0 = any other final outcome.
df['state'] = df['state'].map({
    'failed': 0,
    'canceled': 0,
    'suspended': 0,
    'successful': 1,
})

print('use one-hot-codding for category, main_category and currency')
# A single call encodes the columns in the listed order, producing the same
# column layout as four consecutive get_dummies calls.
df = pd.get_dummies(df, columns=['category', 'main_category', 'currency', 'country'])

print("Rename 'main_category_Film & Video' to 'main_category_Film' to avoid character encoding issues")
df.rename(columns={"main_category_Film & Video": "main_category_Film"}, inplace=True)
print('done')

In [None]:
print('drop target variable from train/test datasets')
print(df.shape)
# Separate the label from the feature matrix.
y = df['state']
print(y.shape)
# `columns=` replaces the positional axis argument (`drop('state', 1)`), which
# pandas 2.x no longer accepts.
df = df.drop(columns='state')

print('Split dataframe into random train and test subsets')
# 90% / 10% split; the fixed seed makes the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df,
    y,
    test_size=0.1,
    random_state=42,
)

print('train data shape')
print(X_train.shape, Y_train.shape)
print('test data shape')
print(X_test.shape, Y_test.shape)
print('done')

# 5. Model Development

## Light GBM

In [None]:
# LGBM
lgbm_settings = {
    'n_estimators': 300,
    'num_leaves': 30,
    'colsample_bytree': .8,
    'subsample': .8,
    'max_depth': 10,
    'reg_alpha': .1,
    'reg_lambda': .05,
    'min_split_gain': .005,
}
clf_lgbm = LGBMClassifier(**lgbm_settings)

# Both splits are monitored with AUC while fitting.
# NOTE(review): the test split is among the sets that drive early stopping,
# so the accuracy reported below is not a fully independent estimate.
# NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments look
# tied to older LightGBM releases (newer ones use callbacks) - confirm against
# the installed version.
monitored_sets = [(X_train, Y_train), (X_test, Y_test)]
clf_lgbm.fit(
    X_train,
    Y_train,
    eval_set=monitored_sets,
    eval_metric='auc',
    verbose=0,
    early_stopping_rounds=30,
)

# Test-split score as a percentage with two decimals.
acc_clf_lgbm = round(clf_lgbm.score(X_test, Y_test) * 100, 2)
acc_clf_lgbm

## Catboost

In [None]:
# catboost (70.8)
from catboost import CatBoostClassifier

catboost_settings = {
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
    'logging_level': 'Silent',
}
catmodel = CatBoostClassifier(**catboost_settings)
# NOTE(review): the test split is handed to fit() as eval_set - confirm it
# does not influence which iteration the final model keeps.
catmodel.fit(X_train, Y_train, eval_set=(X_test, Y_test), plot=False)

# From here on `catmodel` is the test-split score in percent, no longer the
# fitted model; the results table reads it under this name.
catmodel = round(catmodel.score(X_test, Y_test) * 100, 2)
catmodel

## Adaboost

In [None]:
# adaboost (69.92)
# Boost 200 depth-8 decision trees with the SAMME algorithm.
depth8_tree = DecisionTreeClassifier(max_depth=8)
bdt = AdaBoostClassifier(depth8_tree, algorithm="SAMME", n_estimators=200)
bdt.fit(X_train, Y_train)

# Test-split score as a percentage with two decimals.
acc_bdt = round(bdt.score(X_test, Y_test) * 100, 2)
acc_bdt

## Logistic Regression

In [None]:
# Logistic Regression (64.7)
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
acc_log = round(logreg.score(X_test, Y_test) * 100, 2)

# Pair every fitted coefficient with the feature it was fitted on. The names
# come from X_train, the frame the model saw. The earlier
# `df.columns.delete(0)` assumed the target was still the first column, but
# 'state' has already been dropped, so it removed a real feature and shifted
# every coefficient onto the wrong name.
coeff_df = pd.DataFrame({
    'Feature': X_train.columns,
    # Column name kept for compatibility; the values are model coefficients.
    'Correlation': logreg.coef_[0],
})
# Keep the sorted frame; sort_values returns a new frame rather than sorting
# in place.
coeff_df = coeff_df.sort_values(by='Correlation', ascending=False)

acc_log

## KNN

In [None]:
# KNN (65.04)
# k-nearest-neighbours with the library's default settings.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
knn_test_score = knn.score(X_test, Y_test)
acc_knn = round(knn_test_score * 100, 2)  # percentage, two decimals
acc_knn

## Support Vector Machines

In [None]:
# Linear SVC/SupportVectorMachine (66.11 failed to converge)
# Default-configured linear SVM; the note above records that the solver did
# not converge on this data.
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
linear_svc_test_score = linear_svc.score(X_test, Y_test)
acc_linear_svc = round(linear_svc_test_score * 100, 2)  # percentage, two decimals
acc_linear_svc

## Decision Tree

In [None]:
# Decision Tree (66.02)
# Single decision tree with the library's default settings.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
tree_test_score = decision_tree.score(X_test, Y_test)
acc_decision_tree = round(tree_test_score * 100, 2)  # percentage, two decimals
acc_decision_tree

## Random Forest

In [None]:
# random forest (67.13)
# Forest of 100 trees; no seed is fixed, so the score can vary between runs.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
forest_test_score = random_forest.score(X_test, Y_test)
acc_random_forest = round(forest_test_score * 100, 2)  # percentage, two decimals
acc_random_forest

## Gradient Boosting

In [None]:
# gradient boosting (67.16)
# 100 boosting stages of depth-1 trees, seeded for reproducibility.
gb_settings = {'n_estimators': 100, 'max_depth': 1, 'random_state': 0}
clf_gb = GradientBoostingClassifier(**gb_settings)
clf_gb.fit(X_train, Y_train)
gb_test_score = clf_gb.score(X_test, Y_test)
acc_clf_gb = round(gb_test_score * 100, 2)  # percentage, two decimals
acc_clf_gb

## Multi-layer perceptron

In [None]:
# multi-layer perceptron (64.71)
# Two hidden layers (21 and 2 units), L-BFGS solver, fixed seed.
mlp_settings = {
    'solver': 'lbfgs',
    'alpha': 1e-5,
    'hidden_layer_sizes': (21, 2),
    'random_state': 1,
}
mlp = MLPClassifier(**mlp_settings)
mlp.fit(X_train, Y_train)
mlp_test_score = mlp.score(X_test, Y_test)
acc_mlp = round(mlp_test_score * 100, 2)  # percentage, two decimals
acc_mlp

## Bagging classifier

In [None]:
# bagging classifier (63.7)
# Bag distance-weighted 8-nearest-neighbour models; each one is fitted on
# half of the samples and all of the features.
knn_base = KNeighborsClassifier(n_neighbors=8, weights='distance')
bagging = BaggingClassifier(
    knn_base,
    oob_score=True,
    max_samples=0.5,
    max_features=1.0,
)
# clf_bag keeps whatever fit() returns and is the object scored below.
clf_bag = bagging.fit(X_train, Y_train)
bag_test_score = clf_bag.score(X_test, Y_test)
acc_clf_bag = round(bag_test_score * 100, 2)  # percentage, two decimals
acc_clf_bag

# 6. Principal Component Analysis

In [None]:
# Feature selection and PCA - Simple variance baseline approach

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

print('there is a high number of variables in the dataset:')
print(df.shape)

# Feature selection: drop every feature whose variance is below
# 0.8 * (1 - 0.8) = 0.16.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
selected_features = sel.fit_transform(df)

print("standardize the remaining features")
# Actually perform the standardization announced above. PCA picks directions
# of largest variance, so without scaling the components are dominated by
# whichever columns have the largest numeric range.
dfpca = pd.DataFrame(StandardScaler().fit_transform(selected_features))

print('apply PCA, 5 components')
pca = PCA(n_components=5)  # arbitrary number
principalComponents = pca.fit_transform(dfpca)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5'])

# NOTE(review): selection, scaling and PCA are fitted on all rows before the
# split, so the test rows influence the transformation. Fitting them on the
# training rows only would give a cleaner estimate.

# Split dataframe into random train and test subsets.
# principalDf has a fresh 0..n-1 index while y keeps df's index; the split is
# positional, so the rows still line up one-to-one.
Xpca_train, Xpca_test, Ypca_train, Ypca_test = train_test_split(
    principalDf,
    y,
    test_size=0.1,
    random_state=42,
)

print('PCA reduces the number of variables to:')
print(Xpca_train.shape, Ypca_train.shape)
print(Xpca_test.shape, Ypca_test.shape)
print('done')

## Light GBM with PCA

In [None]:
# LGBM with pca

lgbm_pca_settings = {
    'n_estimators': 300,
    'num_leaves': 30,
    'colsample_bytree': .8,
    'subsample': .8,
    'max_depth': 10,
    'reg_alpha': .1,
    'reg_lambda': .05,
    'min_split_gain': .005,
}
clfpca_lgbm = LGBMClassifier(**lgbm_pca_settings)

# Both PCA splits are monitored with AUC while fitting.
# NOTE(review): the test split is among the sets that drive early stopping,
# so the accuracy below is not a fully independent estimate.
pca_monitored_sets = [(Xpca_train, Ypca_train), (Xpca_test, Ypca_test)]
clfpca_lgbm.fit(
    Xpca_train,
    Ypca_train,
    eval_set=pca_monitored_sets,
    eval_metric='auc',
    verbose=0,
    early_stopping_rounds=30,
)

# From here on `clfpca_lgbm` is the test-split score in percent, no longer
# the fitted model; the results table reads it under this name.
clfpca_lgbm = round(clfpca_lgbm.score(Xpca_test, Ypca_test) * 100, 2)
clfpca_lgbm

## Catboost with PCA

In [None]:
# catboost with PCA
from catboost import CatBoostClassifier

catboost_pca_settings = {
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
    'logging_level': 'Silent',
}
catmodelpca = CatBoostClassifier(**catboost_pca_settings)
# NOTE(review): the test split is handed to fit() as eval_set - confirm it
# does not influence which iteration the final model keeps.
catmodelpca.fit(Xpca_train, Ypca_train, eval_set=(Xpca_test, Ypca_test), plot=False)

# From here on `catmodelpca` is the test-split score in percent, no longer
# the fitted model; the results table reads it under this name.
catmodelpca = round(catmodelpca.score(Xpca_test, Ypca_test) * 100, 2)
catmodelpca

## Bagging classifier with PCA

In [None]:
# bagging classifier with PCA
# Same bagged distance-weighted 8-NN setup as before, fitted on the PCA
# components instead of the full feature matrix.
knn_base_pca = KNeighborsClassifier(n_neighbors=8, weights='distance')
baggingpca = BaggingClassifier(
    knn_base_pca,
    oob_score=True,
    max_samples=0.5,
    max_features=1.0,
)
# clf_bagpca keeps whatever fit() returns and is the object scored below.
clf_bagpca = baggingpca.fit(Xpca_train, Ypca_train)
bagpca_test_score = clf_bagpca.score(Xpca_test, Ypca_test)
acc_clf_bagpca = round(bagpca_test_score * 100, 2)  # percentage, two decimals
acc_clf_bagpca

# 7. Cross-validation using gridsearchcv

## Light GBM

In [None]:
print('LGBM: model selection (best hyperparameters) using gridsearchcv.')

from sklearn.model_selection import GridSearchCV

# Hyperparameters are tuned on TRAINING rows only. Previously the search, the
# final fit and the evaluation all used the test split, so the reported
# accuracy was measured on the very rows the model had been trained on.
# The search runs on a training sample the size of the test split, which
# keeps its cost the same as before (128 combinations x 4 folds).
X_search = X_train.sample(n=min(len(X_train), len(X_test)), random_state=42)
Y_search = Y_train.loc[X_search.index]

# The final booster is trained on the full training split.
train_data = lgb.Dataset(X_train, label=Y_train)

lgbm_mdl = LGBMClassifier(
    n_estimators=300,
    num_leaves=30,
    colsample_bytree=.8,
    subsample=.8,
    max_depth=10,
    reg_alpha=.1,
    reg_lambda=.05,
    min_split_gain=.005
)

gridParams = {
    'learning_rate': [0.07, 0.1],
    'n_estimators': [100, 300],
    'num_leaves': [20, 50],
    'random_state': [501, 42],
    'colsample_bytree': [0.6, 0.8],
    'subsample': [0.6, 0.8],
    'max_depth': [10, 20]
}
grid = GridSearchCV(lgbm_mdl, gridParams, verbose=2, cv=4, n_jobs=-1)
grid.fit(X_search, Y_search)
print(grid.best_params_)
print(grid.best_score_)

params = {
    # lgb.train() defaults to a regression objective; 'binary' makes
    # predict() return class-1 probabilities, which the 0.5 threshold below
    # expects.
    'objective': 'binary',
    # Every tuned value (learning_rate, n_estimators, num_leaves,
    # random_state, colsample_bytree, subsample, max_depth). The former
    # trailing duplicate 'colsample_bytree': .8 silently overrode the tuned
    # value and has been removed.
    **grid.best_params_,
    # Fixed values, identical to the estimator that was searched.
    'reg_alpha': .1,
    'reg_lambda': .05,
    'min_split_gain': .005,
}

print('train model using the best hyperparameters')
# `verbose_eval` is dropped: no validation set is passed, so it had nothing
# to report, and newer LightGBM releases no longer accept it.
lgbm = lgb.train(params, train_data)

print('predict using the test set')
predictions_lgbm_prob = lgbm.predict(X_test)
predictions_lgbm_01 = np.where(predictions_lgbm_prob > 0.5, 1, 0)

# Accuracy on the held-out test split, in percent with two decimals.
acc_lgbm_v2 = round(accuracy_score(Y_test, predictions_lgbm_01) * 100, 2)
print(acc_lgbm_v2)
print('done')

In [None]:
print('Feature importance')
# Show at most 21 features of the grid-search booster, ranked by the 'split'
# importance type.
importance_plot_options = {'max_num_features': 21, 'importance_type': 'split'}
lgb.plot_importance(lgbm, **importance_plot_options)

# 9. Model Evaluation

In [None]:
# Summary table of every model's test-split score (percent), best first.
# Each score sits next to its label, so a name can no longer drift out of
# step with its value the way two parallel lists can. The multi-layer
# perceptron score (acc_mlp) was computed earlier but missing from the table;
# it is now included.
# NOTE: catmodel, clfpca_lgbm and catmodelpca hold scores, not models - their
# cells rebind those names to the rounded score.
model_scores = [
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Linear SVC (SVM)', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
    ('BaggingClassifier', acc_clf_bag),
    ('AdaBoostClassifier', acc_bdt),
    ('GradientBoostingClassifier', acc_clf_gb),
    ('MLPClassifier', acc_mlp),
    ('LGBMClassifier', acc_clf_lgbm),
    ('CatBoost', catmodel),
    ('LGBMClassifier PCA', clfpca_lgbm),
    ('CatBoost PCA', catmodelpca),
    ('Bagging PCA', acc_clf_bagpca),
    ('Light GBM with gridsearchcv', acc_lgbm_v2),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)