In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from datetime import datetime
def countDays(row):
    """Return the number of whole days from a row's launch date to its deadline.

    ``row`` must support lookup by the string keys ``"launched"`` and
    ``"deadline"``. Only the first ten characters of ``row["launched"]`` are
    parsed (``YYYY-MM-DD``); ``row["deadline"]`` is parsed in full with the
    same format. The result is negative when the deadline precedes the launch.
    """
    fmt = "%Y-%m-%d"
    launch_day = datetime.strptime(row["launched"][:10], fmt)
    deadline_day = datetime.strptime(row["deadline"], fmt)
    return (deadline_day - launch_day).days
    

In [None]:
def evaluate(H, Y, beta=1.0):
    """Compute binary-classification metrics from predictions and true labels.

    Parameters
    ----------
    H : array-like
        Predicted labels (0 or 1).
    Y : array-like
        True labels (0 or 1), same length as ``H``.
    beta : float, default 1.0
        Weight of recall relative to precision in the F-score.

    Returns
    -------
    dict
        Keys ``"tp"``, ``"tn"``, ``"fp"``, ``"fn"``, ``"accuracy"``,
        ``"sensitivity"``, ``"specificity"``, ``"precision"``, ``"recall"``,
        ``"f-score"``, ``"AUC"``, ``"Youden"``, ``"p+"``, ``"p-"`` and ``"DP"``.

    Notes
    -----
    Denominators are not guarded: if a ratio has a zero denominator (for
    example no positive samples) numpy yields ``inf``/``nan`` for it.
    """
    # Work on plain arrays so the comparison is positional and boolean masks
    # can be combined with `&` (multiplying boolean pandas objects with `*`
    # triggers a numexpr "operator not supported for bool dtype" warning).
    predicted = np.asarray(H)
    actual = np.asarray(Y)

    agree = actual == predicted
    is_positive = actual == 1
    is_negative = actual == 0

    tp = np.sum(agree & is_positive)
    tn = np.sum(agree & is_negative)
    fp = np.sum(~agree & is_negative)
    fn = np.sum(~agree & is_positive)

    accuracy = (tp + tn) / (tp + fp + fn + tn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (fp + tn)
    precision = tp / (tp + fp)
    recall = sensitivity
    f_score = ((beta**2 + 1) * precision * recall) / (beta**2 * precision + recall)
    auc = (sensitivity + specificity) / 2
    youden = sensitivity - (1 - specificity)
    p_plus = sensitivity / (1 - specificity)
    p_minus = (1 - sensitivity) / specificity
    # Discriminant power is the SUM of the two log-odds terms. The previous
    # parenthesisation nested the second log inside the first one.
    dp = (np.sqrt(3) / np.pi) * (
        np.log(sensitivity / (1 - sensitivity))
        + np.log(specificity / (1 - specificity))
    )

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "accuracy": accuracy,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "precision": precision,
        "recall": recall,
        "f-score": f_score,
        "AUC": auc,
        "Youden": youden,
        "p+": p_plus,
        "p-": p_minus,
        "DP": dp,
    }

In [None]:
# Load the raw export and keep only the rows whose state is 'successful' or 'failed'.
df = pd.read_csv('ks-projects-201801.csv')
# .copy() makes `data` an independent frame, so adding the "days" column below
# does not assign into a slice of `df` (which raises SettingWithCopyWarning).
data = df[df.state.isin(['successful', 'failed'])].copy()
# Days between launch and deadline, computed row by row with countDays.
data["days"] = data.apply(countDays, axis=1)

In [5]:
# Drop the listed columns. `columns=` replaces the positional axis argument
# (`drop(labels, 1)`), which newer pandas versions no longer accept.
cleanData = data.drop(columns=['ID', 'name','category','deadline','launched','pledged','usd pledged','goal', 'backers', 'usd_pledged_real'])
dataBinary = cleanData.copy()
# Encode the target: 1 where state is 'successful', 0 otherwise.
dataBinary['state'] = np.where(dataBinary.state=='successful', 1, 0)

In [6]:
# One-hot encode each categorical column in turn and join the indicator
# columns onto dataBinary (the source columns themselves are kept here).
for categorical_column in ('main_category', 'currency', 'country'):
    one_hot = pd.get_dummies(dataBinary[categorical_column])
    dataBinary = dataBinary.join(one_hot)


In [7]:
# Remove the three source columns that were one-hot encoded. `columns=`
# replaces the positional axis argument that newer pandas versions reject.
cleanDataBinary = dataBinary.drop(columns=['main_category', 'currency','country'])

In [8]:
# Split the one-hot encoded frame into the feature matrix and the target column.
OneHotData = cleanDataBinary.copy()
# `columns=` replaces the positional axis argument that newer pandas versions reject.
X_set = OneHotData.drop(columns=['state'])
y_set = OneHotData['state']

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the one-hot encoded rows as the test set; the split is
# seeded with random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    X_set,
    y_set,
    test_size=0.2,
    random_state=1,
)

In [10]:
def printClassifierName(model):
    """Print the class name of ``model`` to stdout; returns ``None``."""
    model_class = type(model)
    print(model_class.__name__)

In [11]:
from time import time
def runClassifier(clf, X_train, y_train, X_test, y_test, train_eval_size=50000):
    """Fit ``clf`` and print its class name, timings and train/test scores.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    X_train, y_train : sliceable
        Training features and labels; ``clf`` is fitted on all of them.
    X_test, y_test
        Held-out features and labels used for the test score.
    train_eval_size : int or None, default 50000
        The training score is computed on the first ``train_eval_size``
        training samples only; ``None`` scores the whole training set.

    Returns
    -------
    None
        All results are printed.
    """
    # Format the class name directly. Interpolating printClassifierName(clf)
    # printed the name on its own line and then "** None", because that
    # helper prints and returns None.
    print(f"** {type(clf).__name__}")
    t0 = time()
    clf.fit(X_train, y_train)
    t1 = time()
    print(f"\tTraining time:\t\t{t1-t0:3.3f}")
    score_train = clf.score(X_train[:train_eval_size], y_train[:train_eval_size])
    t2 = time()
    print(f"\tPrediction time(train):\t{t2-t1:3.3f}")
    score_test = clf.score(X_test, y_test)
    t3 = time()
    print(f"\tPrediction time(test):\t{t3-t2:3.3f}")
    print(f"\tScore Train: {score_train:.3f}\tScore Test: {score_test:.3f}")

In [12]:
from sklearn.linear_model import LogisticRegression
# Bind the estimator to its own name. Assigning the instance to
# `LogisticRegression` replaced the imported class with an instance, so the
# class could not be instantiated again without re-importing it.
logRegOneHot = LogisticRegression()
runClassifier(logRegOneHot, X_train, y_train, X_test, y_test)

LogisticRegression
** None




	Training time:		1.075
	Prediction time(train):	0.031
	Prediction time(test):	0.047
	Score Train: 0.598	Score Test: 0.595


In [None]:
from sklearn.neural_network import MLPClassifier

# Fit and score one MLP per three-layer architecture of increasing width.
hidden_layer_size = [(100,) * 3, (200,) * 3, (500,) * 3]
for hls in hidden_layer_size:
    print(f"hidden_layer_sizes:{hls}")
    cls = MLPClassifier(hidden_layer_sizes=hls)
    runClassifier(cls, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Grid search over MLP hyperparameters with 3-fold cross-validation
# (n_jobs=-1), then report the training-set score and the best parameters.
mlp = MLPClassifier(max_iter=100)
parameter_space = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

In [None]:
from sklearn import preprocessing
def convertStringsToInt(X_set):
    """Label-encode, in place, every object-dtype column of ``X_set``.

    Each object-dtype column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column. Other columns are left
    untouched. ``X_set`` is modified directly and nothing is returned.
    """
    for column in X_set.columns:
        # Compare against `object` itself. `type(object)` evaluates to `type`,
        # which is not the object dtype; it appears to have matched only
        # because numpy coerces arbitrary classes to the object dtype.
        if X_set[column].dtype == object:
            le = preprocessing.LabelEncoder()
            X_set[column] = le.fit_transform(X_set[column])

In [None]:
# Build a second feature set in which the categorical columns are
# label-encoded (via convertStringsToInt) instead of one-hot encoded.
dataBinary2 = cleanData.copy()
dataBinary2['state'] = np.where(dataBinary2.state=='successful', 1, 0)
y_set2= dataBinary2['state']
# `columns=` replaces the positional axis argument that newer pandas versions reject.
X_set2 = dataBinary2.drop(columns=['state'])
convertStringsToInt(X_set2)
# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing any earlier split.
X_train, X_test, y_train, y_test = train_test_split(X_set2, y_set2, test_size=0.2, random_state=1)

In [None]:
# Only the (100, 100, 100) architecture is run here; the wider
# (200, 200, 200) and (500, 500, 500) variants are disabled.
hidden_layer_size = [(100,) * 3]
for hls in hidden_layer_size:
    print(f"hidden_layer_sizes:{hls}")
    cls = MLPClassifier(hidden_layer_sizes=hls)
    runClassifier(cls, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search restricted to the (50, 50, 50) architecture, 3-fold
# cross-validation; prints the training-set score and the best parameters.
mlp = MLPClassifier(max_iter=100)
parameter_space = dict(
    hidden_layer_sizes=[(50, 50, 50)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

In [None]:
# Display the class of the object currently bound to `clf`.
type(clf)

In [None]:
# Display the entries obtained by iterating clf.cv_results_, in sorted order
# (presumably the result keys of the grid search — confirm against scikit-learn docs).
sorted(clf.cv_results_)

In [18]:
from sklearn.neural_network import MLPClassifier
# Fit an MLPClassifier constructed with no arguments and print its
# timings and train/test scores via runClassifier.
mlp = MLPClassifier()
runClassifier(mlp,X_train, y_train, X_test, y_test)

MLPClassifier
** None
	Training time:		751.966
	Prediction time(train):	0.348
	Prediction time(test):	0.405
	Score Train: 0.648	Score Test: 0.651


In [21]:
# Score the MLP's test-set predictions with evaluate() and print the
# "Youden" entry (sensitivity - (1 - specificity)).
result = evaluate(mlp.predict(X_test), y_test)
print(result['Youden'])

0.2209184550206671


  .format(op=op_str, alt_op=unsupported[op_str]))


In [13]:
from sklearn.linear_model import LogisticRegression
# Fit a LogisticRegression constructed with no arguments and print its
# timings and train/test scores via runClassifier.
logReg = LogisticRegression()
runClassifier(logReg,X_train, y_train, X_test, y_test)

LogisticRegression
** None




	Training time:		1.185
	Prediction time(train):	0.047
	Prediction time(test):	0.023
	Score Train: 0.598	Score Test: 0.595


In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a KNeighborsClassifier constructed with no arguments and print its
# timings and train/test scores via runClassifier.
KNClassifier = KNeighborsClassifier()
runClassifier(KNClassifier,X_train, y_train, X_test, y_test)

KNeighborsClassifier
** None
	Training time:		6.764
	Prediction time(train):	13.009
	Prediction time(test):	16.964
	Score Train: 0.685	Score Test: 0.612


In [19]:
# Score the k-NN test-set predictions with evaluate() and print the
# "Youden" entry (sensitivity - (1 - specificity)).
result = evaluate(KNClassifier.predict(X_test), y_test)
print(result['Youden'])

0.18071371544578008


  .format(op=op_str, alt_op=unsupported[op_str]))


In [15]:
# Fit logReg again on the current X_train/y_train and print timings and scores.
runClassifier(logReg,X_train, y_train, X_test, y_test)

LogisticRegression
** None




	Training time:		1.166
	Prediction time(train):	0.043
	Prediction time(test):	0.053
	Score Train: 0.598	Score Test: 0.595


In [16]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest limited to depth-2 trees (seeded with random_state=0)
# and print its timings and train/test scores via runClassifier.
randFor = RandomForestClassifier(max_depth=2, random_state=0)
runClassifier(randFor,X_train, y_train, X_test, y_test)

RandomForestClassifier
** None




	Training time:		1.088
	Prediction time(train):	0.090
	Prediction time(test):	0.172
	Score Train: 0.602	Score Test: 0.599


In [20]:
# Score the random forest's test-set predictions with evaluate() and print
# the "Youden" entry (sensitivity - (1 - specificity)).
result = evaluate(randFor.predict(X_test), y_test)
print(result['Youden'])

0.010521852764188414


  .format(op=op_str, alt_op=unsupported[op_str]))


In [None]:
# Rebuild and refit the depth-2 random forest (random_state=0), print its
# timings and scores, then display predict_proba's output for X_test.
randFor = RandomForestClassifier(max_depth=2, random_state=0)
runClassifier(randFor,X_train, y_train, X_test, y_test)
randFor.predict_proba(X_test)

In [None]:
# Scratch check: round 0.52342 to one decimal place.
round(0.52342,1)

In [23]:
# Starts as an empty list; the MLP loop appends each fitted model to it and
# a later cell iterates over it. (The name's spelling is used by those cells.)
classfiers = []

In [24]:
from sklearn.neural_network import MLPClassifier
# Fit and score one single-hidden-layer MLP per width, keeping each fitted
# model in `classfiers`. Two-layer configurations are not run here.
hidden_layer_size = [(50,), (75,), (100,)]
for hls in hidden_layer_size:
    print(f"hidden_layer_sizes:{hls}")
    cls = MLPClassifier(hidden_layer_sizes=hls, solver='adam', activation='relu')
    runClassifier(cls, X_train, y_train, X_test, y_test)
    classfiers.append(cls)

hidden_layer_sizes:(50,)
MLPClassifier
** None
	Training time:		217.220
	Prediction time(train):	0.163
	Prediction time(test):	0.247
	Score Train: 0.539	Score Test: 0.540
hidden_layer_sizes:(75,)
MLPClassifier
** None
	Training time:		456.205
	Prediction time(train):	0.263
	Prediction time(test):	0.339
	Score Train: 0.602	Score Test: 0.599
hidden_layer_sizes:(100,)
MLPClassifier
** None




	Training time:		800.386
	Prediction time(train):	0.350
	Prediction time(test):	0.418
	Score Train: 0.602	Score Test: 0.599


In [None]:
# For every model stored in `classfiers`, score its test-set predictions with
# evaluate() and print the "Youden" entry.
# NOTE: the loop variable rebinds the global name `clf`.
for clf in classfiers:
    result = evaluate(clf.predict(X_test), y_test)
    print(result['Youden'])