In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from datetime import datetime
def countDays(row):
    date_format = "%Y-%m-%d"
    a = datetime.strptime(row["launched"][:10], date_format)
    b = datetime.strptime(row["deadline"], date_format)
    delta = b - a
    return delta.days
    

In [20]:
def evaluate(H, Y, beta=1.0):
    tp = sum((Y == H) * (Y == 1) * 1)
    tn = sum((Y == H) * (Y == 0) * 1)
    fp = sum((Y != H) * (Y == 0) * 1)
    fn = sum((Y != H) * (Y == 1) * 1)
    
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (fp + tn)
    precision = tp / (tp + fp)
    recall = sensitivity
    f_score = ( (beta**2 + 1) * precision * recall) / (beta**2 * precision + recall)
    auc = (sensitivity + specificity) / 2
    youden = sensitivity - (1 - specificity)
    p_plus = sensitivity / (1 - specificity)
    p_minus = (1 - sensitivity) / specificity
    dp = (np.sqrt(3) / np.pi) * (np.log(sensitivity/(1 - sensitivity) + np.log(specificity/(1 - specificity))))
    
    result = {}
    result["tp"] = tp
    result["tn"] = tn
    result["fp"] = fp
    result["fn"] = fn
    result["accuracy"] = accuracy
    result["sensitivity"] = sensitivity
    result["specificity"] = specificity
    result["precision"] = precision
    result["recall"] = recall
    result["f-score"] = f_score
    result["AUC"] = auc
    result["Youden"] = youden
    result["p+"] = p_plus
    result["p-"] = p_minus
    result["DP"] = dp
    
    return result

In [7]:
df = pd.read_csv('ks-projects-201801.csv')
data = df[(df.state == 'successful') | (df.state == 'failed') ]
data["days"] = data.apply(countDays, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
cleanData = data.drop(['ID', 'name','category','deadline','launched','pledged','usd pledged','goal', 'backers', 'usd_pledged_real'], 1)
dataBinary = cleanData.copy()
dataBinary['state'] = np.where(dataBinary.state=='successful', 1, 0)

In [12]:
one_hot = pd.get_dummies(dataBinary['main_category'])
dataBinary = dataBinary.join(one_hot)

one_hot = pd.get_dummies(dataBinary['currency'])
dataBinary = dataBinary.join(one_hot)

one_hot = pd.get_dummies(dataBinary['country'])
dataBinary = dataBinary.join(one_hot)


In [15]:
cleanDataBinary = dataBinary.drop(['main_category', 'currency','country'], 1)

In [16]:
OneHotData = cleanDataBinary.copy()
X_set = OneHotData.drop(['state'], 1)
y_set = OneHotData['state']

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_set, y_set, test_size=0.2, random_state=1)

In [21]:
def printClassifierName(model):
    print(type(model).__name__)

In [26]:
from time import time
def runClassifier(clf, X_train, y_train,X_test, y_test):
    print(f"** {printClassifierName(clf)}")
    t0 = time()
    clf.fit(X_train, y_train)
    t1 = time()
    print(f"\tTraining time:\t\t{t1-t0:3.3f}")
    score_train = clf.score(X_train[0:50000], Y_train[0:50000])
    t2 = time()
    print(f"\tPrediction time(train):\t{t2-t1:3.3f}")
    score_test = clf.score(X_test, y_test)
    t3 = time()
    print(f"\tPrediction time(test):\t{t3-t2:3.3f}")
    print(f"\tScore Train: {score_train:.3f}\tScore Test: {score_test:.3f}")

In [27]:
from sklearn.linear_model import LogisticRegression
LogisticRegression = LogisticRegression()
runClassifier(LogisticRegression, X_train, y_train, X_test, y_test)

** <function printClassifierName at 0x00000232A5861E18>




	Training time:		1.200


NameError: name 'Y_train' is not defined