In [1]:
import pandas as pd
import seaborn as sns
import numpy as np, operator
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 32, 16

from scipy.stats import norm
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('KS_test_data.csv', delimiter=',')
df_train = pd.read_csv('KS_train_data.csv', delimiter=',')
df_train = df_train.dropna()

# Length of name feature

In [3]:
#length of name
temp = []
for i in df_train.name:
    a = len(i)
    b = np.log(a)
    temp.append(b)
se = pd.Series(temp)
df_train['length_name'] = se.values

In [4]:
#length of name
temp = []
for i in df.name:
    a = len(i)
    b = np.log(a)
    temp.append(b)
se = pd.Series(temp)
df['length_name'] = se.values

# Daily Requirements feature 

In [5]:
#days between deadline and launched_at
temp = []
for i, j in zip(df_train.deadline, df_train.launched_at):
    unix_timestamp = df_train.launched_at
    utc_time = datetime.fromtimestamp(i)
    unix_timestamp1 = df_train.deadline
    utc_time1 = datetime.fromtimestamp(j)
    temp.append((utc_time - utc_time1).days)
se = pd.Series(temp)
df_train['deadline_launched_at'] = se.values

In [6]:
#days between deadline and launched_at
temp = []
for i, j in zip(df.deadline, df.launched_at):
    unix_timestamp = df.launched_at
    utc_time = datetime.fromtimestamp(i)
    unix_timestamp1 = df.deadline
    utc_time1 = datetime.fromtimestamp(j)
    temp.append((utc_time - utc_time1).days)
se = pd.Series(temp)
df['deadline_launched_at'] = se.values

In [7]:
df_daily_requirement = pd.DataFrame()
df_daily_requirement['goal'] = df_train['goal']
df_daily_requirement['deadline_launched_at'] = df_train['deadline_launched_at']
df_daily_requirement['result'] = df_daily_requirement['goal']/df_daily_requirement['deadline_launched_at']
df_train['log_daily_requirement'] = np.log(df_daily_requirement['result'])

In [8]:
df_daily_requirement = pd.DataFrame()
df_daily_requirement['goal'] = df['goal']
df_daily_requirement['deadline_launched_at'] = df['deadline_launched_at']
df_daily_requirement['result'] = df_daily_requirement['goal']/df_daily_requirement['deadline_launched_at']
df['log_daily_requirement'] = np.log(df_daily_requirement['result'])

In [9]:
print(df.columns)
print(df_train.columns)

Index(['project_id', 'backers_count', 'blurb', 'category', 'country',
       'created_at', 'currency', 'deadline', 'fx_rate', 'goal', 'launched_at',
       'name', 'staff_pick', 'location', 'subcategory', 'project_url',
       'reward_url', 'length_name', 'deadline_launched_at',
       'log_daily_requirement'],
      dtype='object')
Index(['project_id', 'backers_count', 'blurb', 'category',
       'converted_pledged_amount', 'country', 'created_at', 'currency',
       'deadline', 'fx_rate', 'goal', 'launched_at', 'name', 'pledged',
       'staff_pick', 'usd_pledged', 'location', 'funded', 'subcategory',
       'project_url', 'reward_url', 'length_name', 'deadline_launched_at',
       'log_daily_requirement'],
      dtype='object')


# Funded on words feature

In [10]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
cachedStopWords = stopwords.words("english")

words_funded = {}

for i in df_train[df_train['funded'] == True]['blurb']:
    words = i.split()
    for word in words:
        if word.lower() in cachedStopWords:
            pass
        else:
            if word in words_funded:
                words_funded[word] += 1
            else:
                words_funded[word] = 1
                
words_not_funded = {}

for i in df_train[df_train['funded'] == False]['blurb']:
    words = i.split()
    for word in words:
        try:
            if word.lower() in cachedStopWords:
                pass
            else:
                if word in words_not_funded:
                    words_not_funded[word] += 1
                else:
                    words_not_funded[word] = 1
        except:
            pass
#for key, value in sorted(words_funded.iteritems(), key=lambda (k,v): (v,k), reverse=True):
#    print "%s: %s" % (key, value)
#for key, value in sorted(words_funded.iteritems(), key=lambda (k,v): (v,k), reverse=True):
#    print "%s: %s" % (key, value)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\woute\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
words_funded = {k: v for k, v in words_funded.items() if v > 10}
ratio_funded_unfunded = (float(len(df_train[df_train['funded'] == True]['funded']))) / (float(len(df_train[df_train['funded'] == False]['funded'])))

In [12]:
list_ratio = []
for i in df['blurb']:
    try:
        i = i.split()
    except:
        #assume it will fail if blurb is null (in a sense)
        list_ratio.append(0.01)
        continue
    count = 0
    ratio_number = 0 
    for word in i:
        if word in words_funded and word in words_not_funded:
                count += 1
                ratio_number += (float(words_funded[word]) /float(ratio_funded_unfunded)) / float(words_not_funded[word])
        else:
            count += 1
            ratio_number += 1
    ratio_number = ratio_number / count
    list_ratio.append(ratio_number)

In [13]:
list_ratio = []
for i in df_train['blurb']:
    try:
        i = i.split()
    except:
        #assume it will fail if blurb is null (in a sense)
        list_ratio.append(0.01)
        continue
    count = 0
    ratio_number = 0 
    for word in i:
        if word in words_funded and word in words_not_funded:
                count += 1
                ratio_number += (float(words_funded[word]) /float(ratio_funded_unfunded)) / float(words_not_funded[word])
        else:
            count += 1
            ratio_number += 1
    ratio_number = ratio_number / count
    list_ratio.append(ratio_number)

In [14]:
df_train['funded_on_words'] = list_ratio

In [15]:
list_ratio = []
for i in df['blurb']:
    try:
        i = i.split()
    except:
        #assume it will fail if blurb is null (in a sense)
        list_ratio.append(0.01)
        continue
    count = 0
    ratio_number = 0 
    for word in i:
        if word in words_funded and word in words_not_funded:
                count += 1
                ratio_number += (float(words_funded[word]) /float(ratio_funded_unfunded)) / float(words_not_funded[word])
        else:
            count += 1
            ratio_number += 1
    ratio_number = ratio_number / count
    list_ratio.append(ratio_number)

In [16]:
df['funded_on_words'] = list_ratio

In [17]:
print(df_train.columns)
print(df.columns)

Index(['project_id', 'backers_count', 'blurb', 'category',
       'converted_pledged_amount', 'country', 'created_at', 'currency',
       'deadline', 'fx_rate', 'goal', 'launched_at', 'name', 'pledged',
       'staff_pick', 'usd_pledged', 'location', 'funded', 'subcategory',
       'project_url', 'reward_url', 'length_name', 'deadline_launched_at',
       'log_daily_requirement', 'funded_on_words'],
      dtype='object')
Index(['project_id', 'backers_count', 'blurb', 'category', 'country',
       'created_at', 'currency', 'deadline', 'fx_rate', 'goal', 'launched_at',
       'name', 'staff_pick', 'location', 'subcategory', 'project_url',
       'reward_url', 'length_name', 'deadline_launched_at',
       'log_daily_requirement', 'funded_on_words'],
      dtype='object')


# Splitting Data

In [18]:
features = ['staff_pick', 'funded_on_words', 'length_name', 'log_daily_requirement']

X = df_train[features]
y = df_train['funded']

In [19]:

from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import metrics
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
kf.get_n_splits(X)

sum_train_score = 0
sum_test_score = 0
model_list = []
score_list_train = []
score_list_test = []


KFold(n_splits=2, random_state=None, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    sum_train_score += model.score(X_train, y_train)
    sum_test_score += model.score(X_test, y_test)
    #print(model.score(X_train, y_train))
    #print(model.score(X_test, y_test))
    prediction_train = model.predict(X_train)
    prediction_test = model.predict(X_test)
    #print(model.predict(X_train))
    #print(model.predict(X_test))
    model_list.append(model)
    score_list_train.append(model.score(X_train, y_train))
    score_list_test.append(model.score(X_test, y_test))
    
    
for i,model in enumerate(model_list):
    print("model {} scores: train {}\ttest {}".format(i, score_list_train[i], score_list_test[i]))
    
print("avg_train", sum_train_score/10)
print("avg_test", sum_test_score/10)


TRAIN: [10000 10001 10002 ... 99992 99993 99994] TEST: [   0    1    2 ... 9997 9998 9999]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [10000 10001 10002 ... 19997 19998 19999]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [20000 20001 20002 ... 29997 29998 29999]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [30000 30001 30002 ... 39997 39998 39999]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [40000 40001 40002 ... 49997 49998 49999]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [50000 50001 50002 ... 59996 59997 59998]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [59999 60000 60001 ... 69995 69996 69997]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [69998 69999 70000 ... 79994 79995 79996]
TRAIN: [    0     1     2 ... 99992 99993 99994] TEST: [79997 79998 79999 ... 89993 89994 89995]
TRAIN: [    0     1     2 ... 89993 89994 89995] TEST: [89996 89997 89998 ... 99992 99993 99994]
model 0 scores: train 0.748441580087

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Logistic regression

In [21]:
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

prediction_train = model.predict(X_train)
print(prediction_train)
prediction_test = model.predict(X_test)
print(prediction_test)

confusion_matrix_train = metrics.confusion_matrix(prediction_train, y_train)
confusion_matrix_test = metrics.confusion_matrix(prediction_test, y_test)

print("confusion matrix train\n", confusion_matrix_train)
print("confusion matrix  test\n", confusion_matrix_test)

(79996, 4) (79996,)
(19999, 4) (19999,)
0.7458997949897495
0.7430871543577179
[ True False  True ... False  True False]
[False  True False ...  True  True  True]
confusion matrix train
 [[22564  9799]
 [10528 37105]]
confusion matrix  test
 [[5739 2426]
 [2712 9122]]


### Save predictions logistic regression

In [22]:
list_predictions = model.predict(df[features])

In [23]:
se = pd.Series(list_predictions)
df['funded_prediction'] = se.values

In [24]:
df.to_csv('predictions.tsv', sep='\t', columns=['project_id','funded_prediction'])

### Confusion matrix logistic regression

In [25]:
#train
TN_train = confusion_matrix_train[0][0]
FN_train = confusion_matrix_train[1][0]
TP_train = confusion_matrix_train[1][1]
FP_train = confusion_matrix_train[0][1]

total_train = confusion_matrix_train.sum()
actual_train_yes = TP_train + FN_train
actual_train_no = TN_train + FP_train
predicted_train_yes = FP_train + TP_train

#test
TN_test = confusion_matrix_test[0][0]
FN_test = confusion_matrix_test[1][0]
TP_test = confusion_matrix_test[1][1]
FP_test = confusion_matrix_test[0][1]

total_test = confusion_matrix_test.sum()
actual_test_yes = TP_test + FN_test
actual_test_no = TN_test + FP_test
predicted_test_yes = FP_test + TP_test

In [26]:
#train
print("Accuracy: Overall, how often is the classifier correct?", ((TP_train+TN_train)/total_train))
print("Misclassification Rate: Overall, how often is it wrong?", ((FP_train+FN_train)/total_train))
print("True Positive Rate: When it's actually yes, how often does it predict yes?", (TP_train/actual_train_yes))
print("False Positive Rate: When it's actually no, how often does it predict yes?", (FP_train/actual_train_no))
print("Specificity: When it's actually no, how often does it predict no?", (TN_train/actual_train_no))
print("Precision: When it predicts yes, how often is it correct?", (TP_train/predicted_train_yes))
print("Prevalence: How often does the yes condition actually occur in our sample?", actual_train_yes/total_train)
print("\n")
#test
print("Accuracy: Overall, how often is the classifier correct?", ((TP_test+TN_test)/total_test))
print("Misclassification Rate: Overall, how often is it wrong?", ((FP_test+FN_test)/total_test))
print("True Positive Rate: When it's actually yes, how often does it predict yes?", (TP_test/actual_test_yes))
print("False Positive Rate: When it's actually no, how often does it predict yes?", (FP_test/actual_test_no))
print("Specificity: When it's actually no, how often does it predict no?", (TN_test/actual_test_no))
specificity = TN_test/actual_test_no
print("Precision: When it predicts yes, how often is it correct?", (TP_test/predicted_test_yes))
precision = (TP_test/predicted_test_yes)
print("Prevalence: How often does the yes condition actually occur in our sample?", actual_test_yes/total_test)

Accuracy: Overall, how often is the classifier correct? 0.7458997949897495
Misclassification Rate: Overall, how often is it wrong? 0.2541002050102505
True Positive Rate: When it's actually yes, how often does it predict yes? 0.7789767598093759
False Positive Rate: When it's actually no, how often does it predict yes? 0.3027840435064734
Specificity: When it's actually no, how often does it predict no? 0.6972159564935265
Precision: When it predicts yes, how often is it correct? 0.791083916083916
Prevalence: How often does the yes condition actually occur in our sample? 0.5954422721136057


Accuracy: Overall, how often is the classifier correct? 0.7430871543577179
Misclassification Rate: Overall, how often is it wrong? 0.2569128456422821
True Positive Rate: When it's actually yes, how often does it predict yes? 0.770829812404935
False Positive Rate: When it's actually no, how often does it predict yes? 0.29712186160440907
Specificity: When it's actually no, how often does it predict no? 0

### Estimation logistic regression

In [27]:
total_true_predictions = df[df['funded_prediction'] == True]['funded_prediction'].count()
print(total_true_predictions)
total_false_predictions = df[df['funded_prediction'] == False]['funded_prediction'].count()
print(total_false_predictions)

specificity_estimate = (specificity * total_false_predictions)
print("Specificity: When it's actually no, how often does it predict no? ", specificity_estimate, "/", total_false_predictions)
precision_estimate = (precision * total_true_predictions)
print("Precision: When it predicts yes, how often is it correct? ", precision_estimate, "/", total_true_predictions )


estimate = specificity_estimate + precision_estimate
print("estimate: ", int(round(estimate)))

46859
31206
Specificity: When it's actually no, how often does it predict no?  21934.015186772813 / 31206
Precision: When it predicts yes, how often is it correct?  37014.876861794255 / 46859
estimate:  58949


# Predicting funded feature

In [28]:
list_predictions = model.predict(df[features])

In [29]:
se = pd.Series(list_predictions)
df['funded_prediction'] = se.values

In [30]:
df.funded_prediction.describe()

count     78065
unique        2
top        True
freq      46859
Name: funded_prediction, dtype: object

In [31]:
ratio_funded_unfunded_comparison = (float(len(df[df['funded_prediction'] == True]['funded_prediction']))) / (float(len(df[df['funded_prediction'] == False]['funded_prediction'])))


In [32]:
ratio_funded_unfunded_comparison

1.5016022559764148

# KNN WITH EVOLUTIONARY COMPUTING

In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
def k_near_algorithm(one_population):
    neigh = KNeighborsClassifier(n_neighbors=one_population[0], algorithm=one_population[2], metric=one_population[3], weights=one_population[1])
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    return score

In [35]:
def rankParameters(population):
    global fitnessResults
    fitnessResults = {}

    for i in range(len(population)):
        fitnessResults[i] = k_near_algorithm(population[i])
    return sorted(fitnessResults.items(), key = operator.itemgetter(1), reverse = True)
    

In [36]:
def selection(popRanked, eliteSize):
    selectionResults = []
    df = pd.DataFrame(np.array(popRanked), columns=["Index", "Fitness"])
    df['cum_sum'] = df.Fitness.cumsum()
    df['cum_perc'] = 100 * df.cum_sum / df.Fitness.sum()

    for i in range(0, eliteSize):
        selectionResults.append(popRanked[i][0])
    for i in range(0, len(popRanked) - eliteSize):
        pick = 100 * random.random()
        for i in range(0, len(popRanked)):
            if pick <= df.iat[i, 3]:
                selectionResults.append(popRanked[i][0])
                break
    return selectionResults

In [37]:
def matingPool(population, selectionResults):
    matingpool = []
    for i in range(0, len(selectionResults)):
        index = selectionResults[i]
        matingpool.append(population[index])
    return matingpool

In [38]:
def breed(parent1, parent2):
    child = []
    
    
    first_gene = [parent1[0], parent2[0]]
    second_gene = [parent1[1], parent2[1]]
    third_gene = [parent1[2], parent2[2]]
    fourth_gene = [parent1[3], parent2[3]]

    child.append(random.sample(first_gene, 1)[0])
    child.append(random.sample(second_gene, 1)[0])
    child.append(random.sample(third_gene, 1)[0])
    child.append(random.sample(fourth_gene, 1)[0])
    
    
    return child

In [39]:
def breedPopulation(matingpool, eliteSize):
    children = []
    length = len(matingpool) - eliteSize
    pool = random.sample(matingpool, len(matingpool))

    for i in range(0, eliteSize):
        children.append(matingpool[i])

    for i in range(0, length):
        child = breed(pool[i], pool[len(matingpool) - i - 1])
        children.append(child)
    return children

In [40]:
def mutate(individual, mutationRate, population):
    mutated_child = []
    for i in range(len(population)):
        if (random.random() < mutationRate):
            element_changed = int(random.random() * len(individual))
            random_sample = random.sample(population, 1)[0]

            if element_changed == 0:
                mutated_child.append(int(random.random() * 100 + 1))
                mutated_child.append(individual[1])
                mutated_child.append(individual[2])
                mutated_child.append(individual[3])
            elif element_changed == 1:
                mutated_child.append(individual[0])
                mutated_child.append(random_sample[element_changed])
                mutated_child.append(individual[2])
                mutated_child.append(individual[3])
            elif element_changed == 2:
                mutated_child.append(individual[0])
                mutated_child.append(individual[1])
                mutated_child.append(random_sample[element_changed])
                mutated_child.append(individual[3])
            else:
                mutated_child.append(individual[0])
                mutated_child.append(individual[1])
                mutated_child.append(individual[2])
                mutated_child.append(random_sample[element_changed])
        break
        
    if not mutated_child:
        #print('individual ', individual)
        return individual
    else:
        #print('mutated_child ', mutated_child)
        return mutated_child

In [41]:
def mutatePopulation(population, mutationRate):
    mutatedPop = []
    
    for i in range(0, len(population)):
        mutated = mutate(population[i], mutationRate, population)
        mutatedPop.append(mutated)
    return mutatedPop

In [42]:
def nextGeneration(currentGen, eliteSize, mutationRate):
    popRanked = rankParameters(currentGen) #popRanked is list of 25 tuples with sorted best fit with each fit corresponding index value
    selectionResults = selection(popRanked, eliteSize) # A list of sorted index values of best fit, based on preference
    matingpool = matingPool(currentGen, selectionResults) #currentGen is the original data --> parameter_list,                                   
                                                          #so mating pool is ordered list of parameters with best fit value
    children = breedPopulation(matingpool, eliteSize)
    nextGeneration = mutatePopulation(children, mutationRate)
    
    return nextGeneration

In [43]:
def geneticAlgorithm(population, eliteSize, mutationRate, generations):
    pop = population
    print ('Initial score: ', str(rankParameters(population)[0][1]))
    for i in range(0, generations):
        pop = nextGeneration(pop, eliteSize, mutationRate)

    print("Final score: " + str(rankParameters(pop)[0][1]))
    bestParameterIndex = rankParameters(pop)[0][0]
    bestParameter = pop[bestParameterIndex]
    print(bestParameter)

In [44]:
def geneticAlgorithmPlot(population, eliteSize, mutationRate, generations):
    pop = population
    progress = []
    progress.append((rankParameters(population)[0][1]))

    for i in range(0, generations):
        pop = nextGeneration(pop, eliteSize, mutationRate)
        progress.append((rankParameters(population)[0][1]))
        print (i)

    plt.plot(progress)
    plt.ylabel('fitness')
    plt.xlabel('Generation')
    plt.show()

In [45]:
# CREATE POPULATION
populationSize = 20
import random

n_neighbours = range(3, 100)
weights = ['uniform']
algorithm = ['ball_tree', 'kd_tree', 'auto']
metric = ['euclidean', 'manhattan', 'chebyshev']

parameter_list = []

while ((len(parameter_list) < populationSize)):
    for i in range(populationSize):
        new_list = []
        new_list.append(random.sample(n_neighbours, 1)[0])
        new_list.append(random.sample(weights, 1)[0])
        new_list.append(random.sample(algorithm, 1)[0])
        new_list.append(random.sample(metric, 1)[0])
        neigh = KNeighborsClassifier(n_neighbors=new_list[0], algorithm=new_list[2], metric=new_list[3], weights=new_list[1])
        neigh.fit(X_train, y_train)
        try:
            if neigh.score(X_test, y_test) < 0.95:
                parameter_list.append(new_list)
        except:
            print('Not enough memory')
        
parameter_list = parameter_list[:populationSize]

### Running evolutionary computing model

In [46]:
geneticAlgorithm(population=parameter_list, eliteSize=2, mutationRate=0.1, generations=10)

Initial score:  0.744037201860093
Final score: 0.744037201860093
[86, 'uniform', 'kd_tree', 'manhattan']


### Running KNN model with evolutionary computing parameters outputs

In [47]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
neigh = KNeighborsClassifier(n_neighbors=38, algorithm="auto", metric="manhattan", weights="uniform")
neigh.fit(X_train, y_train)
print("train score", neigh.score(X_train, y_train))
print("test  score", neigh.score(X_test, y_test))

train score 0.7550752537626881
test  score 0.7489374468723436


### Save predictions knn + evolutionary computing

In [48]:
list_predictions = neigh.predict(df[features])
se = pd.Series(list_predictions)
df['funded_prediction_knn'] = se.values

In [49]:
df.to_csv('predictions_knn+evolutionary_computing.tsv', sep='\t', columns=['project_id','funded_prediction_knn'])

### Confusion matric KNN + evolutionary computing

In [50]:
prediction_train = neigh.predict(X_train)
prediction_test = neigh.predict(X_test)
print("accuracy train", metrics.accuracy_score(prediction_train, y_train))
print("accuracy  test", metrics.accuracy_score(prediction_test, y_test))
confusion_matrix_train_knn = metrics.confusion_matrix(prediction_train, y_train)
confusion_matrix_test_knn = metrics.confusion_matrix(prediction_test, y_test)
print("confusion matrix train\n", confusion_matrix_train_knn)
print("confusion matrix  test\n", confusion_matrix_test_knn)

#train
TN_train_knn = confusion_matrix_train_knn[0][0]
FN_train_knn = confusion_matrix_train_knn[1][0]
TP_train_knn = confusion_matrix_train_knn[1][1]
FP_train_knn = confusion_matrix_train_knn[0][1]

total_train_knn = confusion_matrix_train.sum()
actual_train_yes_knn = TP_train_knn + FN_train_knn
actual_train_no_knn = TN_train_knn + FP_train_knn
predicted_train_yes_knn = FP_train_knn + TP_train_knn

#test
TN_test_knn = confusion_matrix_test_knn[0][0]
FN_test_knn = confusion_matrix_test_knn[1][0]
TP_test_knn = confusion_matrix_test_knn[1][1]
FP_test_knn = confusion_matrix_test_knn[0][1]

total_test_knn = confusion_matrix_test.sum()
actual_test_yes_knn = TP_test_knn + FN_test_knn
actual_test_no_knn = TN_test_knn + FP_test_knn
predicted_test_yes_knn = FP_test_knn + TP_test_knn

accuracy train 0.7550752537626881
accuracy  test 0.7489374468723436
confusion matrix train
 [[23163  9464]
 [10129 37240]]
confusion matrix  test
 [[5655 2425]
 [2596 9323]]


In [51]:
#train
print("Accuracy: Overall, how often is the classifier correct?", ((TP_train_knn+TN_train_knn)/total_train_knn))
print("Misclassification Rate: Overall, how often is it wrong?", ((FP_train_knn+FN_train_knn)/total_train_knn))
print("True Positive Rate: When it's actually yes, how often does it predict yes?", (TP_train_knn/actual_train_yes_knn))
print("False Positive Rate: When it's actually no, how often does it predict yes?", (FP_train_knn/actual_train_no_knn))
print("Specificity: When it's actually no, how often does it predict no?", (TN_train_knn/actual_train_no_knn))
print("Precision: When it predicts yes, how often is it correct?", (TP_train_knn/predicted_train_yes_knn))
print("Prevalence: How often does the yes condition actually occur in our sample?", actual_train_yes_knn/total_train_knn)
print("\n")
#test
print("Accuracy: Overall, how often is the classifier correct?", ((TP_test_knn+TN_test_knn)/total_test_knn))
print("Misclassification Rate: Overall, how often is it wrong?", ((FP_test_knn+FN_test_knn)/total_test_knn))
print("True Positive Rate: When it's actually yes, how often does it predict yes?", (TP_test_knn/actual_test_yes_knn))
print("False Positive Rate: When it's actually no, how often does it predict yes?", (FP_test_knn/actual_test_no_knn))
print("Specificity: When it's actually no, how often does it predict no?", (TN_test_knn/actual_test_no_knn))
specificity = TN_test_knn/actual_test_no_knn
print("Precision: When it predicts yes, how often is it correct?", (TP_test_knn/predicted_test_yes_knn))
precision = (TP_test/predicted_test_yes)
print("Prevalence: How often does the yes condition actually occur in our sample?", actual_test_yes_knn/total_test_knn)

Accuracy: Overall, how often is the classifier correct? 0.7550752537626881
Misclassification Rate: Overall, how often is it wrong? 0.24492474623731186
True Positive Rate: When it's actually yes, how often does it predict yes? 0.7861681690557115
False Positive Rate: When it's actually no, how often does it predict yes? 0.29006650933276124
Specificity: When it's actually no, how often does it predict no? 0.7099334906672388
Precision: When it predicts yes, how often is it correct? 0.7973621103117506
Prevalence: How often does the yes condition actually occur in our sample? 0.5921421071053553


Accuracy: Overall, how often is the classifier correct? 0.7489374468723436
Misclassification Rate: Overall, how often is it wrong? 0.2510625531276564
True Positive Rate: When it's actually yes, how often does it predict yes? 0.7821964929943788
False Positive Rate: When it's actually no, how often does it predict yes? 0.3001237623762376
Specificity: When it's actually no, how often does it predict no

### Estimation KNN + evolutionary computing

In [52]:
total_true_predictions = df[df['funded_prediction'] == True]['funded_prediction'].count()
print(total_true_predictions)
total_false_predictions = df[df['funded_prediction'] == False]['funded_prediction'].count()
print(total_false_predictions)

specificity_estimate = (specificity * total_false_predictions)
print("Specificity: When it's actually no, how often does it predict no? ", specificity_estimate, "/", total_false_predictions)
precision_estimate = (precision * total_true_predictions)
print("Precision: When it predicts yes, how often is it correct? ", precision_estimate, "/", total_true_predictions )

estimate = specificity_estimate + precision_estimate
print("estimate: ", int(round(estimate)))

46859
31206
Specificity: When it's actually no, how often does it predict no?  21840.33787128713 / 31206
Precision: When it predicts yes, how often is it correct?  37014.876861794255 / 46859
estimate:  58855


# Most important features

In [53]:
from sklearn.feature_selection import RFE
svm = LogisticRegression()
# create the RFE model for the svm classifier 
# and select attributes
rfe = RFE(svm, 1)
rfe = rfe.fit(X_test, y_test)
# print summaries for the selection of attributes
print(rfe.support_)
print(rfe.ranking_)

[False  True False False]
[2 1 3 4]
