## Importing Datasets and Libraries

In [57]:
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import scipy
%matplotlib inline
import matplotlib.pyplot as plt
import random
from sklearn.metrics import f1_score
import collections
import operator

In [2]:
adult_train_data = pd.read_csv("adult_traing_set.csv")
adult_test_data = pd.read_csv("adult_testing_set.csv")
# this column is only in the test set so we can drop it
adult_test_data = adult_test_data.drop(columns={'Holand-Netherlands'})

In [3]:
for i in adult_test_data.columns:
    if i in adult_train_data.columns:
        continue
    else:
        print(i)

In [4]:
len(adult_train_data)

15060

## Machine Learning Models

### KNN

In [5]:
# Todo: fix hard-coded col name
def knn(k, train_set, validation_set):
    y_train = train_set['annual-income']
    x_train = train_set.loc[:, train_set.columns != 'annual-income']

    n_neighbors = k
    
    ## create KNN instance
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    
    ## fit the model
    knn_model.fit(x_train, y_train)
    
    ## Make prediction
    y_pred = knn_model.predict(x_train)
    acc = np.mean(y_pred == y_train)
        
    ## Checking on Validation Set
    # Divide validation set into input and actual label
    y_vald = validation_set['annual-income']
    x_vald = validation_set.loc[:, validation_set.columns != 'annual-income']
    
    # use the model created above to predict label
    y_pred_vald = knn_model.predict(x_vald)
    
    # acc_new is a measure of accuracy 
    acc_new = round(np.mean(y_pred_vald == y_vald), 4)
    
    # Todo: check which to remove acc_new or accuracy_score
    accuracy_score = metrics.accuracy_score(y_vald, y_pred_vald)
    
    return acc_new, y_pred_vald, accuracy_score
    

### Decision Tree

In [6]:
def decision_tree(k, train_set, validation_set):
    y_train = train_set['annual-income']
    x_train = train_set.loc[:, train_set.columns != 'annual-income']
    dec_model = DecisionTreeClassifier(max_depth = k)
    dec_model.fit(x_train, y_train)
    y_pred = dec_model.predict(x_train)
    
    acc = np.mean(y_pred == y_train)
    #print("Train Set Error", f'{acc:.4f}') 
    
    y_vald = validation_set['annual-income']
    x_vald = validation_set.loc[:, validation_set.columns != 'annual-income']
    
    y_pred_vald = dec_model.predict(x_vald)
    acc_new = round(np.mean(y_pred_vald == y_vald), 4)
    
    accuracy_score = metrics.accuracy_score(y_vald, y_pred_vald)
    
    
    #print("Validation Set Error", acc_new)
    
    return acc_new, y_pred_vald, accuracy_score
    

## K-Cross Validation

#### Dividing the DataFrames

In [7]:
# divide the dataframe - this is used for cross validation
def divide_dataframe(k, df):
    ## getting total rows in the training set
    length = len(df)
    segment_size = int(length/k)
    
    array_df = []
    new_test = []
    x = 0
    
    while(x < length):
        array_df.append(df.iloc[x: x+segment_size])
        x+= segment_size
        
    return array_df

#### Cross Validation

In [38]:
## Cross Validation
def cross_validation(k, train_data):
    
    '''
    1. Get Arrays by dividing the set
    2. Assign one row to the validation set
    3. Assign the rest to the training set
    '''
    knn_final_score = 0
    dt_final_score = 0
    best_k = []
    df_array = divide_dataframe(k, train_data)
    knn_score = []
    knn_scores_list = collections.defaultdict(dict)
    dt_score = []
    dt_scores_list = collections.defaultdict(dict)
    
    for i in range(len(df_array)):
        
        validation_set = df_array[i]
        train_set = pd.DataFrame()
        
        for df in range(len(df_array)):
            if df == i:
                continue
            else:
                train_set = train_set.append(df_array[df])
        knn_error_counter = 0
        best_kk = 0
        for neigh in range(1,13):
            knscore, error, accuracy_score = knn(neigh, train_set, validation_set)
            knn_scores_list[i][neigh] = knscore
            if (knscore > knn_error_counter):
                knn_error_counter = knscore
                best_kk = neigh
            
            knn_score.append(accuracy_score)
            
        knn_final_score += knn_error_counter
        best_k.append(best_kk)
        
        ## Getting the total scores and plotting the graph
        print("KNN: ",i, knn_scores_list)
#         plt.plot(range(1,26), knn_score)
#         plt.xlabel('# Neighbours')
#         plt.ylabel('Testing Accuracy')
        dtt_error_counter = 0
        best_dtt = 0
        for depp in range(1,13):
            dttscore, error2, accuracy_score2 = decision_tree(depp, train_set, validation_set)
            dt_scores_list[i][depp] = dttscore
            if (dttscore > dtt_error_counter):
                dtt_error_counter = dttscore
                best_dtt = depp
            
            dt_score.append(accuracy_score2)
            
        dt_final_score += dtt_error_counter
        best_k.append(best_dtt)
        print("Decision Trees: ",i, dt_scores_list)
        print("\n")
    
    knn_final_score = round(knn_final_score/k,2)
    dec_final_score = dt_final_score/k
    
    print("KNN Score is: ", knn_final_score)
    print("Decision Tree Score is: ", dec_final_score)
    #print("k list is: ", best_k)
    return knn_score, knn_scores_list, knn_final_score, dec_final_score, dt_score, dt_scores_list



In [80]:
## This block of code varies size of subsets of the training/validation data 

size_of_dataset = 0.2
final_knn_score = []
final_knn_scores_list = {}
final_dt_score = []
final_dt_scores_list = {}
final_error_score_knn = 0
final_error_score_dtt = 0
while(size_of_dataset <= 1):
    size_of_dataset = round(size_of_dataset,2)
    new_size = int(len(adult_train_data) * size_of_dataset)
    print(new_size)
    new_dataset = adult_train_data[:new_size+1]
    print("Starting Cross Validation for : ", size_of_dataset)
    knn_score, knn_scores_list, knn_final_score, dec_final_score, dt_score, dt_scores_list = cross_validation(5, new_dataset)
    if(size_of_dataset == 1):
        final_knn_score = knn_score
        final_knn_scores_list = knn_scores_list
        final_dt_score = dt_score
        final_dt_scores_list = dt_scores_list
        final_error_score_knn = knn_final_score
        final_error_score_dtt = dec_final_score
    size_of_dataset += 0.2

3012
Starting Cross Validation for :  0.2
KNN:  0 defaultdict(<class 'dict'>, {0: {1: 0.6761, 2: 0.7658, 3: 0.7309, 4: 0.7708, 5: 0.7641, 6: 0.7807, 7: 0.7841, 8: 0.7757, 9: 0.7741, 10: 0.7774, 11: 0.7791, 12: 0.7757}})
Decision Trees:  0 defaultdict(<class 'dict'>, {0: {1: 0.7591, 2: 0.8156, 3: 0.8256, 4: 0.8306, 5: 0.8488, 6: 0.8422, 7: 0.8439, 8: 0.8405, 9: 0.8355, 10: 0.8355, 11: 0.8322, 12: 0.8256}})


KNN:  1 defaultdict(<class 'dict'>, {0: {1: 0.6761, 2: 0.7658, 3: 0.7309, 4: 0.7708, 5: 0.7641, 6: 0.7807, 7: 0.7841, 8: 0.7757, 9: 0.7741, 10: 0.7774, 11: 0.7791, 12: 0.7757}, 1: {1: 0.6894, 2: 0.7625, 3: 0.7508, 4: 0.7625, 5: 0.7492, 6: 0.7608, 7: 0.7641, 8: 0.7608, 9: 0.7658, 10: 0.7625, 11: 0.7625, 12: 0.7591}})
Decision Trees:  1 defaultdict(<class 'dict'>, {0: {1: 0.7591, 2: 0.8156, 3: 0.8256, 4: 0.8306, 5: 0.8488, 6: 0.8422, 7: 0.8439, 8: 0.8405, 9: 0.8355, 10: 0.8355, 11: 0.8322, 12: 0.8256}, 1: {1: 0.7292, 2: 0.7807, 3: 0.8123, 4: 0.8189, 5: 0.8239, 6: 0.8223, 7: 0.8206, 8:

KNN:  3 defaultdict(<class 'dict'>, {0: {1: 0.6971, 2: 0.7701, 3: 0.7452, 4: 0.7801, 5: 0.7643, 6: 0.7801, 7: 0.7734, 8: 0.7776, 9: 0.7734, 10: 0.7784, 11: 0.7759, 12: 0.7776}, 1: {1: 0.6929, 2: 0.7693, 3: 0.7519, 4: 0.7793, 5: 0.7726, 6: 0.7817, 7: 0.7759, 8: 0.7859, 9: 0.7834, 10: 0.7826, 11: 0.7817, 12: 0.7801}, 2: {1: 0.6888, 2: 0.7793, 3: 0.7361, 4: 0.7801, 5: 0.7519, 6: 0.7909, 7: 0.7768, 8: 0.7934, 9: 0.7925, 10: 0.7909, 11: 0.7967, 12: 0.7876}, 3: {1: 0.688, 2: 0.771, 3: 0.7386, 4: 0.7685, 5: 0.761, 6: 0.7784, 7: 0.7676, 8: 0.7751, 9: 0.7743, 10: 0.7759, 11: 0.7751, 12: 0.7751}})
Decision Trees:  3 defaultdict(<class 'dict'>, {0: {1: 0.7444, 2: 0.7983, 3: 0.81, 4: 0.8332, 5: 0.8241, 6: 0.8266, 7: 0.8307, 8: 0.8373, 9: 0.8398, 10: 0.8315, 11: 0.8274, 12: 0.8249}, 1: {1: 0.7568, 2: 0.8033, 3: 0.8249, 4: 0.8382, 5: 0.8556, 6: 0.8498, 7: 0.8481, 8: 0.8465, 9: 0.8448, 10: 0.8415, 11: 0.844, 12: 0.8465}, 2: {1: 0.7668, 2: 0.805, 3: 0.8257, 4: 0.8266, 5: 0.8349, 6: 0.844, 7: 0.8531, 8

Decision Trees:  5 defaultdict(<class 'dict'>, {0: {1: 0.7504, 2: 0.7991, 3: 0.8185, 4: 0.8268, 5: 0.8284, 6: 0.8307, 7: 0.8323, 8: 0.8318, 9: 0.8351, 10: 0.8257, 11: 0.8218, 12: 0.8246}, 1: {1: 0.7615, 2: 0.8046, 3: 0.8262, 4: 0.8312, 5: 0.8412, 6: 0.8428, 7: 0.8428, 8: 0.8467, 9: 0.8506, 10: 0.839, 11: 0.8367, 12: 0.8445}, 2: {1: 0.7454, 2: 0.7858, 3: 0.8046, 4: 0.8229, 5: 0.845, 6: 0.834, 7: 0.8467, 8: 0.8495, 9: 0.8484, 10: 0.8445, 11: 0.8445, 12: 0.8329}, 3: {1: 0.7698, 2: 0.8157, 3: 0.8279, 4: 0.834, 5: 0.8478, 6: 0.8484, 7: 0.8594, 8: 0.8633, 9: 0.8589, 10: 0.8533, 11: 0.845, 12: 0.8484}, 4: {1: 0.7471, 2: 0.7869, 3: 0.8041, 4: 0.8163, 5: 0.8207, 6: 0.8257, 7: 0.8273, 8: 0.8323, 9: 0.8334, 10: 0.8301, 11: 0.8296, 12: 0.8329}, 5: {1: 0.5, 2: 0.5, 3: 0.5, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0}})


KNN Score is:  0.89
Decision Tree Score is:  1.04638
12048
Starting Cross Validation for :  0.8
KNN:  0 defaultdict(<class 'dict'>, {0: {1: 0.6924, 2:

KNN:  2 defaultdict(<class 'dict'>, {0: {1: 0.7062, 2: 0.7766, 3: 0.7414, 4: 0.7819, 5: 0.7613, 6: 0.7905, 7: 0.7776, 8: 0.7908, 9: 0.7885, 10: 0.7945, 11: 0.7925, 12: 0.7922}, 1: {1: 0.7218, 2: 0.7825, 3: 0.7566, 4: 0.7865, 5: 0.7759, 6: 0.7835, 7: 0.7815, 8: 0.7892, 9: 0.7888, 10: 0.7952, 11: 0.7938, 12: 0.7908}, 2: {1: 0.6969, 2: 0.7686, 3: 0.7437, 4: 0.7742, 5: 0.7546, 6: 0.7809, 7: 0.7706, 8: 0.7855, 9: 0.7815, 10: 0.7859, 11: 0.7825, 12: 0.7869}})
Decision Trees:  2 defaultdict(<class 'dict'>, {0: {1: 0.7533, 2: 0.8025, 3: 0.8194, 4: 0.833, 5: 0.839, 6: 0.8453, 7: 0.8459, 8: 0.845, 9: 0.8426, 10: 0.841, 11: 0.841, 12: 0.8413}, 1: {1: 0.7533, 2: 0.7948, 3: 0.8127, 4: 0.829, 5: 0.8413, 6: 0.8476, 7: 0.8456, 8: 0.8523, 9: 0.8516, 10: 0.8499, 11: 0.8519, 12: 0.8509}, 2: {1: 0.7576, 2: 0.7971, 3: 0.8164, 4: 0.8254, 5: 0.837, 6: 0.8396, 7: 0.839, 8: 0.8433, 9: 0.8433, 10: 0.844, 11: 0.8403, 12: 0.84}})


KNN:  3 defaultdict(<class 'dict'>, {0: {1: 0.7062, 2: 0.7766, 3: 0.7414, 4: 0.781

In [102]:
# 5 fold cross validaion 
# Every validation has 12 K values so
kk_best_neighbors = {}
for i in range(0,5):
    for j in range(1,13):
            if(j in kk_best_neighbors):
                kk_best_neighbors[j] += final_knn_scores_list[i][j] 
            else:
                kk_best_neighbors[j] = final_knn_scores_list[i][j]

for k in range(1,13):
    kk_best_neighbors[k] = round(kk_best_neighbors[k]/5,2)

    
print(kk_best_neighbors)
    
dt_best_neighbors = {}
for i in range(0,5):
    for j in range(1,13):
            if(j in dt_best_neighbors):
                dt_best_neighbors[j] += final_dt_scores_list[i][j]
            else:
                dt_best_neighbors[j] = final_dt_scores_list[i][j]

for k in range(1,13):
    dt_best_neighbors[k] = round(dt_best_neighbors[k]/5,2)
    
print(dt_best_neighbors)

{1: 0.71, 2: 0.77, 3: 0.74, 4: 0.78, 5: 0.76, 6: 0.78, 7: 0.77, 8: 0.79, 9: 0.78, 10: 0.79, 11: 0.79, 12: 0.79}
{1: 0.75, 2: 0.8, 3: 0.82, 4: 0.83, 5: 0.84, 6: 0.85, 7: 0.85, 8: 0.85, 9: 0.85, 10: 0.85, 11: 0.85, 12: 0.85}


In [103]:
## So 10 looks good for KNN
## and depth 8 looks good for Decision Trees

## The final K-NN error Test
knn_error_score = 0
for i in range(0,5):
    knn_error_score += knn_scores_list[i][10] 
print("Final Error with 10 K: ", round(knn_error_score/5,2))

dt_error_score = 0
for i in range(0,5):
    dt_error_score += dt_scores_list[i][8] 
print("Final Error with depth of 8 with Decision Trees: ", round(dt_error_score/5,2))

Final Error with 10 K:  0.79
Final Error with depth of 8 with Decision Trees:  0.85


## Running on our Test Set

In [None]:
### Now that we have our paramaters 
### We run on our test train set

In [84]:
knn_error, knn_prediction, acccuracy_test = knn(10, adult_train_data, adult_test_data)
print(knn_error, knn_prediction, acccuracy_test)

0.7866 ['<=50K' '<=50K' '<=50K' ... '<=50K' '<=50K' '>50K'] 0.7865526158742789


In [78]:
dt_error, dt_prediction, test = decision_tree(8, adult_train_data, adult_test_data)

## Evaluation

In [79]:
print("KNN F1 Score: ", f1_score(adult_test_data['annual-income'],knn_prediction , average=None,labels = ['<=50K', '>50K']))
print("Decision Tree F1 Score: ", f1_score(adult_test_data['annual-income'],dt_prediction , average=None,labels = ['<=50K', '>50K']))

KNN F1 Score:  [0.87331759 0.32260101]
Decision Tree F1 Score:  [0.90504369 0.6710575 ]


## Few Tests

#### From the dataset we can see that we have wayy more <=50K than we do >50K so lets try to make them even

In [None]:
def evenly_distributed(sample):
    df = pd.DataFrame()
    less_than_50 = sample.loc[sample['annual-income'] == '>50K']
    greater_than_50 = sample.loc[sample['annual-income'] == '<=50K']
    z = np.random.choice(greater_than_50.index.values, len(less_than_50))
    final_greater_than_50 = greater_than_50.loc[z]
    
    df = df.append([less_than_50, final_greater_than_50], ignore_index=True)
    df = df.sample(frac=1).reset_index(drop=True)
    
    return df
    
    
    
    
# print(self.reviews[0])

In [None]:
new_train_data = evenly_distributed(adult_train_data)
new_test_data = evenly_distributed(adult_test_data)

### Now testing Everything again

In [None]:
knn_error_one, knn_prediction_one = knn(new_train_data, new_test_data)

In [None]:
dt_error_one, dt_prediction_one = decision_tree(new_train_data, new_test_data)

In [None]:
print("KNN F1 Score: ", f1_score(new_test_data['annual-income'],knn_prediction_one , average=None,labels = ['<=50K', '>50K']))
print("Decision Tree F1 Score: ", f1_score(new_test_data['annual-income'],dt_prediction_one , average=None,labels = ['<=50K', '>50K']))


In [35]:
test = collections.defaultdict(dict)
test[2][2] = 3

{2: 3}