<a href="https://colab.research.google.com/github/RyanUdeM/De-Bruijn-Graph/blob/master/Income_NB_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# url = '/content/gdrive/My Drive/Colab Notebooks/IFT6390/Project/Data/'
url = '/content/gdrive/My Drive/IFT6390 Final Project/Project/data/'
train_url = url + 'income_train.csv'
test_url = url + 'income_test.csv'

In [5]:
income_train = pd.read_csv(train_url, index_col=0)
income_test = pd.read_csv(test_url, index_col=0)
print("income training data: ", income_train.shape)
print("income testing  data: ", income_test.shape)
income_train.head()

income training data:  (32561, 15)
income testing  data:  (16281, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [0]:
X_train = income_train.iloc[:,:14]
y_train = income_train.iloc[:,-1]

X_test  = income_test.iloc[:,:-1]
y_test  = income_test.iloc[:,-1]

In [0]:
# rescale data to 0-1, otherwise SVC will take forever
# https://datascience.stackexchange.com/questions/989/svm-using-scikit-learn-runs-endlessly-and-never-completes-execution
scaling = MinMaxScaler(feature_range=(0,1)).fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

### **Income data is imbalance in training data [24720,  7841]**
###  we need to consider **F1** score rather than accuracy
 https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2

 https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

# **Income NB**

In [10]:
import sklearn.naive_bayes as nb 

NB_scores = {}

name_model_map = {
    'GaussianNB':nb.GaussianNB(),
    'MultinomialNB':nb.MultinomialNB(),
    'ComplementNB':nb.ComplementNB(),
    'BernoulliNB' :nb.BernoulliNB(), 
    }

for key,model in name_model_map.items():
    result_model = model.fit(X_train, y_train) 
    train_score = model.score(X_train,y_train)
    test_score  = model.score(X_test, y_test)
    y_train_pred = result_model.predict(X_train)
    y_test_pred  = result_model.predict(X_test)
    
    # report = classification_report(testY, predY)
    # precision, recall, f1 score
    precision_train, recall_train, f1_train, non = precision_recall_fscore_support(y_train, y_train_pred, average='weighted')
    precision_test , recall_test , f1_test , non = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
    print('----------- '+key+' --------------')
    print("train score: ", train_score)
    print("train f1   : ", f1_train)
    print("test socre:  ", test_score)
    print("test f1   :  ", f1_test)

    # save model performence into dict
    NB_scores[key] = (train_score, test_score, f1_train, f1_test)

# transform disc to pd dataframe for easy plot later
NB_scores = pd.DataFrame.from_dict(NB_scores, orient='index', columns=['train_score','test_score', 'train_f1','test_f1'])

----------- GaussianNB --------------
train score:  0.8057799207641043
train f1   :  0.7810499935257954
test socre:   0.8057858853878754
test f1   :   0.7810869467846596
----------- MultinomialNB --------------
train score:  0.7641042965510887
train f1   :  0.6668424587916032
test socre:   0.768994533505313
test f1   :   0.6737673194959116
----------- ComplementNB --------------
train score:  0.6852983630723872
train f1   :  0.7080633419959721
test socre:   0.6885326454149008
test f1   :   0.7119313091277791
----------- BernoulliNB --------------
train score:  0.7287245477718743
train f1   :  0.7453611611543444
test socre:   0.733615871260979
test f1   :   0.7503805120626919


In [11]:
NB_scores

Unnamed: 0,train_score,test_score,train_f1,test_f1
GaussianNB,0.80578,0.805786,0.78105,0.781087
MultinomialNB,0.764104,0.768995,0.666842,0.673767
ComplementNB,0.685298,0.688533,0.708063,0.711931
BernoulliNB,0.728725,0.733616,0.745361,0.750381


# **Income SVM**

note that it takes 3 minutes to train a model, and take about 30 sec to predict

In [34]:
start = time.time()
from sklearn.svm import SVC

SVC_scores = []

SVC_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for kernel in SVC_kernels:
    for c in C_values:
        model = SVC(C=c, kernel=kernel, gamma='auto')
        result_model = model.fit(X_train, y_train) 
        train_score = model.score(X_train,y_train)
        test_score  = model.score(X_test, y_test)
        y_train_pred = result_model.predict(X_train)
        y_test_pred  = result_model.predict(X_test)
        
        # report = classification_report(testY, predY)
        # precision, recall, f1 score
        precision_train, recall_train, f1_train, non = precision_recall_fscore_support(y_train, y_train_pred, average='weighted')
        precision_test , recall_test , f1_test , non = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
        print('----------- '+kernel+" C=" + str(c)+' --------------')
        print("train score: ", train_score)
        print("train f1   : ", f1_train)
        print("test socre:  ", test_score)
        print("test f1   :  ", f1_test)
        
        # save model performence into dict
        SVC_scores.append([kernel,c, train_score, test_score, f1_train, f1_test])


# transform disc to pd dataframe for easy plot later
SVC_scores = pd.DataFrame(SVC_scores, orient='index', columns=['kernel','C','train_score','test_score', 'train_f1','test_f1'])

end = time.time()

print(end-start)

  'precision', 'predicted', average, warn_for)


----------- linear C=0.001 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593
----------- linear C=0.01 --------------
train score:  0.7707994226221554
train f1   :  0.6826610090176343
test socre:   0.7745224494809901
test f1   :   0.6871952393382378
----------- linear C=0.1 --------------
train score:  0.8122293541353153
train f1   :  0.7778947546178684
test socre:   0.8136478103310607
test f1   :   0.7795913943568984
----------- linear C=1 --------------
train score:  0.8158840330456681
train f1   :  0.7824896021330828
test socre:   0.8148148148148148
test f1   :   0.7809046226253772
----------- linear C=10 --------------
train score:  0.8148398390712815
train f1   :  0.7825268524956587
test socre:   0.8133407038879675
test f1   :   0.7804075554146973
----------- linear C=100 --------------
train score:  0.8148398390712815
train f1   :  0.782887059605621
test socre:   0.8133407038879675
te

  'precision', 'predicted', average, warn_for)


----------- poly C=0.001 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593


  'precision', 'predicted', average, warn_for)


----------- poly C=0.01 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593
----------- poly C=0.1 --------------
train score:  0.7624151592395811
train f1   :  0.6628350271190663
test socre:   0.7673361587126097
test f1   :   0.6698375164307044
----------- poly C=1 --------------
train score:  0.8195694235435029
train f1   :  0.7942442409152962
test socre:   0.819667096615687
test f1   :   0.7939955158377233
----------- poly C=10 --------------
train score:  0.8362765271336875
train f1   :  0.8218606433237396
test socre:   0.834653891038634
test f1   :   0.8196006516251994
----------- poly C=100 --------------
train score:  0.8461656583028777
train f1   :  0.8360063901933305
test socre:   0.8477366255144033
test f1   :   0.8374008570351269
----------- poly C=1000 --------------
train score:  0.8503731457879058
train f1   :  0.8427897846279838
test socre:   0.8493335790184878
test f1   :   0.

  'precision', 'predicted', average, warn_for)


----------- rbf C=0.001 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593


  'precision', 'predicted', average, warn_for)


----------- rbf C=0.01 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593
----------- rbf C=0.1 --------------
train score:  0.7939559595835509
train f1   :  0.7375319116275544
test socre:   0.7963884282292243
test f1   :   0.7400539512863883
----------- rbf C=1 --------------
train score:  0.8325297134608888
train f1   :  0.8165184359788591
test socre:   0.8321970394938886
test f1   :   0.8158893807835285
----------- rbf C=10 --------------
train score:  0.8451521759159731
train f1   :  0.8346041677238231
test socre:   0.8455868804127511
test f1   :   0.835160980763411
----------- rbf C=100 --------------
train score:  0.8498817603881945
train f1   :  0.8420033739182267
test socre:   0.8497635280388183
test f1   :   0.8416234697831928
----------- rbf C=1000 --------------
train score:  0.8506495500752433
train f1   :  0.8435760737129481
test socre:   0.8493335790184878
test f1   :   0.84173

  'precision', 'predicted', average, warn_for)


----------- sigmoid C=0.001 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593


  'precision', 'predicted', average, warn_for)


----------- sigmoid C=0.01 --------------
train score:  0.7591904425539756
train f1   :  0.6552674618087769
test socre:   0.7637737239727289
test f1   :   0.6614797504945593
----------- sigmoid C=0.1 --------------
train score:  0.7641042965510887
train f1   :  0.6667882895363649
test socre:   0.7690559547939316
test f1   :   0.6738001288618976
----------- sigmoid C=1 --------------
train score:  0.8076840391879856
train f1   :  0.7769464267913724
test socre:   0.8093483201277563
test f1   :   0.7786984414393847
----------- sigmoid C=10 --------------
train score:  0.7572863241300943
train f1   :  0.7522127287884437
test socre:   0.7529635771758492
test f1   :   0.7487382940275052
----------- sigmoid C=100 --------------
train score:  0.7344983262184822
train f1   :  0.7342136717619496
test socre:   0.7256311037405565
test f1   :   0.7266362146398694
----------- sigmoid C=1000 --------------
train score:  0.7324406498571912
train f1   :  0.732452297221287
test socre:   0.72354277992752

AttributeError: ignored

In [38]:
SVC_scores = pd.DataFrame(SVC_scores, columns=['kernel','C','train_score','test_score', 'train_f1','test_f1'])
SVC_scores

Unnamed: 0,kernel,C,train_score,test_score,train_f1,test_f1
0,linear,0.001,0.75919,0.763774,0.655267,0.66148
1,linear,0.01,0.770799,0.774522,0.682661,0.687195
2,linear,0.1,0.812229,0.813648,0.777895,0.779591
3,linear,1.0,0.815884,0.814815,0.78249,0.780905
4,linear,10.0,0.81484,0.813341,0.782527,0.780408
5,linear,100.0,0.81484,0.813341,0.782887,0.780671
6,linear,1000.0,0.815055,0.813586,0.783234,0.781351
7,poly,0.001,0.75919,0.763774,0.655267,0.66148
8,poly,0.01,0.75919,0.763774,0.655267,0.66148
9,poly,0.1,0.762415,0.767336,0.662835,0.669838


In [0]:
NB_scores.to_csv(url+'income_NB.csv')
SVC_scores.to_csv(url+'income_SVC.csv')