In [38]:
%matplotlib inline

In [39]:
#from jupyterthemes.stylefx import set_nb_theme
#set_nb_theme("chesterish")

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, classification_report,make_scorer

### Support Vector Machines

In [41]:
income_data = pd.read_csv('data/adult.data', header=None, sep=", ", engine='python')
income_data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", 
                       "marital-statu", "occupation", "relationship", "race", "sex", 
                       "capital-gain", "capital-loss", "hours-per-week", "native-country", "income_class"]

In [42]:
income_attributes = income_data.drop(columns= ["income_class"])
income_classes = income_data.income_class
income_attributes

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-statu,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [43]:
income_attributes = pd.get_dummies(income_attributes)

In [44]:
scaler = MinMaxScaler()
income_attributes_scaled = scaler.fit_transform(income_attributes)

In [45]:
income_attributes_train, income_attributes_test, income_classes_train, income_classes_test = train_test_split(
income_attributes_scaled, income_classes, stratify = income_classes, test_size = 0.2)

In [46]:
income_attributes_train.shape, income_attributes_test.shape

((26048, 108), (6513, 108))

In [47]:
income_classes_train.shape, income_classes_test.shape

((26048,), (6513,))

In [48]:
linear_svc = LinearSVC(C = 1e6)

In [49]:
linear_svc.fit(income_attributes_train,income_classes_train)



In [50]:
linear_svc.coef_

array([[ 7.16824829e-01,  4.11852978e-01,  1.35994222e+00,
         1.08252963e+01,  9.18155070e-01,  1.39878472e+00,
        -6.37970027e-02, -6.93196700e-02,  3.40196835e-03,
        -1.10768693e-01, -1.39333170e-02,  1.33950365e-01,
        -3.07328384e-02,  7.16231530e-02, -1.02920083e+00,
         2.91846888e-01,  2.84735934e-01,  1.15629530e-01,
         2.73378789e-01, -1.23287511e-01, -2.41096295e-01,
         9.84267187e-02, -3.77944361e-01,  4.00396144e-01,
         2.38978143e-01, -1.21355559e-02, -1.71287425e-02,
         4.78038666e-01, -3.14054506e+00,  5.03547122e-01,
         1.18382722e-01, -3.77446181e-01,  2.21128594e-01,
         4.01004895e-01, -2.21208583e-01, -3.65523969e-01,
        -1.36119056e-01, -6.30612565e-01, -1.74565696e-01,
         2.74300227e-01,  9.46338918e-02,  3.16207767e-01,
        -6.83682960e-03, -3.61846645e-01, -2.55881000e-01,
         5.21298805e-01, -4.15469213e-01, -1.52769650e+00,
         2.75945296e-01,  3.15197618e-02, -1.95095600e-0

In [51]:
linear_grid_search = GridSearchCV(
    LinearSVC(max_iter=1000), 
    param_grid={
        "C": [0.01, 0.1, 1, 10, 100],
        "loss": ["hinge", "squared_hinge"]
    }, 
    scoring = make_scorer(f1_score, pos_label = ">50K")
)

In [52]:
linear_grid_search.fit(income_attributes_train, income_classes_train)



In [53]:
linear_grid_search.best_estimator_

In [54]:
linear_grid_search.best_params_

{'C': 10, 'loss': 'squared_hinge'}

In [55]:
linear_grid_search.cv_results_

{'mean_fit_time': array([0.07721725, 0.07881737, 0.08201804, 0.10582371, 0.13623114,
        0.35888019, 0.45110145, 1.45052505, 1.59315562, 1.77245779]),
 'std_fit_time': array([0.00271353, 0.00132651, 0.00109563, 0.00116654, 0.00172123,
        0.01580773, 0.01849812, 0.11329558, 0.05000491, 0.04746186]),
 'mean_score_time': array([0.04660997, 0.04921122, 0.04661093, 0.04641056, 0.04821053,
        0.04681125, 0.04741063, 0.04681044, 0.04681082, 0.04640217]),
 'std_score_time': array([0.00049025, 0.00203953, 0.00048938, 0.00149691, 0.00239983,
        0.00039978, 0.00101894, 0.00132716, 0.00172105, 0.00048034]),
 'param_C': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_loss': masked_array(data=['hinge', 'squared_hinge', 'hinge', 'squared_hinge',
                    'hinge', 'squared_hinge', 'hing

In [56]:
test_predictions = linear_grid_search.best_estimator_.predict(income_attributes_test)

In [57]:
f1_score(income_classes_test, test_predictions, pos_label = ">50K")

0.6595289079229122

In [58]:
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      4945
        >50K       0.75      0.59      0.66      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.76      0.78      6513
weighted avg       0.85      0.85      0.85      6513



In [59]:
train_predictions = linear_grid_search.best_estimator_.predict(income_attributes_train)

In [60]:
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91     19775
        >50K       0.74      0.59      0.66      6273

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.85      0.85      0.85     26048



In [61]:
svc = SVC(kernel='poly', degree = 2, max_iter=1000)

In [62]:
svc.fit(income_attributes_train,income_classes_train)



In [63]:
svc.decision_function(income_attributes_train[:10])

array([-0.64592379,  0.59706494,  0.30297512,  0.31372567, -0.32426286,
        0.49139523, -0.24339258,  0.23738259,  0.75463843, -0.32074171])

In [64]:
svc.predict(income_attributes_train[:10])

array(['<=50K', '>50K', '>50K', '>50K', '<=50K', '>50K', '<=50K', '>50K',
       '>50K', '<=50K'], dtype=object)

In [65]:
train_predictions = svc.predict(income_attributes_train)
test_predictions = svc.predict(income_attributes_test)

In [32]:
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.90      0.72      0.80      4945
        >50K       0.46      0.76      0.57      1568

    accuracy                           0.73      6513
   macro avg       0.68      0.74      0.69      6513
weighted avg       0.80      0.73      0.75      6513



In [66]:
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.93      0.68      0.79     19775
        >50K       0.46      0.84      0.59      6273

    accuracy                           0.72     26048
   macro avg       0.69      0.76      0.69     26048
weighted avg       0.82      0.72      0.74     26048



In [67]:
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.93      0.69      0.79      4945
        >50K       0.46      0.84      0.59      1568

    accuracy                           0.72      6513
   macro avg       0.69      0.76      0.69      6513
weighted avg       0.82      0.72      0.74      6513



In [68]:
gaussian_svc = SVC(kernel='rbf',gamma  =0,1,  C = 100)

In [71]:
gaussian_svc.fit(income_attributes_train, income_classes_train)
train_predictions = svc.predict(income_attributes_train)
test_predictions = svc.predict(income_attributes_test)

In [72]:
print(classification_report(income_classes_train, train_predictions))

              precision    recall  f1-score   support

       <=50K       0.93      0.68      0.79     19775
        >50K       0.46      0.84      0.59      6273

    accuracy                           0.72     26048
   macro avg       0.69      0.76      0.69     26048
weighted avg       0.82      0.72      0.74     26048



In [73]:
print(classification_report(income_classes_test, test_predictions))

              precision    recall  f1-score   support

       <=50K       0.93      0.69      0.79      4945
        >50K       0.46      0.84      0.59      1568

    accuracy                           0.72      6513
   macro avg       0.69      0.76      0.69      6513
weighted avg       0.82      0.72      0.74      6513



In [76]:
knn = KNeighborsClassifier(n_neighbors=20)

In [78]:
knn.fit(income_attributes_train, income_classes_train)

In [79]:
knn.predict(income_attributes_train[:10])

array(['<=50K', '>50K', '>50K', '>50K', '<=50K', '<=50K', '<=50K', '>50K',
       '>50K', '<=50K'], dtype=object)

In [81]:
f1_score(income_classes_test, knn.predict(income_attributes_test), pos_label = ">50K")

0.6203966005665722

In [108]:
one_class_svm = OneClassSVM(nu = 0.01)

In [109]:
one_class_svm.fit(income_attributes_train) 

In [110]:
train_predictions = one_class_svm.predict(income_attributes_train)

In [111]:
len(train_predictions[train_predictions == -1])

270

In [112]:
len(train_predictions[train_predictions == 1])

25778

In [113]:
test_predictions = one_class_svm.predict(income_attributes_test)

In [114]:
len(test_predictions[test_predictions == -1])

84

In [115]:
len(test_predictions[test_predictions == 1])

6429