## Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the census income data; the path is relative to the notebook's working directory.
csv_path = "../datasets/census_income_dataset.csv"
data = pd.read_csv(csv_path)

In [3]:
# Remove the columns that are not used as model inputs in this notebook.
unused_columns = ["fnlwgt", "education", "relationship", "sex", "race"]
data = data.drop(columns=unused_columns)

In [4]:
# Number of missing entries in each remaining column.
data.isna().sum()

age                  0
workclass         1836
education-num        0
marital-status       0
occupation        1843
capital-gains        0
capital-loss         0
hours-per-week       0
native-country     583
target               0
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder

# Columns that are turned into integer codes (the label column included).
features_to_convert = ["workclass","marital-status","occupation","native-country","target"]

# One fitted encoder is kept per column so that the integer codes can later be
# mapped back to the original categories (encoders[col].inverse_transform) or
# applied to new raw samples (encoders[col].transform). Refitting a single
# shared encoder would discard every mapping except the last one.
encoders = {}
for col in features_to_convert:
    enc = LabelEncoder()
    # astype('str') turns missing values into the literal string 'nan', so the
    # missing entries of a column end up sharing one category code of their own.
    data[col] = enc.fit_transform(data[col].astype('str'))
    encoders[col] = enc
# After the loop `enc` is the encoder fitted on the last column ("target").
data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,capital-gains,capital-loss,hours-per-week,native-country,target
0,39,6,13,4,0,2174,0,40,38,0
1,50,5,13,2,3,0,0,13,38,0
2,38,3,9,0,5,0,0,40,38,0
3,53,3,7,2,5,0,0,40,38,0
4,28,3,13,2,9,0,0,40,4,0


In [6]:
# For every column, count the values lying more than three standard deviations
# away from the column mean. Each entry maps
#   column name -> [number of outliers, number of remaining rows].
outliers = {}
for column in data.columns:
    values = data[column]
    # Compute the statistics once per column instead of once per bound.
    center = values.mean()
    spread = values.std()
    min_t = center - (3 * spread)
    max_t = center + (3 * spread)
    # Vectorised comparison replaces the per-element Python loop; int() keeps
    # the stored counts as plain Python integers so the printed dict is unchanged.
    count = int(((values < min_t) | (values > max_t)).sum())
    outliers[column] = [count, data.shape[0] - count]
print(outliers)

{'age': [121, 32440], 'workclass': [0, 32561], 'education-num': [219, 32342], 'marital-status': [0, 32561], 'occupation': [0, 32561], 'capital-gains': [215, 32346], 'capital-loss': [1470, 31091], 'hours-per-week': [440, 32121], 'native-country': [1113, 31448], 'target': [0, 32561]}


## Activity 11: Data split and Naive Bayes

In [7]:
# Separate the label column from the feature columns.
Y = data["target"]
X = data.drop(columns=["target"])

In [8]:
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out the test set, then carve the dev set out of
# what is left. Both calls use the same seed.
X_new, X_test, Y_new, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=101
)
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_new, Y_new, test_size=0.1111, random_state=101
)

# Show the shape of every partition on one line.
print(*(part.shape for part in (X_train, X_dev, X_test, Y_train, Y_dev, Y_test)))

(26048, 9) (3256, 9) (3257, 9) (26048,) (3256,) (3257,)


In [9]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training partition.
model_NB = GaussianNB()
model_NB.fit(X=X_train, y=Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [10]:
# Spot-check the Naive Bayes model on one hand-written sample (the values are
# the same as the first row shown by data.head(), without the target).
# Building a one-row DataFrame with the training column names ties every value
# to its feature, fails loudly if the number of values does not match the
# number of features, and avoids the feature-name warning newer scikit-learn
# versions emit when a model fitted on a DataFrame receives a bare list.
sample_row = pd.DataFrame(
    [[39, 6, 13, 4, 0, 2174, 0, 40, 38]],
    columns=X_train.columns,
)
pred_1 = model_NB.predict(sample_row)
print(pred_1)

[0]


## Activity 12: Decision tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters; the seed fixes tie-breaking so
# repeated runs build the same tree.
model_tree = DecisionTreeClassifier(random_state=101)
model_tree.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=101,
            splitter='best')

In [12]:
# Spot-check the decision tree on one hand-written sample (the values are the
# same as the first row shown by data.head(), without the target).
# Building a one-row DataFrame with the training column names ties every value
# to its feature, fails loudly if the number of values does not match the
# number of features, and avoids the feature-name warning newer scikit-learn
# versions emit when a model fitted on a DataFrame receives a bare list.
sample_row = pd.DataFrame(
    [[39, 6, 13, 4, 0, 2174, 0, 40, 38]],
    columns=X_train.columns,
)
pred_2 = model_tree.predict(sample_row)
print(pred_2)

[0]


## Activity 13: Support vector machine

In [13]:
from sklearn.svm import SVC

# Support vector classifier with the default RBF kernel.
# gamma is pinned to "auto" (1 / n_features), which is what the implicit
# default resolved to when this notebook was recorded (the repr shows
# gamma='auto_deprecated'). Passing it explicitly silences the deprecation
# warning and keeps the results reproducible on releases whose default
# changed to "scale".
# NOTE(review): no feature scaling step is visible before this fit; RBF SVMs
# are usually sensitive to feature scale -- confirm this is intentional.
model_svm = SVC(gamma="auto", random_state=101)
model_svm.fit(X_train, Y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=101,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
# Spot-check the SVM on one hand-written sample (the values are the same as
# the first row shown by data.head(), without the target).
# Building a one-row DataFrame with the training column names ties every value
# to its feature, fails loudly if the number of values does not match the
# number of features, and avoids the feature-name warning newer scikit-learn
# versions emit when a model fitted on a DataFrame receives a bare list.
sample_row = pd.DataFrame(
    [[39, 6, 13, 4, 0, 2174, 0, 40, 38]],
    columns=X_train.columns,
)
pred_3 = model_svm.predict(sample_row)
print(pred_3)

[0]


## Error analysis

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score every fitted model on the train, dev and test partitions (in that
# order). The result has the layout
#   metrics[model name][metric name] -> [train score, dev score, test score].
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

# Metric label -> scoring function; the insertion order fixes the key order of
# the inner dictionaries.
scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
}
model_names = ["Naive Bayes", "Decision Tree", "SVM"]
metrics = {name: {score_name: [] for score_name in scorers} for name in model_names}

for X_part, Y_part in zip(X_sets, Y_sets):
    # Predict once per model and partition; the pred_* names stay bound after
    # the loop and then hold the predictions for the last (test) partition.
    pred_NB = model_NB.predict(X_part)
    pred_tree = model_tree.predict(X_part)
    pred_svm = model_svm.predict(X_part)
    predictions = {"Naive Bayes": pred_NB, "Decision Tree": pred_tree, "SVM": pred_svm}

    # A single scoring loop replaces three copy-pasted stanzas.
    for name, pred in predictions.items():
        for score_name, scorer in scorers.items():
            metrics[name][score_name].append(scorer(Y_part, pred))

metrics

{'Naive Bayes': {'Accuracy': [0.7970669533169533,
   0.7905405405405406,
   0.8084126496776174],
  'Precision': [0.6683725690890481, 0.6816901408450704, 0.6873239436619718],
  'Recall': [0.3122907699665232, 0.29839704069050554, 0.32232496697490093]},
 'Decision Tree': {'Accuracy': [0.9723587223587223,
   0.812039312039312,
   0.8228431071538226],
  'Precision': [0.9827856025039123, 0.6307490144546649, 0.6212938005390836],
  'Recall': [0.9010043041606887, 0.591861898890259, 0.6089828269484808]},
 'SVM': {'Accuracy': [0.9119702088452089,
   0.8015970515970516,
   0.81486030089039],
  'Precision': [0.8976818545163869, 0.6987951807228916, 0.6964285714285714],
  'Recall': [0.7160848079069027, 0.35758323057953145, 0.3606340819022457]}}