## Preprocessing

In [1]:
import pandas as pd

In [2]:
# The file is read without a header row, so the column names are supplied here.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "target",
]
# NOTE(review): no na_values are configured; if the file marks missing entries
# with a placeholder string, a later null count will not see them -- TODO confirm.
data = pd.read_csv("../datasets/adult.data", header=None, names=column_names)
data.shape

(32561, 15)

In [3]:
# Discard five columns that are not used further. `data` is rebound, so running
# this cell a second time raises KeyError (the columns are already gone).
unused_columns = ["fnlwgt", "education", "relationship", "sex", "race"]
data = data.drop(columns=unused_columns)

In [4]:
# Number of missing (NaN/None) entries in each remaining column.
data.isna().sum()

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
target            0
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder

# Columns (including the label) that are replaced by integer codes.
features_to_convert = ["workclass","marital-status","occupation","native-country","target"]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes on every iteration, leaving no way to translate the
# codes of earlier columns back to their original values (inverse_transform)
# or to apply the same mapping to new data.
encoders = {}
for column in features_to_convert:
    enc = LabelEncoder()
    # Values are cast to str before fitting, and the column is overwritten in place.
    data[column] = enc.fit_transform(data[column].astype('str'))
    encoders[column] = enc
# After the loop `enc` is the encoder fitted on "target", as before.
# NOTE(review): this cell is not safe to re-run on its own -- a second run would
# cast the integer codes to strings and re-encode them in string sort order.
data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,7,13,4,1,2174,0,40,39,0
1,50,6,13,2,4,0,0,13,39,0
2,38,4,9,0,6,0,0,40,39,0
3,53,4,7,2,6,0,0,40,39,0
4,28,4,13,2,10,0,0,40,5,0


In [6]:
# Count, for every column of `data`, the values lying more than three standard
# deviations away from the column mean.
# Result: {column name: [number of outliers, number of remaining rows]}.
# The sweep covers every column, so the label-encoded columns and "target"
# are included as well.
outliers = {}
n_rows = data.shape[0]
for column, values in data.items():
    # Mean and std are computed once per column and reused for both thresholds.
    center = values.mean()
    spread = 3 * values.std()
    # Vectorised comparison instead of a Python loop over every value. The
    # inequalities are strict, so a value exactly on a threshold is not counted.
    is_outlier = (values < center - spread) | (values > center + spread)
    # Cast to a plain int so the printed dict shows ordinary integers.
    n_outliers = int(is_outlier.sum())
    outliers[column] = [n_outliers, n_rows - n_outliers]
print(outliers)

{'age': [121, 32440], 'workclass': [0, 32561], 'education-num': [219, 32342], 'marital-status': [0, 32561], 'occupation': [0, 32561], 'capital-gain': [215, 32346], 'capital-loss': [1470, 31091], 'hours-per-week': [440, 32121], 'native-country': [1505, 31056], 'target': [0, 32561]}


## Activity 1

In [7]:
# Separate the label column from the predictors.
Y = data["target"]
X = data.drop(columns="target")

In [8]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed:
#   1. hold out 10% of all rows as the test set;
#   2. hold out 11.11% of the remaining rows as the dev set, which makes the
#      dev set about the same size as the test set.
X_new, X_test, Y_new, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=101
)
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_new, Y_new, test_size=0.1111, random_state=101
)
# Shapes in the order: X train/dev/test, then Y train/dev/test.
print(*(part.shape for part in (X_train, X_dev, X_test, Y_train, Y_dev, Y_test)))

(26048, 9) (3256, 9) (3257, 9) (26048,) (3256,) (3257,)


In [9]:
from sklearn.neural_network import MLPClassifier

# Baseline network: every hyperparameter is left at its default, only the seed is fixed.
# NOTE(review): the features are passed in as-is, without scaling -- confirm
# this is intended for the baseline.
model = MLPClassifier(random_state=101)
model.fit(X=X_train, y=Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=101,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [10]:
from sklearn.metrics import accuracy_score

# Score the current `model` on each partition; the resulting list is ordered
# train, dev, test.
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

accuracy = []
for features, labels in zip(X_sets, Y_sets):
    pred = model.predict(features)
    score = accuracy_score(labels, pred)
    accuracy.append(score)

accuracy

[0.789120085995086, 0.7767199017199017, 0.7958243782622045]

## Performance analysis section

In [11]:
from sklearn.neural_network import MLPClassifier

# Larger network: two hidden layers of 150 units each and an iteration budget
# of 500. This rebinds `model`, so the previously fitted model is replaced.
model = MLPClassifier(
    hidden_layer_sizes=(150, 150),
    max_iter=500,
    random_state=101,
)
model.fit(X=X_train, y=Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(150, 150), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=101,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [12]:
from sklearn.metrics import accuracy_score

# Re-evaluate with whichever `model` is currently bound; the resulting list is
# ordered train, dev, test.
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]

accuracy = []
for features, labels in zip(X_sets, Y_sets):
    pred = model.predict(features)
    score = accuracy_score(labels, pred)
    accuracy.append(score)

accuracy

[0.8187192874692875, 0.8092751842751843, 0.8219220141234265]