## Package initialization

Set path to train.csv & test.csv

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder   # non-numerical labels to numerical labels
from sklearn.preprocessing import MultiLabelBinarizer   # one-hot encoding
from sklearn.preprocessing import MinMaxScaler  # normalization

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score        # evaluation metrics
from sklearn.metrics import accuracy_score

import time

# path to files: the labelled training set and the evaluation set, both read
# later with pd.read_csv
trainingDataFile = "./train.csv"
evaluateDataFile = "./test.csv"

# store preprocessing model: fitted encoders / scaler keyed by column name
# (plus the key 'normalization' for the MinMaxScaler). It is filled by the
# fit* helpers on the training data and read back by the transform* helpers.
encoder = {}

## Function initialization
Encoders that convert categorical columns into numerical features

In [3]:
# for training data
def fitTransformByLabelEncoder(trainData, columnName):
    """Fit a LabelEncoder on one training column and encode that column.

    The new encoder is registered in the module-level ``encoder`` dict under
    ``columnName`` so the same mapping can be applied to other data later.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Training frame; ``trainData[columnName]`` is overwritten with the
        integer codes.
    columnName : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with the column encoded.
    """
    labelEncoder = LabelEncoder()
    encoder[columnName] = labelEncoder

    # overwrite the column of the given dataframe with its integer codes
    encodedColumn = labelEncoder.fit_transform(trainData[columnName])
    trainData[columnName] = encodedColumn
    return trainData

def fitTransformByMultiLableBinarizer(trainData, columnName):
    """Fit a MultiLabelBinarizer on one training column and one-hot encode it.

    Each cell is converted to text, split on "/", and every part is
    lower-cased with surrounding and repeated whitespace collapsed. The
    resulting label sets are binarized into one indicator column per label,
    named ``"<columnName>=<label>"``. The fitted binarizer is stored in the
    module-level ``encoder`` dict under ``columnName``.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Training frame containing ``columnName``.
    columnName : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input without ``columnName``, followed by the
        indicator columns (aligned on the input's index).
    """
    def toLabelSet(value):
        # split words by "/", lower-case them and remove duplicated whitespace
        return {" ".join(part.strip().lower().split()) for part in str(value).split("/")}

    binarizer = MultiLabelBinarizer()
    encoder[columnName] = binarizer

    labelSets = trainData[columnName].apply(toLabelSet)
    indicatorValues = binarizer.fit_transform(labelSets)
    indicatorNames = np.char.add(columnName + "=", binarizer.classes_.tolist())

    # build the indicator frame on the original index, then append it to the
    # remaining columns of the input frame
    indicatorFrame = pd.DataFrame(
        indicatorValues,
        columns=indicatorNames,
        index=trainData[columnName].index,
    )
    remainingColumns = trainData.drop(columnName, axis=1)
    return pd.concat([remainingColumns, indicatorFrame], axis=1)



# for testing data
def transformByLabelEncoder(testData, columnName):
    """Encode one column with the LabelEncoder already stored for it.

    Parameters
    ----------
    testData : pandas.DataFrame
        Frame to encode; ``testData[columnName]`` is overwritten with the
        integer codes.
    columnName : str
        Column name, also the key under which the fitted encoder is stored in
        the module-level ``encoder`` dict.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with the column encoded.

    Raises
    ------
    KeyError
        If no encoder has been stored for ``columnName``.
    """
    fittedEncoder = encoder[columnName]

    # overwrite the column of the given dataframe with its integer codes
    encodedColumn = fittedEncoder.transform(testData[columnName])
    testData[columnName] = encodedColumn
    return testData

def transformByMultiLableBinarizer(testData, columnName):
    """One-hot encode one column with the MultiLabelBinarizer stored for it.

    Cells are tokenised as in the fitting step: converted to text, split on
    "/", lower-cased, whitespace collapsed. The indicator columns are named
    ``"<columnName>=<label>"`` from the binarizer's fitted ``classes_``, so
    the output has one column per label known to the fitted binarizer.

    Parameters
    ----------
    testData : pandas.DataFrame
        Frame containing ``columnName``.
    columnName : str
        Column name, also the key of the fitted binarizer in the module-level
        ``encoder`` dict.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input without ``columnName``, followed by the
        indicator columns (aligned on the input's index).

    Raises
    ------
    KeyError
        If no binarizer has been stored for ``columnName``.
    """
    def toLabelSet(value):
        # split words by "/", lower-case them and remove duplicated whitespace
        return {" ".join(part.strip().lower().split()) for part in str(value).split("/")}

    binarizer = encoder[columnName]

    labelSets = testData[columnName].apply(toLabelSet)
    indicatorValues = binarizer.transform(labelSets)
    indicatorNames = np.char.add(columnName + "=", binarizer.classes_.tolist())

    # build the indicator frame on the original index, then append it to the
    # remaining columns of the input frame
    indicatorFrame = pd.DataFrame(
        indicatorValues,
        columns=indicatorNames,
        index=testData[columnName].index,
    )
    remainingColumns = testData.drop(columnName, axis=1)
    return pd.concat([remainingColumns, indicatorFrame], axis=1)

Data cleaning

Preprocessing of the training data and the testing/evaluation data

In [4]:
def dataCleaning(data):
    """Return a cleaned copy of ``data``; the caller's frame is not modified.

    Steps:
      1. drop the columns 'education', 'fnlwgt' and 'relationship';
      2. strip surrounding whitespace from the text columns 'workclass',
         'marital-status', 'occupation', 'sex' and 'native-country';
      3. replace cells equal to "?" with the string "NA";
      4. fill remaining missing values with 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # drop useless column -- without inplace=True, so the input frame keeps
    # its columns and the function can be called again on the same frame
    data = data.drop(columns=['education', 'fnlwgt', 'relationship'])

    # remove extra space of value; the .str accessor passes missing values
    # through (they are filled below) instead of failing on a float NaN
    for columnName in ['workclass', 'marital-status', 'occupation', 'sex', 'native-country']:
        data[columnName] = data[columnName].str.strip()

    # replace ? to NA
    data = data.replace("?", "NA")
    return data.fillna(0)

def preprocessTrainData(trainData):
    """Clean, encode and scale the training features, fitting every encoder.

    Pipeline: ``dataCleaning`` -> label-encode 'sex' -> one-hot expand
    'workclass', 'marital-status', 'occupation' and 'native-country' (in that
    order) -> MinMaxScaler. All fitted objects are stored in the module-level
    ``encoder`` dict; the scaler is stored under 'normalization'.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Raw training features (without the target column).

    Returns
    -------
    The output of ``MinMaxScaler.fit_transform`` on the encoded frame.
    """
    # data cleaning
    features = dataCleaning(trainData)

    # encode non-numerical data to numerical data
    features = fitTransformByLabelEncoder(features, 'sex')
    for multiLabelColumn in ('workclass', 'marital-status', 'occupation', 'native-country'):
        features = fitTransformByMultiLableBinarizer(features, multiLabelColumn)

    # fit the scaler on the fully encoded training features and keep it
    scaler = MinMaxScaler()
    encoder['normalization'] = scaler
    return scaler.fit_transform(features)

def preprocessTestData(testData):
    """Clean, encode and scale features with the already-fitted encoders.

    Applies the same steps as the training pipeline, but only calls the
    ``transform`` variants, reading the fitted objects from the module-level
    ``encoder`` dict. The training pipeline must therefore have run first.

    Parameters
    ----------
    testData : pandas.DataFrame
        Raw features (hold-out or evaluation data, without the target column).

    Returns
    -------
    The output of the stored scaler's ``transform`` on the encoded frame.
    """
    # data cleaning
    features = dataCleaning(testData)

    # encode non-numerical data to numerical data
    features = transformByLabelEncoder(features, 'sex')
    for multiLabelColumn in ('workclass', 'marital-status', 'occupation', 'native-country'):
        features = transformByMultiLableBinarizer(features, multiLabelColumn)

    # reuse the scaler that was fitted on the training features
    scaler = encoder['normalization']
    return scaler.transform(features)


## Global scope of data before modeling
- Import the data
- Split it into train and test sets
- Split each set into training features and target
- Preprocess the train/test/evaluation data

In [5]:
def splitXY(data):
    """Separate the target column 'exceeds50K' from the feature columns.

    Prints the shapes of both parts as a quick sanity check.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column 'exceeds50K'.

    Returns
    -------
    tuple
        ``(features, target)``: a new frame without the target column, and
        the target column itself.
    """
    targetColumn = 'exceeds50K'
    features = data.drop(columns=targetColumn)
    target = data[targetColumn]
    print(features.shape, target.shape)
    return features, target

# load the labelled data and the evaluation data from the configured paths
trainingData = pd.read_csv(trainingDataFile)
evaluatingData = pd.read_csv(evaluateDataFile)


# spliting data: hold out 30% of the labelled rows for testing; the fixed
# random_state keeps the split identical between runs
trainData, testData = train_test_split(trainingData, test_size=0.3, random_state=42)

# split to x y (features / 'exceeds50K' target) for both parts
XTrain, yTrain = splitXY(trainData)
XTest, yTest = splitXY(testData)


# preprocessing: the encoders and the scaler are fitted on XTrain only, then
# re-used unchanged for the hold-out set and the evaluation set
XTrain = preprocessTrainData(XTrain)
print('XTrain', XTrain.shape)
XTest = preprocessTestData(XTest)
print('test', XTest.shape)
# the evaluation frame is passed in as loaded, without going through splitXY
# (presumably it has no 'exceeds50K' column -- confirm against test.csv)
evaluateData = preprocessTestData(evaluatingData)
print('eval', evaluateData.shape)


(17094, 13) (17094,)
(7327, 13) (7327,)
XTrain (17094, 78)
test (7327, 78)
eval (24421, 78)


In [8]:
## Pandas profile report

In [11]:
from pandas_profiling import ProfileReport
# build a profiling report over the full labelled frame as loaded from the
# training file
profile = ProfileReport(trainingData, title="Pandas Profiling Report")
# profile = ProfileReport(trainingData, title="Pandas Profiling Report", explorative=True)
# render the report as notebook widgets; the output recorded for this cell
# ends in a KeyboardInterrupt, i.e. that run was stopped before rendering finished
profile.to_widgets()
# profile.to_notebook_iframe()

Summarize dataset: 100%|██████████| 27/27 [00:28<00:00,  1.07s/it, Completed]
Generate report structure:   0%|          | 0/1 [00:12<?, ?it/s]


KeyboardInterrupt: 

In [10]:
# alternative rendering of the same report as an embedded iframe; needs the
# `profile` object created in the profiling cell
profile.to_notebook_iframe()

## Model training


In [5]:
# leaderboard: one row per trained model, indexed by the model's name
modelEvaluation = pd.DataFrame(columns=['F1 score', 'Accuracy'])
def evaluation(yTest, yPredice):
    """Score predictions against the true labels and print the result.

    Parameters
    ----------
    yTest : array-like
        True labels.
    yPredice : array-like
        Predicted labels. (The parameter keeps its original spelling so that
        existing calls are unaffected.)

    Returns
    -------
    list
        ``[f1, accuracy]``, each rounded to 5 decimals.
    """
    # score the argument that was passed in; previously the body read the
    # notebook-global `yPredict` and silently ignored this parameter
    f1 = round(f1_score(yTest, yPredice), 5)
    acc = round(accuracy_score(yTest, yPredice), 5)
    print(f'F1 score= {f1}  Accuracy=  {acc}')
    return [f1, acc]

# store model trained, keyed by the same name as its leaderboard row
modelTrained = {}

In [6]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings
name = 'GaussianNB'
model = GaussianNB().fit(XTrain, yTrain)

# score on the hold-out split, then record the scores and the fitted model
yPredict = model.predict(XTest)
scores = evaluation(yTest, yPredict)
modelEvaluation.loc[name] = scores
modelTrained[name] = model

F1 score= 0.44507  Accuracy=  0.43674


In [7]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters. random_state is fixed (as in
# the other tree-based cells) so that re-running the notebook reproduces the
# same tree and therefore the same hold-out scores.
model = DecisionTreeClassifier(random_state=0)
model.fit(XTrain, yTrain)

yPredict = model.predict(XTest)

# record the hold-out scores and keep the fitted model under the same key
name = 'DecisionTreeClassifier'
modelEvaluation.loc[name] = evaluation(yTest, yPredict)
modelTrained[name] = model

F1 score= 0.6109  Accuracy=  0.81971


In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Two support-vector classifiers, each behind a StandardScaler: first the
# default kernel with gamma='auto', then a linear kernel. Both go through the
# same fit / predict / record steps, in this order.
svcCandidates = [
    ('make_pipeline(StandardScaler(), SVC(gamma=auto))', SVC(gamma='auto')),
    ('make_pipeline(StandardScaler(), SVC(kernel=linear))', SVC(kernel='linear')),
]

for name, svc in svcCandidates:
    model = make_pipeline(StandardScaler(), svc)
    model.fit(XTrain, yTrain)

    yPredict = model.predict(XTest)

    # record the hold-out scores and keep the fitted pipeline under the same key
    modelEvaluation.loc[name] = evaluation(yTest, yPredict)
    modelTrained[name] = model

F1 score= 0.63522  Accuracy=  0.85014
F1 score= 0.65367  Accuracy=  0.85451


In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression without regularisation.
# NOTE(review): the string 'none' is how older scikit-learn releases spell
# "no penalty"; newer releases reportedly expect penalty=None -- confirm
# against the installed scikit-learn version.
name = 'LogisticRegression(penalty=none)'
model = LogisticRegression(penalty='none').fit(XTrain, yTrain)

# score on the hold-out split, then record the scores and the fitted model
yPredict = model.predict(XTest)
scores = evaluation(yTest, yPredict)
modelEvaluation.loc[name] = scores
modelTrained[name] = model

F1 score= 0.65332  Accuracy=  0.85342


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Two random forests: the defaults (seed 0), then a tuned variant with more
# trees, a minimum leaf size of 2 and balanced class weights. Both go through
# the same fit / predict / record steps, in this order.
forestCandidates = [
    (
        'RandomForestClassifier',
        RandomForestClassifier(random_state=0),
    ),
    (
        'RandomForestClassifier(random_state=30, n_estimators=120, min_samples_leaf=2, class_weight=balanced)',
        RandomForestClassifier(random_state=30, n_estimators=120, min_samples_leaf=2, class_weight='balanced'),
    ),
]

for name, model in forestCandidates:
    model.fit(XTrain, yTrain)

    yPredict = model.predict(XTest)

    # record the hold-out scores and keep the fitted forest under the same key
    modelEvaluation.loc[name] = evaluation(yTest, yPredict)
    modelTrained[name] = model

F1 score= 0.64666  Accuracy=  0.84414
F1 score= 0.7022  Accuracy=  0.83759


In [11]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default hyper-parameters and a fixed seed
name = 'AdaBoostClassifier'
model = AdaBoostClassifier(random_state=0).fit(XTrain, yTrain)

# score on the hold-out split, then record the scores and the fitted model
yPredict = model.predict(XTest)
scores = evaluation(yTest, yPredict)
modelEvaluation.loc[name] = scores
modelTrained[name] = model

F1 score= 0.66579  Accuracy=  0.86202


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters and a fixed seed
name = 'GradientBoostingClassifier'
model = GradientBoostingClassifier(random_state=0).fit(XTrain, yTrain)

# score on the hold-out split, then record the scores and the fitted model
yPredict = model.predict(XTest)
scores = evaluation(yTest, yPredict)
modelEvaluation.loc[name] = scores
modelTrained[name] = model

F1 score= 0.676  Accuracy=  0.86734


In [13]:
from xgboost import XGBClassifier

# Small sweep over the XGBoost step size (eta); everything else stays at the
# library defaults. The candidates are tried in the order 0.2, 0.1, 0.3.
etaCandidates = [
    (0.2, 'XGBClassifier(eta=0.2)'),
    (0.1, 'XGBClassifier(eta=0.1)'),
    (0.3, 'XGBClassifier(eta=0.3)'),
]

for eta, name in etaCandidates:
    model = XGBClassifier(eta=eta)
    model.fit(XTrain, yTrain)
    yPredict = model.predict(XTest)

    # record the hold-out scores and keep the fitted booster under the same key
    modelEvaluation.loc[name] = evaluation(yTest, yPredict)
    modelTrained[name] = model


F1 score= 0.70517  Accuracy=  0.87321
F1 score= 0.70321  Accuracy=  0.87512
F1 score= 0.70481  Accuracy=  0.87184


In [14]:
# Tuned XGBoost variant: deeper trees (max_depth=10) and min_child_weight=3.
# NOTE(review): both eta=0.2 and learning_rate=0.05 are passed; the XGBoost
# parameter docs appear to list these as aliases of the same setting, so it is
# unclear which value takes effect -- confirm and keep only one.
name = 'XGBClassifier(eta=0.2, learning_rate=0.05, min_child_weight=3, max_depth=10)'
xgbParams = dict(eta=0.2, learning_rate=0.05, min_child_weight=3, max_depth=10)
model = XGBClassifier(**xgbParams)
model.fit(XTrain, yTrain)

# score on the hold-out split, then record the scores and the fitted model
yPredict = model.predict(XTest)
scores = evaluation(yTest, yPredict)
modelEvaluation.loc[name] = scores
modelTrained[name] = model

F1 score= 0.69755  Accuracy=  0.87348


In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Fully connected network: one 128-unit ReLU layer, four 64-unit ReLU layers,
# and a 2-unit output layer that emits raw logits (no activation).
hiddenLayers = [layers.Dense(128, activation='relu')]
hiddenLayers += [layers.Dense(64, activation='relu') for _ in range(4)]
outputLayer = layers.Dense(2)
tensor = keras.Sequential(hiddenLayers + [outputLayer])

# from_logits=True matches the activation-free output layer; labels are given
# as integer class ids, hence the sparse categorical loss
tensor.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

# train silently for 150 epochs, holding back 20% of the training rows for
# validation; `history` is the object returned by fit
history = tensor.fit(XTrain, yTrain, validation_split=0.2, epochs=150, verbose=0)

tensor.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               10112     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 30,978
Trainable params: 30,978
Non-trainable params: 0
____________________________________________________

In [18]:
# tensor.predict returns one row of two logits per sample (the last layer is
# Dense(2)); argmax over axis 1 turns each row into a predicted class id
yPredict = np.argmax(tensor.predict(XTest), axis=1)

name = 'Neural Network'
modelEvaluation.loc[name] = evaluation(yTest, yPredict)
# Keep the fitted network itself rather than the History object returned by
# fit(): entries of modelTrained are looked up later to call .predict(), which
# History does not provide. Unlike the scikit-learn estimators stored here,
# this model's predict() yields logits, so consumers need an argmax.
modelTrained[name] = tensor

F1 score= 0.63767  Accuracy=  0.83513


In [1]:
# rank the leaderboard by F1 score, best model first, and display it
modelEvaluation = modelEvaluation.sort_values(by='F1 score', ascending=False)
modelEvaluation

NameError: name 'modelEvaluation' is not defined

## Output file
Select the model with the highest F1 score and use it to predict the evaluation data.

The predictions are written to "{model name}_{f1 score}.csv".

In [21]:
# get model with highest f1 score -- looked up directly from the scores, so
# this cell does not depend on the leaderboard having been sorted beforehand
# (astype(float) guards against the score column being stored as object dtype)
bestModelName = modelEvaluation['F1 score'].astype(float).idxmax()
bestF1Score = modelEvaluation.loc[bestModelName, 'F1 score']
finalModel = modelTrained[bestModelName]

yEvaluate = np.asarray(finalModel.predict(evaluateData))
# a model whose predict() returns one score per class (2-D output) is reduced
# to class ids; 1-D label output is used as is
if yEvaluate.ndim > 1:
    yEvaluate = np.argmax(yEvaluate, axis=1)

# one row per evaluation sample, with ids starting at 1
output = pd.DataFrame(data = yEvaluate.astype(int), columns=['prediction'])
output.index.name = "id"
output.index += 1 
output.to_csv(f'{bestModelName}_{bestF1Score}.csv')
output.shape

(24421, 1)