In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron


import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# first try
# use encoder to convert categorical data to numerical data
# then use decision tree to train and predict

# second try
# limit the depth of the tree [9]

# third try
# use ada boost to train and predict: submission 3 (decrease)

# fourth try
# use bagging to train and predict: submission 4 (decrease)

# fifth try
# use random forest to train and predict: submission 5 (decrease)

# sixth try
# use perceptron to train and predict: submission 6 (decrease)



In [None]:
# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']

X_test = test.drop('ID', axis=1)

for column in X_train.columns:
    if '?' in X_train[column].values and X_train[column].dtype == type(object):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_test.columns:
    if '?' in X_test[column].values and X_test[column].dtype == type(object):
        X_test[column] = X_test[column].replace('?', X_test[column].mode()[0])
    elif '?' in X_test[column].values:
        X_test[column] = X_test[column].replace('?', X_test[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        X_test[column] = encoder.fit_transform(X_test[column])


# Create the perceptron classifier
clf = Perceptron(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the train data
y_pred = clf.predict(X_train)

# Calculate the accuracy
print("Train Accuracy:", accuracy_score(y_train, y_pred))

# Make predictions on the test data
y_test_pred = clf.predict(X_test)

# Create a submission file
submission = pd.DataFrame({'ID': test['ID'], 'Prediction': y_test_pred})
submission.to_csv('../data/income2023f/submission_perceptron.csv', index=False)




In [None]:
# DEPTH LIMITING AND HANDLE MISSING DATA

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
# test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']

# X_test = test.drop('ID', axis=1)

# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if '?' in X_train[column].values and( X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        X_test[column] = encoder.fit_transform(X_test[column])

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Create the classifier
dt = DecisionTreeClassifier(max_depth=9, random_state=42)

# Train the classifier
dt.fit(X_train, y_train)

# Make predictions on the train data
y_pred = dt.predict(X_train)

y_test_pred = dt.predict(X_test)

# Calculate the accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))
print("Test Accuracy", accuracy_score(y_test, y_test_pred))

# Make predictions on the test data
# y__test_pred = dt.predict(X_test)

In [58]:
# ADABOOST
import numpy as np

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']
y_train = y_train.replace(0, -1)

X_test_f = test.drop('ID', axis=1)

# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if '?' in X_train[column].values and( X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        # X_test[column] = encoder.fit_transform(X_test[column])
        X_test_f[column] = encoder.fit_transform(X_test_f[column])

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.45, random_state=42)

# Create the classifier
dt = AdaBoostClassifier(n_estimators=500, random_state=42)

# Train the classifier
model = dt.fit(X_train, y_train)


# Make predictions on the train data
y_pred = dt.predict(X_train)

y_test_pred = dt.predict(X_test)

y_test_pred_f = dt.predict(X_test_f)
# replace -1 with 0 in the array
y_test_pred = pd.DataFrame(y_test_pred)
y_test_pred = y_test_pred.replace(-1, 0)

# Calculate the accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))
print("Test Accuracy", accuracy_score(y_test, y_test_pred))

# replace -1 with 0 in the array
# y_test_pred = pd.DataFrame(y_test_pred)
# y_test_pred = y_test_pred.replace(-1, 0)


submission = pd.DataFrame({'ID': test['ID'], 'Prediction': y_test_pred_f})
submission.to_csv('../data/income2023f/adaboost.csv', index=False)





Train Accuracy 0.8761454545454546
Test Accuracy 0.1552888888888889


In [None]:
# SVM

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt


import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/adult.data.csv')
test = pd.read_csv('../data/income2023f/adult.test.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
X_test = test.drop('income>50K', axis=1)
#X_train with only columns 'marital-status', 'education-num', relationship, age
# X_train = X_train[['marital.status', 'education.num', 'relationship', 'age']]

y_train = train['income>50K']
y_train = y_train.replace(' >50K', 1)
y_train = y_train.replace(' <=50K', 0)
# y_train = y_train.replace(0, -1)
y_test = test['income>50K']
y_test = y_test.replace(' >50K.', 1)    
y_test = y_test.replace(' <=50K.', 0)

# X_test_f = test.drop('ID', axis=1)
# X_test_f = X_test_f[['marital.status', 'education.num', 'relationship', 'age']]


# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if '?' in X_train[column].values and( X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        X_test[column] = encoder.fit_transform(X_test[column])
        # X_test_f[column] = encoder.fit_transform(X_test_f[column])

svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

# y_pred = svclassifier.predict(X_test)
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))



In [None]:
# BAGGING

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']

y_train = y_train.replace(0, -1)

X_test_f = test.drop('ID', axis=1)

# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if (X_train[column].isin(['?']).any() and X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif (X_train[column].isin(['?']).any()):
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        X_test[column] = encoder.fit_transform(X_test[column])
        X_test_f[column] = encoder.fit_transform(X_test_f[column])

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.45, random_state=42)

# Create the base estimator
base_estimator = DecisionTreeClassifier()

# Create the bagging classifier
bagging = BaggingClassifier(base_estimator=base_estimator, n_estimators=500, random_state=42)


# Train the classifier
bagging.fit(X_train, y_train)

# Make predictions on the train data
y_pred = bagging.predict(X_train)

# Make predictions on the test data
y_test_pred = bagging.predict(X_test)

y_test_pred_f = bagging.predict(X_test_f)

# Calculate the accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))
print("Test Accuracy", accuracy_score(y_test, y_test_pred))

y_test_pred_f = pd.DataFrame(y_test_pred_f)
y_test_pred_f = y_test_pred_f.replace(-1, 0)

# Create a submission file
submission = pd.DataFrame({'ID': test['ID'], 'Prediction': y_test_pred_f.values.flatten()})
submission.to_csv('../data/income2023f/submission_bagging.csv', index=False)


In [None]:
# RANDOM FOREST

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
# test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']

# X_test = test.drop('ID', axis=1)

# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if '?' in X_train[column].values and( X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        # X_test[column] = encoder.fit_transform(X_test[column])

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.5, random_state=42)


# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800 ,900, 1000],
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the random forest classifier
rf = RandomForestClassifier(random_state=42, )

# Create the randomized search object
random_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=10, cv=5, random_state=42)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Make predictions on the train data
y_pred = random_search.predict(X_train)

# Make predictions on the test data
y_test_pred = random_search.predict(X_test)

# Calculate the accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))
print("Test Accuracy", accuracy_score(y_test, y_test_pred))



In [None]:
# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
y_train = train['income>50K']

X_test = test.drop('ID', axis=1)

for column in X_train.columns:
    if '?' in X_train[column].values and X_train[column].dtype == type(object):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_test.columns:
    if '?' in X_test[column].values and X_test[column].dtype == type(object):
        X_test[column] = X_test[column].replace('?', X_test[column].mode()[0])
    elif '?' in X_test[column].values:
        X_test[column] = X_test[column].replace('?', X_test[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        X_test[column] = encoder.fit_transform(X_test[column])

# Create the bagging classifier
rf = RandomForestClassifier(n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', max_depth=50, random_state=42)

# Train the classifier
rf.fit(X_train, y_train)

# Make predictions on the train data
y_pred = rf.predict(X_train)

# Calculate the accuracy
accuracy_score(y_train, y_pred)

# Make predictions on the test data
y__test_pred = rf.predict(X_test)

#print accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))



# Create a submission file
submission = pd.DataFrame({'ID': test['ID'], 'Prediction': y__test_pred})
submission.to_csv('../data/income2023f/submission5.csv', index=False)





In [None]:
from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO
from six import StringIO
from IPython.display import Image  
import pydotplus

In [None]:


dot_data = StringIO()
export_graphviz(dt, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = X_train.columns, class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('Income>50K.png')
Image(graph.create_png())


In [69]:
# ADABOOST REMOVING FEATURE

# Load the data from the CSV file
train = pd.read_csv('../data/income2023f/train_final.csv')
test = pd.read_csv('../data/income2023f/test_final.csv')

# Separate the features from the labels                         
X_train = train.drop('income>50K', axis=1)
#X_train with only columns 'marital-status', 'education-num', relationship, age
# X_train = X_train[['marital.status', 'education.num', 'relationship', 'age']]

y_train = train['income>50K']
y_train = y_train.replace(0, -1)

X_test_f = test.drop('ID', axis=1)
# X_test_f = X_test_f[['marital.status', 'education.num', 'relationship', 'age']]

X_train = X_train[['age', 'education.num', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week']]
X_test_f = X_test_f[['age', 'education.num', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week']]


# handle missing data
# X_train = X_train.fillna(X_train.mean())
for column in X_train.columns:
    if '?' in X_train[column].values and( X_train[column].dtype == type(object)):
        X_train[column] = X_train[column].replace('?', X_train[column].mode()[0])
    elif '?' in X_train[column].values:
        X_train[column] = X_train[column].replace('?', X_train[column].median())

for column in X_train.columns:
    if X_train[column].dtype == type(object):
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        # X_test[column] = encoder.fit_transform(X_test[column])
        X_test_f[column] = encoder.fit_transform(X_test_f[column])

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.45, random_state=42)

# Create the classifier
dt = AdaBoostClassifier(n_estimators=500, random_state=42)

# Train the classifier
model = dt.fit(X_train, y_train)


# Make predictions on the train data
y_pred = dt.predict(X_train)

y_test_pred = dt.predict(X_test)

y_test_pred_f = dt.predict(X_test_f)

# Calculate the accuracy
print("Train Accuracy", accuracy_score(y_train, y_pred))
print("Test Accuracy", accuracy_score(y_test, y_test_pred))

# replace -1 with 0 in the array
y_test_pred_f = np.where(y_test_pred_f == -1, 0, y_test_pred_f)


submission = pd.DataFrame({'ID': test['ID'], 'Prediction': y_test_pred_f})
submission.to_csv('../data/income2023f/adaboost_1.csv', index=False)





Train Accuracy 0.8591272727272727
Test Accuracy 0.8497777777777777
