# Imports


In [17]:
import warnings
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# SettingWithCopy warnings are hidden as well; consider narrowing it.
warnings.simplefilter("ignore")

In [3]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 2 from the UCI Machine Learning Repository
# (presumably the Adult census-income dataset, going by the variable name).
adult = fetch_ucirepo(id=2)
  

# Wrangle function


In [6]:
def wrangle(source):
    """Clean the raw census frame and return a new, model-ready frame.

    The input frame is never modified, so the cell that calls this function
    can be re-run safely and the raw data stays available.

    Steps, in order:
      1. drop exact duplicate rows;
      2. replace the "?" placeholder with NaN and drop rows with any
         missing value;
      3. drop the columns that are not needed for the analysis;
      4. keep only rows with ``age <= 80``;
      5. remove "." characters from the ``income`` labels
         (e.g. "<=50K." becomes "<=50K");
      6. encode ``income`` as 1 for "<=50K" and 0 for ">50K"
         (any other label becomes NaN).

    Parameters
    ----------
    source : pandas.DataFrame
        Raw data. Must contain ``age``, ``income`` and the columns
        ``fnlwgt``, ``education-num``, ``capital-gain``, ``capital-loss``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The original row index is preserved (not reset).

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``source``.
    """
    unused_columns = ["fnlwgt", "education-num", "capital-gain", "capital-loss"]
    income_map = {"<=50K": 1, ">50K": 0}

    # Every step returns a new frame, so `source` is left untouched.
    df = (
        source
        .drop_duplicates()
        .replace("?", np.nan)
        .dropna()
        .drop(columns=unused_columns)
    )

    # limit age to 80 years; .copy() yields an independent frame so the
    # column assignment below does not trigger SettingWithCopyWarning
    df = df.loc[df["age"] <= 80].copy()

    # standardize the target labels, then turn them into a binary column
    df["income"] = (
        df["income"]
        .str.replace(".", "", regex=False)
        .map(income_map)
    )

    return df

In [7]:
# Clean a copy of the raw frame so the fetched data cached on `adult` stays
# intact and this cell gives the same result every time it is re-run.
df = wrangle(adult.data.original.copy())

In [8]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,1
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,1
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,1
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,1
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,1


# Split


In [10]:
# Separate the label column (y) from the predictor columns (X)
target = "income"

y = df[target]
X = df.drop(columns=[target])

In [12]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Carve a validation set (20%) out of the current training rows.
# NOTE: X_train / y_train are overwritten here, so re-running this cell on its
# own splits the already-reduced training set again; re-run the train/test
# split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Report the shape of every split to confirm the partition sizes
print(f"""
X_train: {X_train.shape}, y_train: {y_train.shape}
X_test: {X_test.shape}, y_test: {y_test.shape}
X_val: {X_val.shape}, y_val: {y_val.shape}
""")


X_train: (28851, 10), y_train: (28851,)
X_test: (9017, 10), y_test: (9017,)
X_val: (7213, 10), y_val: (7213,)



# Baseline

In [22]:
# Baseline accuracy: the share of the most frequent class in y_train, i.e. the
# score obtained by always predicting the majority class
class_shares = y_train.value_counts(normalize=True)
baseline_acc = class_shares.max()
baseline_acc