In [13]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import altair as alt
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
data_adult = pd.read_csv("data/adult.data", names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])

data_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# EDA Analysis
## Data Summary

Our dataset contains 15 different columns with numerical and categorical variables

In [7]:
data_adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
data_adult.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [24]:
alt.Chart(data_adult, title='Income for Different Martial Status').mark_bar(opacity=0.75).encode(        
    alt.Y('marital-status').title('Martial Status'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [25]:
alt.Chart(data_adult, title='Income for Different Relationship').mark_bar(opacity=0.75).encode(        
    alt.Y('relationship').title('Relationship'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [26]:
alt.Chart(data_adult, title='Income for Different Occupations').mark_bar(opacity=0.75).encode(        
    alt.Y('occupation').title('Occupation'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [27]:
alt.Chart(data_adult, title='Income for Different Workclass').mark_bar(opacity=0.75).encode(        
    alt.Y('workclass').title('Workclass'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [28]:
alt.Chart(data_adult, title='Income for Different Race').mark_bar(opacity=0.75).encode(        
    alt.Y('race').title('Race'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [29]:
alt.Chart(data_adult, title='Income for Different Sex').mark_bar(opacity=0.75).encode(        
    alt.Y('sex').title('Sex'),             
    alt.X('count()').scale(zero=False).stack(False),
    alt.Color('income')                              
).properties(
    height=200,
    width=300
)

In [3]:
train_df, test_df = train_test_split(data_adult, test_size=0.20, random_state=123)
X_train, y_train = (
    train_df.drop(columns=['income']),
    train_df["income"],
)
X_test, y_test = (
    test_df.drop(columns=['income']),
    test_df["income"],
)

In [4]:
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
20713,55,State-gov,199713,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,15,United-States
13495,65,Private,115890,Bachelors,13,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,20,United-States
12367,29,Private,145592,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Guatemala
22402,53,State-gov,231472,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States
18338,32,Private,107218,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Asian-Pac-Islander,Male,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,50,Federal-gov,251585,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,55,United-States
30311,54,Private,68684,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States
24672,28,Private,293926,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,1740,30,United-States
24229,45,Self-emp-not-inc,176814,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,0,40,United-States


In [10]:
categorical_features = ["marital-status", "relationship", "occupation", "workclass", "race"]
binary_features = ["sex"]
drop_features = ["age", "fnlwgt", "education", "education-num", "capital-gain", "capital-loss", "hours-per-week", "native-country"]

binary_transformer = OneHotEncoder(drop="if_binary", dtype=int)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

preprocessor = make_column_transformer(   
    (binary_transformer, binary_features),    
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

model = KNeighborsClassifier()
pipe = make_pipeline(preprocessor, model)
pipe.fit(X_train, y_train)
test_score = pipe.score(X_test, y_test)

In [13]:
test_score

0.8079226163058498

In [14]:
pipe.predict(X_test)

array([' >50K', ' <=50K', ' <=50K', ..., ' <=50K', ' >50K', ' >50K'],
      dtype=object)