In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

1. Loading the dataset
2. Exploring the data
3. Handling missing values
4. Removing duplicates
5. Converting data types and trimming whitespace
6. Encoding categorical variables
7. scaling numerical features

In [25]:
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

In [26]:
data = pd.read_csv(
    "adult.data",
    header=None,
    names=column_names
)

In [27]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [28]:
# Handling Missing values
# 1. Remove rows with missing values
data_clean = data.dropna()
data_clean.shape

(32561, 15)

In [29]:
# Alternative: impute missing values
# categorical: filling missing categorical values with mode
# numerical: filling missing numerical values with mean or median
data_clean.fillna({'native-country': data_clean['native-country'].mode()[0]}, inplace=True)

In [30]:
# Remove Duplication (if any)
data_clean = data_clean.drop_duplicates()

In [31]:
for col in data_clean.select_dtypes(include=['object']).columns:
    data_clean[col] = data_clean[col].str.strip().astype('category')

In [19]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32537 non-null  int64   
 1   workclass       32537 non-null  category
 2   fnlwgt          32537 non-null  int64   
 3   education       32537 non-null  category
 4   education-num   32537 non-null  int64   
 5   marital-status  32537 non-null  category
 6   occupation      32537 non-null  category
 7   relationship    32537 non-null  category
 8   race            32537 non-null  category
 9   sex             32537 non-null  category
 10  capital-gain    32537 non-null  int64   
 11  capital-loss    32537 non-null  int64   
 12  hours-per-week  32537 non-null  int64   
 13  native-country  32537 non-null  category
 14  income          32537 non-null  category
dtypes: category(9), int64(6)
memory usage: 2.0 MB


In [32]:
data_clean['income'] = data_clean['income'].apply(lambda x: 1 if x == '>50K' else 0)
data_clean.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [33]:
pd.get_dummies(data_clean, columns=['workclass'], drop_first=True)

Unnamed: 0,age,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,native-country,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,39,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,...,United-States,0,False,False,False,False,False,False,True,False
1,50,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,...,United-States,0,False,False,False,False,False,True,False,False
2,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,...,United-States,0,False,False,False,True,False,False,False,False
3,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,...,United-States,0,False,False,False,True,False,False,False,False
4,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,...,Cuba,0,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,...,United-States,0,False,False,False,True,False,False,False,False
32557,40,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,...,United-States,1,False,False,False,True,False,False,False,False
32558,58,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,...,United-States,0,False,False,False,True,False,False,False,False
32559,22,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,...,United-States,0,False,False,False,True,False,False,False,False


In [35]:
scaler = StandardScaler()
numerical_features = ['age', 'capital-gain']
scaler.fit(data_clean[numerical_features])
data_clean[numerical_features] = scaler.transform(data_clean[numerical_features])
data_clean.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.03039,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,0.148292,0,40,United-States,0
1,0.836973,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.145975,0,13,United-States,0
2,-0.042936,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.145975,0,40,United-States,0
3,1.05695,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.145975,0,40,United-States,0
4,-0.776193,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.145975,0,40,Cuba,0
