In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']

In [3]:
# Load the data into a pandas dataframe
data = pd.read_csv(url, header=None, names=columns, na_values=' ?', skipinitialspace=True)
print("Dataset Loaded:")
print(data.head())

Dataset Loaded:
   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income  
0          2174             0              40  United-States  <=50K  
1             0           

In [4]:
# Checking for missing values
print("Checking for missing values:")
print(data.isnull().sum())

Checking for missing values:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [5]:
# Dropping rows with missing values
data_cleaned = data.dropna()
print("Missing values after dropping rows:")
print(data_cleaned.isnull().sum())

Missing values after dropping rows:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [6]:
# Converting categorical variables into dummy/indicator variables
data_cleaned = pd.get_dummies(data_cleaned)
print("Dataset after converting categorical variables:")
print(data_cleaned.head())

Dataset after converting categorical variables:
   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0        False                  False                False   
1        False                  False                False   
2        False                  False                False   
3        False                  False                False   
4        False                  False                False   

   workclass_Never-worked  ...  native_country_Scotland  native_country_South  \
0                   False  ...     

In [7]:
# Normalizing numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
data_cleaned[numerical_features] = scaler.fit_transform(data_cleaned[numerical_features])
print("Dataset after normalizing numerical features:")
print(data_cleaned.head())

Dataset after normalizing numerical features:
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0       -0.035429        False                  False                False   
1       -2.222153        False                  False                False   
2       -0.035429        False                  False                False   
3       -0.035429        False                  False                False   
4       -0.035429        False                  False                False   

   workclass_Never-worked  ...  native_country_Scotland  native_country_Sout

In [8]:
# Save and export the cleaned dataset
data_cleaned.to_csv('cleaned_adult_data.csv', index=False)
print("Cleaned dataset saved as 'cleaned_adult_data.csv'")

Cleaned dataset saved as 'cleaned_adult_data.csv'
