In [1]:
#import required libraries
import numpy as np
import pandas as pd

In [2]:
# Step 1: read the 'adult' CSV from the working directory.
# Every '?' cell is parsed as a missing value (NaN) at load time.
data = pd.read_csv(
    filepath_or_buffer='adult.csv',
    na_values=['?'],
)

In [3]:
# Peek at the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
# Handling missing values
# The 'adult' dataset uses '?' to represent missing values. read_csv was
# already called with na_values='?', so this pass is only a safety net for any
# literal '?' that survived parsing.
# np.nan is used rather than pd.NA: the SimpleImputer objects created later
# keep their default missing_values=np.nan, and pd.NA inside object columns is
# not reliably recognised as missing by them.
# The result is reassigned instead of mutating the frame with inplace=True.
data = data.replace("?", np.nan)

In [5]:
# Step 2: Handle the missing values by imputation
from sklearn.impute import SimpleImputer

In [6]:
# Two imputers, one per kind of column:
#   - categorical gaps are filled with the most frequent value of the column
#   - numerical gaps are filled with the column mean
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

In [7]:
# Partition the column labels by dtype so each group can be handed to its own imputer.
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=['number']).columns

In [8]:
# Fill the gaps group by group: each imputer learns its fill value from the
# selected columns and the completed values are written back into `data`.
# The numerical columns come back as floats (e.g. age 25 -> 25.0).
# NOTE(review): both imputers are fitted on the full dataset, before the
# train/test split performed further down, so rows that end up in the test set
# contribute to the fill values. Consider fitting on the training rows only.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    data[cols] = imputer.fit_transform(data[cols])

In [9]:
# Show the first five rows again to check that the gaps have been filled.
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,Private,103497.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [10]:
# Step 3: Encode the categorical variables
# Each categorical column is expanded into indicator (one-hot) columns;
# drop_first=True omits the first level of every column.
# The `income` label is encoded along with the features and becomes `income_>50K`.
data = pd.get_dummies(data=data, drop_first=True)

In [11]:
# Inspect the first five rows of the one-hot encoded frame.
data.head(n=5)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_>50K
0,25.0,226802.0,7.0,0.0,0.0,40.0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,38.0,89814.0,9.0,0.0,0.0,50.0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,28.0,336951.0,12.0,0.0,0.0,40.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,44.0,160323.0,10.0,7688.0,0.0,40.0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
4,18.0,103497.0,10.0,0.0,0.0,30.0,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False


In [12]:
# Step 4: Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split

In [13]:
# Separate the label produced by the one-hot step from the predictors.
y = data['income_>50K']  # Target variable: True where income is >50K
X = data.drop(columns=['income_>50K'])  # Features: every remaining column

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Step 5: Scaling features (if needed)
# Depending on the algorithm you're using, scaling may or may not be necessary.
# For many algorithms, it's beneficial to scale features to a similar range.
from sklearn.preprocessing import StandardScaler

In [16]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits so the test set is scaled with
# training-set statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)