In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [32]:
df = pd.read_csv("adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [34]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education.num,0
marital.status,0
occupation,0
relationship,0
race,0
sex,0


In [35]:
missing_token = ['?', ' ?', 'NA', 'N/A', 'null', 'None', 'unknown', '-',]

for col in  df.select_dtypes(include='object'):
  print(col)
  print(df[col].isin(missing_token).sum())

workclass
1836
education
0
marital.status
0
occupation
1843
relationship
0
race
0
sex
0
native.country
583
income
0


In [36]:
df.replace(missing_token,np.nan,inplace=True)

In [37]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,1836
fnlwgt,0
education,0
education.num,0
marital.status,0
occupation,1843
relationship,0
race,0
sex,0


In [38]:
for col in ["workclass", "occupation", "native.country"]:
    mode_value = df[col].mode()
    if not mode_value.empty:
        df[col] = df[col].fillna(mode_value.iloc[0])


In [39]:
df.isnull().sum()


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education.num,0
marital.status,0
occupation,0
relationship,0
race,0
sex,0


In [43]:
X = df.drop("income", axis=1)
y = df["income"]
le = LabelEncoder()
y = le.fit_transform(y)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [45]:
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns


In [48]:
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [49]:
X_train.head()
X_test.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
10489,-0.772657,Self-emp-not-inc,0.138566,Bachelors,1.132737,Never-married,Tech-support,Not-in-family,White,Male,-0.146613,-0.217898,0.371553,United-States
25652,-1.433042,Private,-0.678059,HS-grad,-0.419226,Never-married,Handlers-cleaners,Own-child,White,Male,-0.146613,-0.217898,-1.981349,United-States
12243,0.621488,Private,0.205797,Bachelors,1.132737,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.146613,-0.217898,0.371553,United-States
25487,1.135121,State-gov,-0.501324,Doctorate,2.296709,Never-married,Exec-managerial,Not-in-family,White,Female,-0.146613,-0.217898,-0.03412,United-States
5091,0.621488,Private,1.372661,HS-grad,-0.419226,Divorced,Adm-clerical,Unmarried,White,Female,-0.146613,-0.217898,-0.03412,United-States


In [50]:
df.to_csv("adult_cleaned.csv", index=False)
