In [12]:
import pandas as pd

# Replace 'dataset_url' with the URL or file path of the dataset
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation",
                "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
df = pd.read_csv(dataset_url, names=column_names, header=None, na_values=["?"])


In [13]:
# View first few rows
print(df.head())

# Check data types and missing values
print(df.info())

# Summary statistics
print(df.describe(include="all"))


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [14]:
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [15]:
# Drop rows with missing values
df.dropna(inplace=True)

# Alternatively, fill missing values with mode (for categorical data)
df["workclass"].fillna(df["workclass"].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["workclass"].fillna(df["workclass"].mode()[0], inplace=True)


In [16]:
# Check for duplicates
print(df.duplicated().sum())

# Drop duplicates
df.drop_duplicates(inplace=True)


24


In [17]:
df["workclass"] = df["workclass"].str.strip().str.capitalize()


In [18]:
# One-hot encoding
df = pd.get_dummies(df, columns=["workclass", "sex"], drop_first=True)

# Label encoding (for binary classification)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["income"] = le.fit_transform(df["income"])


In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[["age", "capital-gain", "capital-loss", "hours-per-week"]] = scaler.fit_transform(
    df[["age", "capital-gain", "capital-loss", "hours-per-week"]])


In [20]:
df.to_csv("cleaned_dataset.csv", index=False)
