# Week 4

## Dataset Selection

I chose the **Adult** dataset from UCI since it contains a good mix of categorical and continuous features. I'm downloading from an alternate site for two reasons.

1. The UCI site has an invalid certificate at this time.
2. The alternate site provides the CSV file directly instead of a Zip file.

The format of the CSV file from the alternate site differs slightly from that of the UCI file, but the data are identical.

In [None]:
import os
from urllib.request import urlretrieve

# The official UCI archive (https://archive.ics.uci.edu/static/public/2/adult.zip)
# is not used here; the alternate host below serves the CSV file directly.
url = 'https://huggingface.co/datasets/scikit-learn/adult-census-income/resolve/main/adult.csv'

# Store the local copy under the last component of the URL ("adult.csv").
file_name = os.path.basename(url)

# Only download when no local copy exists, so re-running the cell does not
# hit the network again.
if not os.path.exists(file_name):
    # Download under a temporary name and rename on success: an interrupted
    # transfer then never leaves a truncated file that looks like a valid
    # cached copy on the next run.
    partial_name = file_name + ".part"
    urlretrieve(url, partial_name)
    os.replace(partial_name, file_name)

## Data Cleaning & Preprocessing

I handle missing values by imputing categorical features with their most frequent value and continuous features with their median value. I use one-hot encoding for the categorical features since each of them has relatively few distinct categories. I use the standard scaler for the continuous features.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Load dataset.
# The Adult data marks unknown entries with a literal "?" instead of leaving
# the field empty. Without na_values pandas keeps "?" as an ordinary string,
# so the imputers below would find nothing to impute and "?" would be one-hot
# encoded as if it were a real category.
df = pd.read_csv(file_name, na_values="?")

# Separate features and target ("income" is the label column).
X = df.drop("income", axis=1)
y = df["income"]

# Identify categorical and continuous features by column name.
categorical_features = ["workclass","education","marital.status","occupation",
                        "relationship","race","sex","native.country"]
continuous_features = ["age","fnlwgt","education.num","capital.gain",
                       "capital.loss","hours.per.week"]

# Preprocessing for categorical data: fill missing entries with the most
# frequent category, then one-hot encode. handle_unknown="ignore" keeps
# transform from failing on a category that was not present during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Preprocessing for continuous data: fill missing entries with the median,
# then standardize.
continuous_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Combine preprocessing: route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("continuous", continuous_transformer, continuous_features)
    ]
)

# Build full pipeline with a model. Keeping preprocessing inside the pipeline
# means the imputers, encoder and scaler are fitted on the training rows only.
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Train/test split: hold out 20% of the rows; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
clf.fit(X_train, y_train)

# Pipeline.score delegates to the final estimator's score method.
print("Model accuracy:", clf.score(X_test, y_test))

In [None]:
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Variant of the baseline model: native.country is target encoded instead of
# one-hot encoded; all other categorical features are still one-hot encoded.
categorical_features = ["workclass","education","marital.status","occupation",
                        "relationship","race","sex"]
high_cardinality_feature = ["native.country"]

# One-hot branch: impute with the most frequent category, then encode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Target-encoding branch: impute first, then encode against the target.
target_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("target_encoder", TargetEncoder())
])

# continuous_transformer and continuous_features are reused from the
# previous cell.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("target", target_transformer, high_cardinality_feature),
        ("continuous", continuous_transformer, continuous_features)
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# TargetEncoder computes target averages per category, so it needs a numeric
# target; the string income labels cannot be averaged. Encode the labels as
# integers for this pipeline only; y_train / y_test themselves stay untouched.
# LabelEncoder numbers the classes in sorted order and is fitted on the
# training labels only.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fit model.
# NOTE: clf now predicts the integer codes; use
# label_encoder.inverse_transform(...) to map predictions back to the
# original income labels.
clf.fit(X_train, y_train_encoded)

print("Model accuracy:", clf.score(X_test, y_test_encoded))