# Baseline Model for Census Income Dataset

### Part 1: Data Exploration

In [31]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, classification_report, recall_score, f1_score

# Load the CSV file into a DataFrame
file_path = os.path.join("..", "data", "income", "adult.data") # Replace with your actual file path
df = pd.read_csv(file_path)

# Define column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

# Assign column names to the DataFrame
df.columns = column_names

# Display the first few rows of the DataFrame
#print(df.head(4))
print("\nNumber of Rows: ", df.shape[0])
print("Number of Features: ", df.shape[1])

# Divide the features into numerical and non-numerical lists
# Extract numerical and string features
num_features = df.select_dtypes(include=['number']).columns.tolist()
cat_features = df.select_dtypes(include=['object', 'string']).columns.tolist()

# Display the feature lists
print("Numerical Features:", num_features)
print("Categorical Features:", cat_features)


Number of Rows:  32560
Number of Features:  15
Numerical Features: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical Features: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [32]:
# Convert income column to binary, flag errors
def convert_income(value):
    value = str(value).strip()
    if value == '>50K':
        return 1
    elif value == '<=50K':
        return 0
    else:
        return np.nan  # Flag invalid values as NaN (or set a custom error flag)

df['income'] = df['income'].apply(convert_income)

# Identify and display rows with errors
error_rows = df[df['income'].isna()]
if not error_rows.empty:
    print("Invalid income values found in ", error_rows.size, "rows: ")
    print(error_rows)


df.head(4)
cat_features.remove('income')

### Part 2: Baseline Model

Define Helper Methods

In [33]:
def eval_performance(y_test, y_pred):
    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Display metrics
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

Train and Test Baseline Model

In [34]:
# Separate features and target variable
X = df[num_features + cat_features]
y = df['income']

# Preprocessing: Standardize numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# Create a pipeline with logistic regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance of baseline model
eval_performance(y_test, y_pred)


Accuracy: 0.8621
Precision: 0.7642
Recall: 0.6180
F1 Score: 0.6834

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      4944
           1       0.76      0.62      0.68      1568

    accuracy                           0.86      6512
   macro avg       0.82      0.78      0.80      6512
weighted avg       0.86      0.86      0.86      6512



### Part 3: Characterizing Fairness

#### Individual Fairness: 

Ensures that similar individuals receive similar predictions. A common way to assess this is by checking prediction consistency across similar instances. We can do this using a Nearest Neighbors Consistency Test.

In [None]:
from sklearn.neighbors import NearestNeighbors
import gower

# ---  HEADS UP!! This worked, but took 8 mins last time ---
# I created a flag to stop it from always running.
# --- RESULTS ----
# Baseline model: 0.8276 with k = 5

ind_fairness_flag = False

if ind_fairness_flag == True:
    # Scale numerical features & One-Hot Encode categorical features
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
    ])

    # Transform training and test features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Compute Gower distance matrix for test samples w.r.t training data
    gower_distances = gower.gower_matrix(X_test, X_train)  # Shape: (num_test_samples, num_train_samples)

    # Find k nearest neighbors (excluding self)
    k = 5  # Adjust as needed
    neighbors = np.argsort(gower_distances, axis=1)[:, 1:k+1]  # Get indices of k nearest neighbors

    # Get model predictions
    y_test_pred = model.predict(X_test)

    # Compute consistency score: Fraction of nearest neighbors with same prediction
    consistencies = []
    for i, neigh_indices in enumerate(neighbors):
        neighbor_preds = y_train.iloc[neigh_indices]  # Get predictions of k neighbors from training labels
        consistency = np.mean(neighbor_preds == y_test_pred[i])  # Fraction with same prediction
        consistencies.append(consistency)

    # Calculate overall consistency score
    individual_fairness_score = np.mean(consistencies)

    print(f'Individual Fairness Consistency Score (with categorical features): {individual_fairness_score:.4f}')

Individual Fairness Consistency Score (with categorical features): 0.8276
