# Baseline Model for Census Income Dataset

### Part 1: Data Exploration

In [18]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, classification_report, recall_score, f1_score

# Load the CSV file into a DataFrame
file_path = os.path.join("..", "data", "income", "adult.data") # Replace with your actual file path
df = pd.read_csv(file_path)

# Define column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

# Assign column names to the DataFrame
df.columns = column_names

# Divide the features into numerical and non-numerical lists
# Extract numerical and string features
num_features = df.select_dtypes(include=['number']).columns.tolist()
cat_features = df.select_dtypes(include=['object', 'string']).columns.tolist()

In [19]:
# Convert income column to binary, flag errors
def convert_income(value):
    value = str(value).strip()
    if value == '>50K':
        return 1
    elif value == '<=50K':
        return 0
    else:
        return np.nan  # Flag invalid values as NaN (or set a custom error flag)

df['income'] = df['income'].apply(convert_income)

# Encode the privileged/underprivileged
def encode_sex(value):
    value = str(value).strip()
    if value == "Male":
        return 1
    else:
        return 0  

df['sex'] = df['sex'].apply(encode_sex)

# Identify and display rows with errors
error_rows = df[df['income'].isna()]
if not error_rows.empty:
    print("Invalid income values found in ", error_rows.size, "rows: ")
    print(error_rows)


print(df.head(4))
cat_features.remove('income')

   age          workclass  fnlwgt   education  education-num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race  sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White    1   
1             Divorced   Handlers-cleaners   Not-in-family   White    1   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black    1   
3   Married-civ-spouse      Prof-specialty            Wife   Black    0   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0             0             0              13   United-States       0  
1             0             0              40   United-States       0  
2             0             0              40   United-States       0  
3             0

### Part 2: Baseline Model

Define Helper Methods

In [20]:
def eval_performance(y_test, y_pred):
    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Display metrics
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

Train and Test Baseline Model

In [21]:
# # Separate features and target variable
# X = df[num_features + cat_features]
# y = df['income']

# # Preprocessing: Standardize numerical features and one-hot encode categorical features
# preprocessor = ColumnTransformer([
#     #('num', StandardScaler(), num_features),
#     ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
# ])

# # Create a pipeline with logistic regression
# model = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Train the model
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)

# # Evaluate performance of baseline model
# eval_performance(y_test, y_pred)


### Part 3: Characterizing Fairness

#### Individual Fairness: 

Ensures that similar individuals receive similar predictions. A common way to assess this is by checking prediction consistency across similar instances. We can do this using a Nearest Neighbors Consistency Test.

In [22]:
from sklearn.neighbors import NearestNeighbors
import gower

# ---  HEADS UP!! This worked, but took 8 mins last time ---
# I created a flag to stop it from always running.
# --- RESULTS ----
# Baseline model: 0.8276 with k = 5

ind_fairness_flag = False

if ind_fairness_flag == True:
    # Scale numerical features & One-Hot Encode categorical features
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
    ])

    # Transform training and test features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Compute Gower distance matrix for test samples w.r.t training data
    gower_distances = gower.gower_matrix(X_test, X_train)  # Shape: (num_test_samples, num_train_samples)

    # Find k nearest neighbors (excluding self)
    k = 5  # Adjust as needed
    neighbors = np.argsort(gower_distances, axis=1)[:, 1:k+1]  # Get indices of k nearest neighbors

    # Get model predictions
    y_test_pred = model.predict(X_test)

    # Compute consistency score: Fraction of nearest neighbors with same prediction
    consistencies = []
    for i, neigh_indices in enumerate(neighbors):
        neighbor_preds = y_train.iloc[neigh_indices]  # Get predictions of k neighbors from training labels
        consistency = np.mean(neighbor_preds == y_test_pred[i])  # Fraction with same prediction
        consistencies.append(consistency)

    # Calculate overall consistency score
    individual_fairness_score = np.mean(consistencies)

    print(f'Individual Fairness Consistency Score (with categorical features): {individual_fairness_score:.4f}')

## Masaging to Remove Bias

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

def compute_discrimination(df, sensitive_attr, class_attr, privileged_value, positive_class):
    # Compute the discrimination score (difference in positive outcome rates).
    privileged = df[df[sensitive_attr] == privileged_value]
    unprivileged = df[df[sensitive_attr] != privileged_value]

    pos_rate_privileged = sum(privileged[class_attr] == positive_class) / len(privileged)
    pos_rate_unprivileged = sum(unprivileged[class_attr] == positive_class) / len(unprivileged)

    return pos_rate_unprivileged - pos_rate_privileged

def compute_m(df, sensitive_attr, class_attr, privileged_value, positive_class):
    # Compute number of instances M to relabel.
    disc = compute_discrimination(df, sensitive_attr, class_attr, privileged_value, positive_class)
    
    n_privileged = len(df[df[sensitive_attr] == privileged_value])
    n_unprivileged = len(df[df[sensitive_attr] != privileged_value])
    
    return int(abs(disc) * (n_privileged * n_unprivileged) / len(df))

def rank_instances(df, features, sensitive_attr, class_attr):
    # Train a classifier to rank instances by likelihood of being positive.
    X = df[features]
    y = df[class_attr]

    model = DecisionTreeClassifier()
    model.fit(X, y)
    
    scores = model.predict_proba(X)[:, 1]  # Probability of positive class
    df['score'] = scores
    return df

def apply_massaging(df, sensitive_attr, class_attr, privileged_value, positive_class):
    # Perform massaging technique.
    # Step 1: Compute M
    M = compute_m(df, sensitive_attr, class_attr, privileged_value, positive_class)
    print(f"Number of label changes (M): {M}")

    if M == 0:
        print("No massaging needed.")
        return df

    # Step 2: Rank instances
    features = [col for col in df.columns if col not in [sensitive_attr, class_attr]]
    df = rank_instances(df, features, sensitive_attr, class_attr)

    # Step 3: Modify labels
    unprivileged_neg = df[(df[sensitive_attr] != privileged_value) & (df[class_attr] != positive_class)]
    privileged_pos = df[(df[sensitive_attr] == privileged_value) & (df[class_attr] == positive_class)]

    # Promote top M from unprivileged_neg
    df.loc[unprivileged_neg.nlargest(M, 'score').index, class_attr] = positive_class

    # Demote bottom M from privileged_pos
    df.loc[privileged_pos.nsmallest(M, 'score').index, class_attr] = 1 - positive_class

    # Drop the ranking column
    df.drop(columns=['score'], inplace=True)

    return df


In [25]:
print("Before:")
print(df)

# Apply massaging technique
df_massaged = apply_massaging(df, 'sex', 'income', privileged_value=1, positive_class=1)
print("After:")
print(df_massaged)

X = df[num_features + cat_features]
y = df['income']

# Preprocessing: Standardize numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer([
    #('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# Create a pipeline with logistic regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance of baseline model
eval_performance(y_test, y_pred)

Before:
       age          workclass  fnlwgt    education  education-num  \
0       50   Self-emp-not-inc   83311    Bachelors             13   
1       38            Private  215646      HS-grad              9   
2       53            Private  234721         11th              7   
3       28            Private  338409    Bachelors             13   
4       37            Private  284582      Masters             14   
...    ...                ...     ...          ...            ...   
32555   27            Private  257302   Assoc-acdm             12   
32556   40            Private  154374      HS-grad              9   
32557   58            Private  151910      HS-grad              9   
32558   22            Private  201490      HS-grad              9   
32559   52       Self-emp-inc  287927      HS-grad              9   

            marital-status          occupation    relationship    race  sex  \
0       Married-civ-spouse     Exec-managerial         Husband   White    1   
1    

ValueError: could not convert string to float: ' Self-emp-not-inc'