In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Column names, in file order, handed to the CSV reader as `names=` when
# adult.data / adult.test are loaded.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]


In [None]:
# Parser settings shared by both files: explicit column names, comma
# delimiter, whitespace after the delimiter stripped, and "?" read as missing.
csv_options = dict(names=columns, sep=",", skipinitialspace=True, na_values="?")

train_df = pd.read_csv("/content/adult.data", **csv_options)
# The first line of the test file is skipped before parsing.
test_df = pd.read_csv("/content/adult.test", skiprows=1, **csv_options)


In [None]:
# Discard every row that has at least one missing value, in both splits.
train_df = train_df.dropna()
test_df = test_df.dropna()

In [None]:
# Binarise the label: 1 when the raw string contains ">50K", otherwise 0.
# The match is a literal substring test, so extra characters around the label
# do not prevent a match.
train_df["income"] = train_df["income"].str.contains(">50K", regex=False).astype("int64")
test_df["income"] = test_df["income"].str.contains(">50K", regex=False).astype("int64")

In [None]:
# Keep the "sex" column of each split on its own: it is removed from the model
# inputs, but the fairness report needs it to break predictions down by group.
sex_train, sex_test = train_df["sex"], test_df["sex"]

In [None]:
# Columns removed from the frames before building the feature matrices.
features_to_drop = [
    "income",        # the prediction target
    "sex",           # the attribute the fairness report groups by
    "relationship",  # NOTE(review): presumably dropped as a proxy for sex - confirm
]

In [None]:
# Labels for each split (already binarised to 0/1).
y_train = train_df["income"]
y_test = test_df["income"]

# 5. Preprocessing (One-Hot Encoding + Scaling)
# The training frame alone decides which dummy columns exist and which level
# of each categorical column is the dropped baseline.
X_train = pd.get_dummies(train_df.drop(columns=features_to_drop), drop_first=True)

# The test frame is encoded with every level kept and then projected onto the
# training columns. Encoding it separately with drop_first=True would choose
# the baseline from the test data alone; whenever that differs from the
# training baseline, an indicator the model relies on is dropped and later
# refilled with 0, silently mis-encoding those rows as the baseline level.
X_test = pd.get_dummies(test_df.drop(columns=features_to_drop)).reindex(
    columns=X_train.columns, fill_value=0
)

In [None]:
# Give the test matrix exactly the training columns, in the same order; any
# column the test encoding lacks is created and filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scale data (Helpful for Logistic Regression)
# The scaler is fitted on the training matrix only and then applied to both
# splits, so no test-set statistics enter the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def print_fairness_report(model_name, y_pred, sensitive_col):
    """Print positive-prediction rates by sex and two group-fairness metrics.

    Rows whose group label is "Male" or "Female" are compared; rows with any
    other label are ignored.

    Args:
        model_name: Text shown in the report header.
        y_pred: Predicted labels, one per row (array-like or Series). The
            positive rate is the mean of these values within each group.
        sensitive_col: Group label for each row, in the same order and of the
            same length as ``y_pred`` (array-like or Series).

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError: If ``y_pred`` and ``sensitive_col`` differ in length.
    """
    # Pair the two inputs by position. Building the frame straight from two
    # pandas objects would align them on index labels instead, which yields
    # NaN rates when the indexes differ (e.g. a filtered frame's index versus
    # a fresh RangeIndex).
    groups = np.asarray(sensitive_col)
    preds = np.asarray(y_pred)
    if len(groups) != len(preds):
        raise ValueError(
            f"y_pred has {len(preds)} rows but sensitive_col has {len(groups)}"
        )

    res = pd.DataFrame({"sex": groups, "pred": preds})

    # Calculate Positive Rates (NaN when a group has no rows)
    male_pos_rate = res.loc[res["sex"] == "Male", "pred"].mean()
    female_pos_rate = res.loc[res["sex"] == "Female", "pred"].mean()

    # Calculate Metrics
    # The ratio is undefined when the male rate is 0 or NaN; report NaN rather
    # than dividing by zero.
    if male_pos_rate > 0:
        disparate_impact = female_pos_rate / male_pos_rate
    else:
        disparate_impact = float("nan")
    parity_diff = male_pos_rate - female_pos_rate

    print(f"\n=== {model_name} RESULTS ===")
    print(f"Male Positive Prediction Rate:   {male_pos_rate:.1%}")
    print(f"Female Positive Prediction Rate: {female_pos_rate:.1%}")
    print("--------------------------------")
    print(f"Disparate Impact (Target ~1.0):  {disparate_impact:.3f}")
    print(f"Parity Difference (Target ~0.0): {parity_diff:.3f}")

In [None]:
print("Training Normal Model...")
# Baseline: logistic regression fitted on the scaled training features with
# every row weighted equally.
model_normal = LogisticRegression(random_state=42, solver="liblinear").fit(
    X_train_scaled, y_train
)

# Score the held-out split, then report accuracy and the by-sex fairness metrics.
y_pred_normal = model_normal.predict(X_test_scaled)
normal_accuracy = accuracy_score(y_test, y_pred_normal)

print(f"Normal Accuracy: {normal_accuracy:.4f}")
print_fairness_report("NORMAL (Baseline)", y_pred_normal, sex_test)

Training Normal Model...
Normal Accuracy: 0.8475

=== NORMAL (Baseline) RESULTS ===
Male Positive Prediction Rate:   26.5%
Female Positive Prediction Rate: 6.7%
--------------------------------
Disparate Impact (Target ~1.0):  0.255
Parity Difference (Target ~0.0): 0.197


# LAB-2 Mitigation


In [None]:
print("\nCalculating Weights for Mitigation...")


def calculate_gender_weights(df, sex_col, target_col):
    """Compute one reweighing weight per row from group and label frequencies.

    Each row receives

        W = P(Sex) * P(Income) / P(Sex & Income)

    estimated from counts in ``df`` as
    ``(count(sex) * count(target) / n_rows) / count(sex, target)``.
    Combinations that occur less often than independence of the two columns
    would predict get a weight above 1, and over-represented ones a weight
    below 1.

    Args:
        df: Frame holding both columns. It is only read; no column is added.
        sex_col: Name of the group column.
        target_col: Name of the label column.

    Returns:
        A float ``numpy.ndarray`` with one weight per row, in row order.
        An empty frame yields an empty array.

    Raises:
        ValueError: If either column contains missing values, since such rows
            belong to no countable group.
    """
    count_total = len(df)
    if count_total == 0:
        return np.array([], dtype=float)

    if df[sex_col].isna().any() or df[target_col].isna().any():
        raise ValueError(
            f"columns {sex_col!r} and {target_col!r} must not contain missing values"
        )

    # Per-row counts of the row's own group, label and (group, label) pair.
    # transform("size") broadcasts each group's size back onto its rows, which
    # replaces the row-by-row Python loop and the string-concatenated key
    # column (whose "a_b" keys could collide and which iterrows could format
    # differently after upcasting numeric rows to float).
    count_sex = df.groupby(sex_col)[sex_col].transform("size").to_numpy()
    count_target = df.groupby(target_col)[target_col].transform("size").to_numpy()
    count_joint = (
        df.groupby([sex_col, target_col])[target_col].transform("size").to_numpy()
    )

    # P(Sex) * P(Target) / P(Joint), with the shared 1/n factors simplified:
    # (Count(Sex) * Count(Target) / Total) / Count(Joint)
    return (count_sex * count_target / count_total) / count_joint


Calculating Weights for Mitigation...


In [None]:
# One weight per training row, computed on a copy so train_df itself is never
# modified by the weight calculation.
instance_weights = calculate_gender_weights(
    df=train_df.copy(),
    sex_col="sex",
    target_col="income",
)

In [None]:
print("Training Mitigated Model...")
# Same estimator settings as the baseline; the only difference is that each
# training row contributes according to its instance weight.
# FIT WITH WEIGHTS
model_mitigated = LogisticRegression(random_state=42, solver="liblinear").fit(
    X_train_scaled, y_train, sample_weight=instance_weights
)

# Score the held-out split, then report accuracy and the by-sex fairness metrics.
y_pred_mitigated = model_mitigated.predict(X_test_scaled)
mitigated_accuracy = accuracy_score(y_test, y_pred_mitigated)

print(f"Mitigated Accuracy: {mitigated_accuracy:.4f}")
print_fairness_report("MITIGATED (Reweighed)", y_pred_mitigated, sex_test)

Training Mitigated Model...
Mitigated Accuracy: 0.8484

=== MITIGATED (Reweighed) RESULTS ===
Male Positive Prediction Rate:   25.1%
Female Positive Prediction Rate: 7.4%
--------------------------------
Disparate Impact (Target ~1.0):  0.294
Parity Difference (Target ~0.0): 0.177
