In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read further down lives under this path.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data and evaluation
## Data loading

In [None]:
# Location of the dataset on the mounted Drive, kept in one place so it is easy to change.
DATA_PATH = '/content/drive/MyDrive/TAI_HW4_FAIRNESS/data.csv'

# Load the raw data into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Column dtypes.
df.dtypes

In [None]:
# Model inputs: every column except the target 'income'.
# 'gender' stays in this list (the evaluation cells read x_test['gender']),
# so the classifier sees the sensitive attribute directly.
features = list(df.columns)
features.remove('income')

## Evaluation metrics

In [None]:
# Codes used in the 'gender' column.
MALE = 1
FEMALE = 0

def zemel_fairness(y_pred, genders):
  """Return the gap in positive rate between the two gender groups.

  Computes P(Y=1 | gender=MALE) - P(Y=1 | gender=FEMALE), where each
  probability is the mean of `y_pred` over the samples of that group
  (so `y_pred` is expected to hold 0/1 values).

  Args:
    y_pred: predictions or labels, one per sample (pandas Series, NumPy
      array, or a plain sequence such as a list).
    genders: gender code per sample, compared against MALE and FEMALE;
      same length/order as `y_pred`.

  Returns:
    The male positive rate minus the female positive rate. 0 means both
    groups receive positives at the same rate. NaN if a group is empty.
  """
  # Plain sequences support neither `== scalar` masks nor boolean indexing,
  # so wrap them in arrays. pandas/NumPy inputs are left untouched so their
  # existing behaviour (including pandas index alignment) is preserved.
  if not hasattr(y_pred, 'dtype'):
    y_pred = np.asarray(y_pred)
  if not hasattr(genders, 'dtype'):
    genders = np.asarray(genders)

  p_pos_male = np.mean(y_pred[genders == MALE])
  p_pos_female = np.mean(y_pred[genders == FEMALE])

  return p_pos_male - p_pos_female

In [None]:
def disparate_impact(y_pred, genders):
  """Return the ratio of positive rates, female over male.

  Computes P(Y=1 | gender=FEMALE) / P(Y=1 | gender=MALE), where each
  probability is the mean of `y_pred` over the samples of that group
  (so `y_pred` is expected to hold 0/1 values).

  Args:
    y_pred: predictions or labels, one per sample (pandas Series, NumPy
      array, or a plain sequence such as a list).
    genders: gender code per sample, compared against MALE and FEMALE;
      same length/order as `y_pred`.

  Returns:
    The female positive rate divided by the male positive rate. 1 means
    both groups receive positives at the same rate. The result is inf or
    NaN when the male positive rate is 0 or a group is empty.
  """
  # Plain sequences support neither `== scalar` masks nor boolean indexing,
  # so wrap them in arrays. pandas/NumPy inputs are left untouched so their
  # existing behaviour (including pandas index alignment) is preserved.
  if not hasattr(y_pred, 'dtype'):
    y_pred = np.asarray(y_pred)
  if not hasattr(genders, 'dtype'):
    genders = np.asarray(genders)

  p_pos_male = np.mean(y_pred[genders == MALE])
  p_pos_female = np.mean(y_pred[genders == FEMALE])

  return p_pos_female / p_pos_male

Testing the metrics on the full dataset, using the ground-truth `income` labels:

In [None]:
# Positive-rate gap (male minus female) of the actual 'income' labels.
zemel_fairness(df['income'], df['gender'])

In [None]:
# Positive-rate ratio (female over male) of the actual 'income' labels.
disparate_impact(df['income'], df['gender'])

# Base model implementation

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded so that the
# reported accuracy and fairness numbers are reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Separate model inputs from the 'income' target for both partitions.
x_train = train[features]
y_train = train['income']

x_test = test[features]
y_test = test['income']

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Baseline classifier: standardise the features, then gradient-boosted trees
# with default hyper-parameters.
clf = make_pipeline(
  StandardScaler(),
  xgb.XGBClassifier(),
)
clf.fit(x_train, y_train)

In [None]:
# Hard class predictions on the held-out rows.
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report returns a multi-line string, hence the explicit print.
print(classification_report(y_test, y_pred))

In [None]:
# Positive-rate gap (male minus female) of the model's test-set predictions.
zemel_fairness(y_pred, x_test['gender'])

In [None]:
# Positive-rate ratio (female over male) of the model's test-set predictions.
disparate_impact(y_pred, x_test['gender'])

# Unbiased model implementation

In [None]:
# Score every row of the original frame with the currently fitted `clf`.
# This includes the rows `clf` was trained on, not only the held-out ones.
prob_income = clf.predict_proba(df[features])  # one column of probabilities per class
pred_income = clf.predict(df[features])        # hard class labels

In [None]:
# Work on a copy so the original frame stays untouched.
dfm = df.copy()

# Rank rows by the probability of the POSITIVE class (income == 1).
# predict_proba returns one column per class in `classes_` order, so with
# 0/1 labels column 1 is P(income == 1).
# The previous np.max(..., axis=1) returned the confidence in whichever class
# was predicted, which reverses the ordering for rows predicted as 0 and makes
# the later ascending/descending sorts on 'prob_income' pick the wrong rows.
dfm['prob_income'] = prob_income[:, 1]
dfm['pred_income'] = pred_income

In [None]:
# Rows with gender == MALE and income == 1, ordered from the lowest to the
# highest 'prob_income'.
is_male_positive = (dfm['gender'] == MALE) & (dfm['income'] == 1)
cp = dfm.loc[is_male_positive].sort_values(by='prob_income')

In [None]:
# Rows with gender == FEMALE and income == 0, ordered from the highest to the
# lowest 'prob_income'.
is_female_negative = (dfm['gender'] == FEMALE) & (dfm['income'] == 0)
cd = dfm.loc[is_female_negative].sort_values(by='prob_income', ascending=False)

In [None]:
# Group sizes and per-group counts of positive labels.
# The positive counts are taken from the actual 'income' labels, not from the
# classifier's 'pred_income'. The number of label swaps derived from these
# counts is meant to equalise the positive rate of the LABELS in the two
# groups, and the swap candidates are themselves selected by 'income'.
is_male = dfm['gender'] == MALE
is_female = dfm['gender'] == FEMALE
has_income = dfm['income'] == 1

n_male = is_male.sum()
n_female = is_female.sum()
n_male_income = (is_male & has_income).sum()
n_female_income = (is_female & has_income).sum()

In [None]:
# Number of label swaps per group that makes both groups' positive rates equal:
#   (n_male_income - N) / n_male == (n_female_income + N) / n_female
# solved for N.
N = (n_female * n_male_income - n_male * n_female_income) / (n_male + n_female)

In [None]:
# Show the (possibly fractional) number of swaps.
N

In [None]:
# Swap the 'income' labels of the first N rows of `cp` with those of the first
# N rows of `cd`.
#
# The previous loop wrote through `cp.iloc[i]['income'] = ...`. `cp.iloc[i]`
# returns a temporary row copy, so that assignment never reached `cp` or `cd`.
# Writing through a single `.iloc[rows, column]` indexer updates the frames.
#
# N is cast to a plain int (round() on a NumPy float is not guaranteed to give
# one) and clamped to [0, number of available candidates], so a negative N or
# one larger than either candidate list cannot raise.
#
# NOTE: this is a swap, so re-running only this cell swaps the labels back;
# rebuild `cp` and `cd` first when re-executing.
n_swaps = max(0, min(int(round(N)), len(cp), len(cd)))

cp_income_pos = cp.columns.get_loc('income')
cd_income_pos = cd.columns.get_loc('income')

# Take copies of both label runs before overwriting either of them.
cp_labels = cp.iloc[:n_swaps, cp_income_pos].to_numpy(copy=True)
cd_labels = cd.iloc[:n_swaps, cd_income_pos].to_numpy(copy=True)

cp.iloc[:n_swaps, cp_income_pos] = cd_labels
cd.iloc[:n_swaps, cd_income_pos] = cp_labels

In [None]:
# `cp` and `cd` only hold the (MALE, income == 1) and (FEMALE, income == 0)
# rows of `dfm`. The remaining rows must be kept as well; otherwise the
# training data contains no male negatives and no female positives, and gender
# alone would determine income.
rest = dfm.drop(index=cp.index.union(cd.index))

# Shuffle with a fixed seed so the resulting frame is reproducible.
new_df = pd.concat([cp, cd, rest]).sample(frac=1, random_state=42)

In [None]:
# Hold out 30% of the relabelled frame for evaluation. The split is seeded so
# that the reported numbers are reproducible across kernel restarts.
train, test = train_test_split(new_df, test_size=0.3, random_state=42)

In [None]:
# Separate model inputs from the 'income' target for both partitions.
# Selecting by `features` leaves out the helper columns added to the frame.
x_train = train[features]
y_train = train['income']

x_test = test[features]
y_test = test['income']

In [None]:
# Same architecture as the baseline, refitted on the new training split.
# This rebinds `clf`, so the earlier model is no longer reachable by name.
clf = make_pipeline(
  StandardScaler(),
  xgb.XGBClassifier(),
)
clf.fit(x_train, y_train)

In [None]:
# Hard class predictions of the refitted model on the held-out rows.
y_pred = clf.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report returns a multi-line string, hence the explicit print.
# y_test comes from `new_df`, i.e. these are the labels of the modified frame.
print(classification_report(y_test, y_pred))

In [None]:
# Positive-rate gap (male minus female) of the refitted model's predictions.
zemel_fairness(y_pred, x_test['gender'])

In [None]:
# Positive-rate ratio (female over male) of the refitted model's predictions.
disparate_impact(y_pred, x_test['gender'])