### Bias & Fairness in Data: Bias Mitigation Techniques
**Question**: Use the Adult Income dataset and apply the reweighing technique to compute
per-sample weights that balance the class distribution across a sensitive attribute (e.g., gender, stored in the `sex` column).

In [None]:

import pandas as pd
import numpy as np
def compute_reweighing_weights(data, sensitive_attr, target_var):
    """Return per-row reweighing weights that decorrelate a label from a group.

    Every row whose sensitive attribute is ``a`` and whose label is ``y``
    receives the weight

        w(a, y) = P(A=a) * P(Y=y) / P(A=a, Y=y)

    so that, under the weights, the label distribution is identical in every
    group (A and Y become statistically independent).

    Args:
        data: DataFrame containing both columns.
        sensitive_attr: Name of the sensitive-attribute column (e.g. ``'sex'``).
        target_var: Name of the label column (e.g. ``'income'``).

    Returns:
        A float Series named ``'sample_weight'`` that shares ``data``'s index.
        Rows with a missing value in either column keep the neutral weight 1.0.
    """
    # Work on positional labels so duplicate index values in `data` are safe.
    keys = data[[sensitive_attr, target_var]].reset_index(drop=True)
    weights = np.ones(len(keys), dtype=float)

    # All three probabilities are estimated on the same population (rows
    # where both columns are present) so that they stay mutually consistent.
    complete = keys.dropna()
    n_complete = len(complete)
    if n_complete == 0:
        return pd.Series(weights, index=data.index, name='sample_weight')

    p_sensitive = complete[sensitive_attr].value_counts(normalize=True).to_dict()
    p_target = complete[target_var].value_counts(normalize=True).to_dict()

    # One pass over the observed (group, label) cells. An observed cell has
    # a strictly positive joint probability, so the division is always safe.
    cells = complete.groupby([sensitive_attr, target_var], sort=False, observed=True)
    for (s_val, t_val), cell in cells:
        p_joint = len(cell) / n_complete
        weights[cell.index.to_numpy()] = p_target[t_val] * p_sensitive[s_val] / p_joint

    return pd.Series(weights, index=data.index, name='sample_weight')


# In a notebook cell (or when run as a script) __name__ is "__main__", so the
# analysis below runs as before; the guard only keeps the download from firing
# when this code is imported.
if __name__ == "__main__":
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ]
    # skipinitialspace=True strips the blank after each comma, so the
    # missing-value marker reaches the NA check as '?' (not ' ?').
    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        names=column_names,
        na_values='?',
        skipinitialspace=True,
    )
    # The companion test split writes labels with a trailing '.'; normalise
    # them so both splits share one label vocabulary.
    df['income'] = df['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})

    sensitive_attr = 'sex'
    target_var = 'income'

    # Descriptive summaries of the unweighted data (handy for inspection).
    overall_income_dist = df[target_var].value_counts(normalize=True)
    sex_dist = df[sensitive_attr].value_counts(normalize=True)
    joint_dist = df.groupby([sensitive_attr, target_var]).size().unstack(fill_value=0)
    joint_dist_norm = joint_dist / joint_dist.sum().sum()

    df['sample_weight'] = compute_reweighing_weights(df, sensitive_attr, target_var).to_numpy()

    print("OriginalGenderIncomeCrosstab(Counts):")
    print(pd.crosstab(df[sensitive_attr], df[target_var]))
    print("\nOriginalGenderIncomeCrosstab(ProportionswithinGender):")
    print(pd.crosstab(df[sensitive_attr], df[target_var], normalize='index'))

    # Sum of weights per (group, label) cell; after reweighing, every group
    # shows the same label proportions.
    weighted_crosstab = df.groupby([sensitive_attr, target_var])['sample_weight'].sum().unstack(fill_value=0)
    print("\nWeightedGenderIncomeCrosstab(Sums of Weights):")
    print(weighted_crosstab)
    print("\nWeightedGenderIncomeCrosstab(ProportionswithinGenderfromWeights):")
    print(weighted_crosstab.div(weighted_crosstab.sum(axis=1), axis=0))
    print("\nFirst5rowsWithSampleWeights:")
    print(df[['sex', 'income', 'sample_weight']].head())

OriginalGenderIncomeCrosstab(Counts):
income  <=50K  >50K
sex                
Female   9592  1179
Male    15128  6662

OriginalGenderIncomeCrosstab(ProportionswithinGender):
income     <=50K      >50K
sex                       
Female  0.890539  0.109461
Male    0.694263  0.305737

WeightedGenderIncomeCrosstab(Sums of Weights):
income         <=50K         >50K
sex                              
Female   8177.240257  2593.759743
Male    16542.759743  5247.240257

WeightedGenderIncomeCrosstab(ProportionswithinGenderfromWeights):
income    <=50K     >50K
sex                     
Female  0.75919  0.24081
Male    0.75919  0.24081

First5rowsWithSampleWeights:
      sex income  sample_weight
0    Male  <=50K       1.093519
1    Male  <=50K       1.093519
2    Male  <=50K       1.093519
3    Male  <=50K       1.093519
4  Female  <=50K       0.852506
