# Experiment 01: Detecting Algorithmic Bias in a Hiring Dataset (Adult Income Data)

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -- model --
from sklearn.linear_model import LogisticRegression

In [2]:
!pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.12.0


In [3]:
from fairlearn.metrics import MetricFrame, selection_rate

- Loading Dataset

In [4]:
# Path to the CSV file inside the Kaggle input directory.
url = '/kaggle/input/adult-test-csv/adult.csv'
# Read it into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=url)

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# List the column labels to see which fields are available.
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

- Preprocessing Data

In [7]:
# Keep three integer predictors, the gender column, and the income label;
# every other column is dropped from the analysis.
data = data.loc[:, ['age', 'educational-num', 'hours-per-week', 'gender', 'income']]
data

Unnamed: 0,age,educational-num,hours-per-week,gender,income
0,25,7,40,Male,<=50K
1,38,9,50,Male,<=50K
2,28,12,40,Male,>50K
3,44,10,40,Male,>50K
4,18,10,30,Female,<=50K
...,...,...,...,...,...
48837,27,12,38,Female,<=50K
48838,40,9,40,Male,>50K
48839,58,9,40,Female,<=50K
48840,22,9,20,Male,<=50K


In [8]:
# Summarize dtypes and non-null counts of the current frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   educational-num  48842 non-null  int64 
 2   hours-per-week   48842 non-null  int64 
 3   gender           48842 non-null  object
 4   income           48842 non-null  object
dtypes: int64(3), object(2)
memory usage: 1.9+ MB


In [9]:
# One-hot encode the two text columns. drop_first=True keeps a single
# indicator per column, producing gender_Male and income_>50K.
data = pd.get_dummies(data, columns=['gender', 'income'], drop_first=True)
data

Unnamed: 0,age,educational-num,hours-per-week,gender_Male,income_>50K
0,25,7,40,True,False
1,38,9,50,True,False
2,28,12,40,True,True
3,44,10,40,True,True
4,18,10,30,False,False
...,...,...,...,...,...
48837,27,12,38,False,False
48838,40,9,40,True,True
48839,58,9,40,False,False
48840,22,9,20,True,False


In [10]:
# Re-check dtypes and non-null counts after the encoding step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              48842 non-null  int64
 1   educational-num  48842 non-null  int64
 2   hours-per-week   48842 non-null  int64
 3   gender_Male      48842 non-null  bool 
 4   income_>50K      48842 non-null  bool 
dtypes: bool(2), int64(3)
memory usage: 1.2 MB


In [11]:
# Target: the boolean income_>50K indicator. Features: every remaining column.
y = data['income_>50K']
X = data.drop(columns=['income_>50K'])

In [12]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,age,educational-num,hours-per-week,gender_Male
0,25,7,40,True
1,38,9,50,True
2,28,12,40,True
3,44,10,40,True
4,18,10,30,False


In [13]:
# Preview the first values of the target series.
y.head()

0    False
1    False
2     True
3     True
4    False
Name: income_>50K, dtype: bool

In [14]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [15]:
# Logistic-regression classifier; max_iter=1000 sets the solver's iteration limit.
model = LogisticRegression(max_iter=1000)
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [16]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(X_test)
# Display the predicted labels.
y_pred

array([False, False, False, ..., False, False, False])

In [17]:
# Fairness evaluation: compare how often the positive class is predicted
# for each value of the gender_Male indicator in the test split.
sex = X_test['gender_Male']
metric_frame = MetricFrame(
    metrics=selection_rate,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sex,
)
# by_group holds one selection rate per value of the sensitive feature.
print("Selection Rates by Gender:\n", metric_frame.by_group)

Selection Rates by Gender:
 gender_Male
False    0.013193
True     0.200775
Name: selection_rate, dtype: float64


---

In [18]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -- model --
from sklearn.linear_model import LogisticRegression

In [19]:
!pip install fairlearn



In [20]:
from fairlearn.metrics import MetricFrame, selection_rate

In [21]:
# Path to the CSV file inside the Kaggle input directory.
url = '/kaggle/input/mobile-price-classification/train.csv'
# Read it into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=url)

In [22]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [23]:
# List the column labels to see which fields are available.
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [24]:
# Subset of columns used for this analysis.
features = [
    "dual_sim",
    "fc",
    "four_g",
    "int_memory",
    "n_cores",
    "three_g",
]
df = data.loc[:, features]

In [25]:
# Summarize dtypes and non-null counts of the selected columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   dual_sim    2000 non-null   int64
 1   fc          2000 non-null   int64
 2   four_g      2000 non-null   int64
 3   int_memory  2000 non-null   int64
 4   n_cores     2000 non-null   int64
 5   three_g     2000 non-null   int64
dtypes: int64(6)
memory usage: 93.9 KB


In [26]:
# Every column in df is int64 at this point, so pd.get_dummies had nothing to
# encode here and only returned an unchanged copy. Take that copy explicitly
# so later cells work on an independent frame rather than a selection of `data`.
df = df.copy()
df.head()

Unnamed: 0,dual_sim,fc,four_g,int_memory,n_cores,three_g
0,0,1,0,7,2,0
1,1,0,1,53,3,1
2,1,2,1,41,5,1
3,0,0,0,10,6,1
4,0,13,1,44,2,1


In [27]:
# Cast the three 0/1 flag columns to strings so that the following
# pd.get_dummies call one-hot encodes them (it leaves integer columns
# untouched). One vectorized astype replaces three element-by-element Python
# loops and returns a new DataFrame instead of assigning columns in place.
flag_columns = ["dual_sim", "four_g", "three_g"]
df = df.astype({column: str for column in flag_columns})

In [28]:
# One-hot encode the three flag columns. drop_first=True keeps only the "1"
# indicator for each, producing dual_sim_1, four_g_1 and three_g_1.
df = pd.get_dummies(df, columns=["dual_sim", "four_g", "three_g"], drop_first=True)
df.head()

Unnamed: 0,fc,int_memory,n_cores,dual_sim_1,four_g_1,three_g_1
0,1,7,2,False,False,False
1,0,53,3,True,True,True
2,2,41,5,True,True,True
3,0,10,6,False,False,True
4,13,44,2,False,True,True


In [29]:
# Re-check dtypes and non-null counts after the encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   fc          2000 non-null   int64
 1   int_memory  2000 non-null   int64
 2   n_cores     2000 non-null   int64
 3   dual_sim_1  2000 non-null   bool 
 4   four_g_1    2000 non-null   bool 
 5   three_g_1   2000 non-null   bool 
dtypes: bool(3), int64(3)
memory usage: 52.9 KB


In [30]:
# Target: the boolean three_g_1 indicator. Features: every remaining column.
y = df["three_g_1"]
X = df.drop(columns=["three_g_1"])

In [31]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [32]:
# Logistic-regression classifier; max_iter=1000 sets the solver's iteration limit.
model = LogisticRegression(max_iter=1000)
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [33]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(X_test)
# Display the predicted labels.
y_pred

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
       False,  True,  True, False,  True,  True,  True,  True, False,
       False,  True,  True, False, False, False,  True,  True,  True,
        True,  True,

In [34]:
# Fairness evaluation: compare how often the positive class is predicted
# for each value of the dual_sim_1 indicator in the test split.
sim = X_test['dual_sim_1']
metric_frame = MetricFrame(
    metrics=selection_rate,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sim,
)
# by_group holds one selection rate per value of the sensitive feature.
print("Selection Rates by Dual Sim:\n", metric_frame.by_group)

Selection Rates by Dual Sim:
 dual_sim_1
False    0.968153
True     0.695804
Name: selection_rate, dtype: float64


#### Conclusion
The selection rate is the proportion of samples for which the model predicted the positive outcome (in this case, a device having 3G capability). The output shows a substantial difference in selection rate between devices with dual SIM (True) and those without (False): devices without dual SIM have a much higher selection rate (0.968) than devices with dual SIM (0.696). This suggests a potential bias in the model's predictions of 3G support depending on whether a device has dual SIM. Note that selection rate alone does not account for how often devices in each group actually have 3G, so the gap should be compared against the true 3G rates per group before concluding that the model is unfair.