# Decision Tree Modeling and Fairness Analysis on Recruitment Data

## Import libraries

In [37]:
import pandas as pd # To manipulate the csv data
import numpy as np  # for mathematical calculation

### Observing the dataset

In [38]:
# Load the raw recruitment data; the path is relative to the notebook's working directory.
df = pd.read_csv("recruitment.csv")
# Show the first rows as a quick sanity check of the columns that were read.
df.head()

Unnamed: 0,mainid(hidden),mod15(hidden),div15(hidden),"Age(feature, sensitive))",Speed(hidden),"Gender(feature,sensitive)",Strength(hidden),Speedtest(hidden),Lifttest(hidden),testresult(feature),...,Suitability(target),Value-when-hired(hidden),Candidate-loss-when-nothired(hidden),Should-hire(target),Shouldnothire,hired-by-expert,A1(testresult),"A2(testresult,30under)","A3(Age,Gender, test)",A4(postive-dicr)
0,225000,0,0,22,2,male,6,1.0,1.0,2.0,...,4,10000,5000,1,0,1,1,1,1,1
1,225001,1,0,26,2,male,6,0.5,1.0,1.5,...,4,10000,5000,1,0,1,1,1,1,0
2,225002,2,0,28,1,male,6,1.0,1.0,2.0,...,3,5000,5000,1,0,1,1,1,1,1
3,225003,3,0,28,1,male,6,0.5,1.0,1.5,...,3,5000,5000,1,0,1,1,1,1,0
4,225004,4,0,32,2,male,6,1.0,1.0,2.0,...,4,10000,5000,1,0,1,1,1,1,1


Deciding the target/label...

In [39]:
# Name of the target column; it is passed to id3() and evaluate() in later cells.
label = "Should-hire(target)"

Creating a backup copy of the dataframe

In [40]:
# Keep an untouched copy of the full frame, because `df` is narrowed to a column subset in a later cell.
backup = df.copy()
backup.head()

Unnamed: 0,mainid(hidden),mod15(hidden),div15(hidden),"Age(feature, sensitive))",Speed(hidden),"Gender(feature,sensitive)",Strength(hidden),Speedtest(hidden),Lifttest(hidden),testresult(feature),...,Suitability(target),Value-when-hired(hidden),Candidate-loss-when-nothired(hidden),Should-hire(target),Shouldnothire,hired-by-expert,A1(testresult),"A2(testresult,30under)","A3(Age,Gender, test)",A4(postive-dicr)
0,225000,0,0,22,2,male,6,1.0,1.0,2.0,...,4,10000,5000,1,0,1,1,1,1,1
1,225001,1,0,26,2,male,6,0.5,1.0,1.5,...,4,10000,5000,1,0,1,1,1,1,0
2,225002,2,0,28,1,male,6,1.0,1.0,2.0,...,3,5000,5000,1,0,1,1,1,1,1
3,225003,3,0,28,1,male,6,0.5,1.0,1.5,...,3,5000,5000,1,0,1,1,1,1,0
4,225004,4,0,32,2,male,6,1.0,1.0,2.0,...,4,10000,5000,1,0,1,1,1,1,1


Manipulating the 'df' dataset so that it has only necessary columns

In [41]:
# Columns kept for modelling: seven candidate features plus the target (last entry).
# NOTE(review): four of these are tagged "(hidden)" in the source data, yet they are fed to the
# tree as ordinary features — confirm this is intended, since it may explain the perfect test scores.
USE_COLS = [
    "Age(feature, sensitive))",
    "Gender(feature,sensitive)",
    "Speed(hidden)",
    "Strength(hidden)",
    "Speedtest(hidden)",
    "Lifttest(hidden)",
    "testresult(feature)",
    "Should-hire(target)"
]

# Narrow df to the selected columns (the full frame is still available as `backup`).
df = df[USE_COLS]
df.head()

Unnamed: 0,"Age(feature, sensitive))","Gender(feature,sensitive)",Speed(hidden),Strength(hidden),Speedtest(hidden),Lifttest(hidden),testresult(feature),Should-hire(target)
0,22,male,2,6,1.0,1.0,2.0,1
1,26,male,2,6,0.5,1.0,1.5,1
2,28,male,1,6,1.0,1.0,2.0,1
3,28,male,1,6,0.5,1.0,1.5,1
4,32,male,2,6,1.0,1.0,2.0,1


Checking if null values exist...

In [42]:
# Count missing values per column; every count should be 0 before training.
df.isnull().sum()

Age(feature, sensitive))     0
Gender(feature,sensitive)    0
Speed(hidden)                0
Strength(hidden)             0
Speedtest(hidden)            0
Lifttest(hidden)             0
testresult(feature)          0
Should-hire(target)          0
dtype: int64

## Train / Test split

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
# The target column stays inside both frames because the ID3 code below reads it by name.
train_df, test_df = train_test_split(df,test_size=0.25,random_state=0)

**Why do we split the entire dataframe here instead of separately splitting features and target like in the logistic regression model?**

- Logistic regression is **feature-matrix based**, so splitting X and y is natural and safer.
- For ID3, the entire dataframe is split because the algorithm operates on a **tabular representation** where the target is treated as a column and splits are chosen based on information gain across attributes. Keeping the dataset intact preserves row structure and aligns with the conceptual and textbook formulation of ID3, especially in from-scratch implementations.


## Creating the Scratch Model

## (1) Calculate entropy of whole dataset

In [44]:
def calc_total_entropy(train_data,label,class_list):
    """Return the base-2 Shannon entropy of the label column of ``train_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows whose label distribution is measured.
    label : str
        Name of the label column.
    class_list : iterable
        Classes to account for. Classes that do not occur in ``train_data``
        contribute nothing to the entropy.

    Returns
    -------
    float
        Entropy in bits; 0 for an empty frame or a frame with a single class.
    """
    total_row = train_data.shape[0] # total size of dataset
    if total_row == 0:
        # No rows means no uncertainty; this also avoids dividing by zero below.
        return 0.0

    total_entropy = 0
    for c in class_list: # for each class in the label
        total_class_count = train_data[train_data[label] == c].shape[0] # number of rows of class c
        if total_class_count == 0:
            # By convention 0 * log2(0) is 0. Evaluating it literally gives NaN,
            # which would poison every information-gain comparison downstream.
            continue
        class_prob = total_class_count/total_row
        total_entropy += -(class_prob*np.log2(class_prob)) # adding the class entropy to total entropy

    return total_entropy

- train_data: a pandas dataframe
- label: string, name of the label of the dataframe 
- class_list: list, unique classes of the label
- returns: float, calculated entropy of the whole dataframe 

## (2) Calculate entropy for the filtered class

In [45]:
def calc_entropy(feature_value_data,label,class_list):
    """Return the base-2 entropy of the labels in an already-filtered frame.

    Parameters
    ----------
    feature_value_data : pandas.DataFrame
        Rows that share one value of some feature.
    label : str
        Name of the label column.
    class_list : iterable
        Classes to account for; classes with no rows are skipped.

    Returns
    -------
    float
        Entropy in bits (0 when no listed class occurs).
    """
    n_rows = feature_value_data.shape[0]
    labels = feature_value_data[label]
    result = 0

    for cls in class_list:
        hits = int((labels == cls).sum()) # rows belonging to class cls
        if hits == 0:
            continue # an absent class contributes no entropy
        share = hits/n_rows # probability of the class within this subset
        result += -(share*np.log2(share))
    return result

- This function calculates entropy of a specific feature = value.
- feature_value_data: a pandas dataframe, which contains data that has a specific value of a feature.
- label: string, name of the label of the dataframe
- class_list: list, unique classes of the label
- returns: float, calculated entropy of the feature value dataframe 

## (3) Calculate information gain for a feature

In [46]:
def calc_info_gain(feature_name,train_data,label,class_list):
    """Return the information gain obtained by splitting ``train_data`` on one feature.

    The gain is the entropy of the whole frame minus the size-weighted
    entropy of the subsets produced by each distinct value of the feature.

    Parameters
    ----------
    feature_name : str
        Column to split on.
    train_data : pandas.DataFrame
        Rows to evaluate the split on.
    label : str
        Name of the label column.
    class_list : iterable
        Unique classes of the label.

    Returns
    -------
    float
        Information gain of ``feature_name``.
    """
    n_rows = train_data.shape[0]
    weighted_entropy = 0.0

    for value in train_data[feature_name].unique():
        subset = train_data[train_data[feature_name] == value] # rows carrying this feature value
        weight = subset.shape[0]/n_rows # share of the data that lands in this branch
        weighted_entropy += weight * calc_entropy(subset,label,class_list)

    baseline = calc_total_entropy(train_data,label,class_list)
    return baseline - weighted_entropy

- The above function calculates information gain of a feature.
- feature_name: string, the name of the feature that we want to find the information gain 
- train_data: a pandas dataframe
- label: string, name of the label of the dataframe 
- class_list: list, unique classes of the label 
- returns: calculated information gain of the feature 

## (4) Finding the most informative feature 

(feature with highest information gain)

In [47]:
def find_most_info_feature(train_data,label,class_list):
    """Return the name of the feature with the highest information gain.

    Every column except ``label`` is treated as a candidate. On ties the
    first candidate in column order wins, because only a strictly larger
    gain replaces the current best.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows to evaluate the candidate splits on.
    label : str
        Name of the label column.
    class_list : iterable
        Unique classes of the label.

    Returns
    -------
    str or None
        Best feature name; None when no candidate has a gain greater than -inf
        (for example when there are no feature columns).
    """
    candidates = train_data.columns.drop(label) # the label itself is not a feature
    scored = [
        (calc_info_gain(candidate,train_data,label,class_list), candidate)
        for candidate in candidates
    ]

    best_feature = None
    best_gain = -float("inf")
    for gain, candidate in scored:
        if gain > best_gain:
            best_gain, best_feature = gain, candidate

    return best_feature

- The above function finds the most informative feature from the current dataset.
- train_data: a pandas dataframe
- label: string, name of the label of the dataframe 
- class_list: list, unique classes of the label 
- returns: string, the feature name

## (5) Adding node to tree

In [48]:
def generate_sub_tree(feature_name,train_data,label,class_list):
    """Build one tree node for ``feature_name`` and drop the rows it already resolves.

    Each distinct value of the feature becomes a branch. A branch whose rows
    all share one class becomes a leaf holding that class, and its rows are
    removed from the returned frame. Any other branch is marked ``"?"`` so
    the caller can expand it later.

    Parameters
    ----------
    feature_name : str
        Feature the node splits on.
    train_data : pandas.DataFrame
        Rows reaching this node.
    label : str
        Name of the label column.
    class_list : iterable
        Unique classes of the label.

    Returns
    -------
    tuple (dict, pandas.DataFrame)
        Mapping of feature value to class or ``"?"``, and the remaining rows
        (those belonging to impure branches).
    """
    branches = {}
    value_counts = train_data[feature_name].value_counts(sort=False) # rows per feature value

    for value, n_rows in value_counts.items():
        branch_labels = train_data.loc[train_data[feature_name] == value, label]
        # Classes that account for every row of this branch (the branch is pure).
        pure_classes = [c for c in class_list if int((branch_labels == c).sum()) == n_rows]

        if pure_classes:
            branches[value] = pure_classes[-1] # leaf node
            train_data = train_data[train_data[feature_name] != value] # resolved rows are no longer needed
        else:
            branches[value] = "?" # impure branch, to be expanded by the caller

    return branches,train_data

After selecting a pure class, we have to remove the rows from the dataset corresponding to the feature value.

Note : Here we have to use the updated dataset for the next iterations.

- The above function generates subtree of a feature and removes the feature = value from the dataset.
- The tree might contain ‘?’ as a value if the tree node isn’t a pure class.
- feature_name: string, the name of the feature that we want to add to tree and shrink dataset
- train_data: a pandas dataframe
- label: string, name of the label of the dataframe 
- class_list: list, unique classes of the label
- returns: tuple (dictionary, dataframe), the tree node with its branches and the updated dataset

## Performing ID3 Algorithm and generating tree

Now, we should assemble these functions into one method that recursively performs Step 1 — Step 5. The overall method should:

- Find the most informative feature
- Make a tree node with feature name and feature values as branches
    - If it is a pure class, add leaf node (= Class) to the tree node
    - If it is an impure class, add an expandable node (= ‘?’) to the tree node
- Shrink/Update the dataset according to the pure class
- Add the node with branches into a tree
- Expand the branch of the next impure class (= ‘?’) with an updated dataset
  
The recursion endpoint:

- The dataset becomes empty after updating
- There is no expandable branch (= all pure class)

In [49]:
def make_tree(root,prev_feature_value,train_data,label,class_list):
    """Recursively grow the ID3 tree in place inside ``root``.

    Parameters
    ----------
    root : dict
        Dictionary that receives the new node. Pass an empty dict for the
        top-level call; the finished tree is available in it afterwards.
    prev_feature_value : any or None
        Key of ``root`` (a value of the parent feature) under which the new
        node is attached. None for the top-level call.
    train_data : pandas.DataFrame
        Rows reaching this node.
    label : str
        Name of the label column.
    class_list : iterable
        Unique classes of the label.

    Returns
    -------
    None
        The tree is built by mutating ``root``.
    """
    rows_at_entry = train_data.shape[0]
    if rows_at_entry == 0: # nothing left to split on
        return

    max_info_feature = find_most_info_feature(train_data,label,class_list) # finding the most informative feature
    tree,train_data = generate_sub_tree(max_info_feature,train_data,label,class_list) # getting tree node and updated dataset

    if prev_feature_value is not None: # attach below an intermediate node of the tree
        root[prev_feature_value] = dict()
        root[prev_feature_value][max_info_feature] = tree
        next_root = root[prev_feature_value][max_info_feature]
    else: # attach at the root of the tree
        root[max_info_feature] = tree
        next_root = root[max_info_feature]

    # Iterate over a snapshot because the branches are replaced while recursing.
    for node,branch in list(next_root.items()):
        if branch == "?": # impure branch, needs expanding
            feature_value_data = train_data[train_data[max_info_feature] == node] # using the updated dataset
            if feature_value_data.shape[0] == rows_at_entry:
                # The split kept every row in one branch, so the best feature is constant
                # here and recursing would repeat this exact call forever (e.g. identical
                # feature rows with conflicting labels). Fall back to a majority-class leaf.
                next_root[node] = feature_value_data[label].mode().iloc[0]
            else:
                make_tree(next_root,node,feature_value_data,label,class_list) # recursive call with updated dataset

- The above function generates a tree using dictionary of dictionaries.
- The leaf node of the tree would be value of a feature = class name.
- The resultant tree is returned by reference.
  
- root: dictionary, which will contain the resultant tree through recursive subtree . Initially it should be an empty dictionary. After recursive calls , it should contain the result
- prev_feature_value: Any datatype (Int or Float or String etc.) depending on the datatype of the previous feature. The previous value of the pointed node/feature . Initially it should be None
- train_data: a pandas dataframe
- label: string, name of the label of the dataframe 
- class_list: list, unique classes of the label 
- returns: None


## Finding unique classes of the label and Starting the algorithm

We can start calling the recursive tree building algorithm of ID3 after finding the class names.

In [50]:
def id3(df,label):
    """Train an ID3 decision tree on ``df`` and return it as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows, including the label column.
    label : str
        Name of the label column.

    Returns
    -------
    dict
        Nested dictionary of the form ``{feature: {value: subtree_or_class}}``.
    """
    working_data = df.copy() # work on a copy so the caller's frame is left untouched
    classes = working_data[label].unique() # unique classes of the label
    decision_tree = {}
    make_tree(decision_tree,None,working_data,label,classes) # fills decision_tree in place

    return decision_tree

- The above function generates id3 tree.
- df: a pandas dataframe
- label: string, name of the label of the dataframe 
- returns: (nested) dictionary, the decision tree

## Predicting from the tree

We will recursively traverse the nested dictionary until any leaf node (class) is found

In [51]:
def predict(tree,instance,default=None):
    """Predict the class of one instance by walking the nested-dictionary tree.

    Parameters
    ----------
    tree : dict or leaf value
        Decision tree as produced by ``id3``; anything that is not a dict is
        treated as a leaf and returned unchanged.
    instance : mapping or pandas.Series
        Feature values of a single row, indexable by feature name.
    default : any, optional
        Class returned when the instance carries a feature value that the
        tree has no branch for. When left as None, the notebook-level
        ``DEFAULT_CLASS`` is used, which must then be defined before such a
        value is encountered.

    Returns
    -------
    any
        The predicted class.
    """
    if not isinstance(tree,dict): # leaf node
        return tree

    root_node = next(iter(tree)) # each node dict has a single key: the feature name
    branches = tree[root_node]
    feature_value = instance[root_node] # value of that feature for this instance
    if feature_value in branches:
        return predict(branches[feature_value],instance,default) # descend into the matching branch

    # Feature value never seen on this path during training.
    return DEFAULT_CLASS if default is None else default

- The above function predicts from the generated tree using the feature set/instance
- tree: dictionary (of dictionaries), a decision tree
- instance: a row or snapshot or set of the features of dataset. The row may not contain label
- returns: Any datatype (Int or Float or String etc.) depending on the datatype of the class, the predicted class

## Evaluating test dataset

In [52]:
def evaluate(tree, test_df, label):
    """Return the accuracy of ``tree`` on ``test_df``.

    Parameters
    ----------
    tree : dict
        Decision tree as produced by ``id3``.
    test_df : pandas.DataFrame
        Rows to score; must contain the label column.
    label : str
        Name of the label column.

    Returns
    -------
    float
        Fraction of rows whose predicted class equals the true label.
    """
    hits = sum(
        1
        for _, record in test_df.iterrows()
        if predict(tree, record) == record[label]
    )
    # Every row is either a hit or a miss, so the denominator is the row count.
    return hits / test_df.shape[0]

- Evaluates the accuracy of an ID3 tree by comparing its predictions against the expected labels
- tree: dictionary (of dictionaries), a decision tree
- test_df: a pandas dataframe/test dataset
- label: string, name of the label column in test_df
- returns: float, the accuracy of the tree

## IMPLEMENTATION

## Train the ID3 tree

In [53]:
# Fit the from-scratch ID3 tree on the training split only.
tree = id3(train_df, label)

In [54]:
# Fallback class that predict() returns for feature values the tree has no branch for:
# the most frequent label in the training split (the first one if several are tied).
DEFAULT_CLASS = train_df[label].mode()[0]

## Generate predictions on test data

In [55]:
# Collect (true label, predicted label) for every test row in a single pass,
# then unzip into the two parallel lists used by the metric cells below.
label_pairs = [(row[label], predict(tree, row)) for _, row in test_df.iterrows()]
y_true = [actual for actual, _ in label_pairs]
y_pred = [guess for _, guess in label_pairs]


## Evaluating the model

In [56]:
# Overall accuracy on the held-out split, printed as a percentage.
acc = evaluate(tree, test_df, label)
print(acc*100,"%")

100.0 %


## Confusion Matrix

In [57]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, in sorted label order.
cm = confusion_matrix(y_true, y_pred)
print(cm)

[[41  0]
 [ 0 16]]


## Performance Metrics

In [58]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        16

    accuracy                           1.00        57
   macro avg       1.00      1.00      1.00        57
weighted avg       1.00      1.00      1.00        57



## Root of the tree

The most influential feature in the decision tree is the one at the root node (`Speed(hidden)` in the run below), because the root split is the one with the highest information gain on the full training set.

In [59]:
# The tree is a nested dict whose single top-level key is the feature chosen for the first split.
root_feature = next(iter(tree))
print("Most important feature:", root_feature)

Most important feature: Speed(hidden)


## Prepare fairness groups

### Create age brackets

In [60]:
# Bucket candidate ages into three brackets for the fairness breakdown.
# pd.cut uses right-closed intervals by default, so the bins are (0, 25], (25, 35] and (35, 100]:
# the "<25" bracket also contains age 25, and ages outside (0, 100] become NaN.
# .assign() builds a new frame instead of writing a column into the object returned by
# train_test_split, which keeps this cell idempotent and avoids a possible SettingWithCopyWarning.
test_df = test_df.assign(
    age_group=pd.cut(
        test_df["Age(feature, sensitive))"],
        bins=[0, 25, 35, 100],
        labels=["<25", "25–35", "35+"],
    )
)

## Demographic Parity

## Are hiring decisions independent of gender or age?

In [61]:
# Demographic parity: share of candidates the model predicts as "hire" (1) in each group.
# The tree is evaluated once for the whole test set; each group then reuses those predictions.
test_pred = pd.Series(
    [predict(tree, row) for _, row in test_df.iterrows()],
    index=test_df.index,
)

# Wrt Gender
print("Wrt Gender")
# dropna() skips a missing group value, which would otherwise select zero rows and divide by zero.
for g in test_df["Gender(feature,sensitive)"].dropna().unique():
    group_pred = test_pred[test_df["Gender(feature,sensitive)"] == g]
    rate = group_pred.sum() / len(group_pred)
    print(f"{g}: hiring rate = {rate:.3f}")
print("\n")

# Wrt Age
print("Wrt Age")
# Ages outside the brackets have a NaN age_group and are left out of the breakdown.
for a in test_df["age_group"].dropna().unique():
    group_pred = test_pred[test_df["age_group"] == a]
    rate = group_pred.sum() / len(group_pred)
    print(f"{a}: hiring rate = {rate:.3f}")


Wrt Gender
male: hiring rate = 0.316
female: hiring rate = 0.211


Wrt Age
25–35: hiring rate = 0.263
35+: hiring rate = 0.154
<25: hiring rate = 0.583


### Observations:

**Wrt Gender :**
- Male hiring rate: 0.316 (31.6%)
- Female hiring rate: 0.211 (21.1%)

**This implies:**
- Males are hired ~10.5 percentage points more often than females.

- Hiring decisions are not independent of gender.

**Wrt Age :**
- <25: 58.3%
- 25–35: 26.3%
- 35+: 15.4%

**This implies:**
- Very strong age dependence
- Candidates under 25 are more than twice as likely to be hired

`Therefore, hiring decisions produced by the model are not independent of either gender or age.`

## Equal Opportunity

Equal opportunity checks whether qualified candidates from all groups are equally likely to be hired.

In [62]:
# Candidates whose true label is 1 (should be hired); equal opportunity is measured on these rows only.
qualified = test_df[test_df[label] == 1]

In [63]:
# Equal opportunity: among truly qualified candidates, the share the model predicts as "hire" (1),
# i.e. the true positive rate per group. Predictions are computed once and reused for every group.
qualified_pred = pd.Series(
    [predict(tree, row) for _, row in qualified.iterrows()],
    index=qualified.index,
)

print("Wrt Gender")
# dropna() skips a missing group value, which would otherwise select zero rows and divide by zero.
for g in qualified["Gender(feature,sensitive)"].dropna().unique():
    group_pred = qualified_pred[qualified["Gender(feature,sensitive)"] == g]
    rate = group_pred.sum() / len(group_pred)
    print(f"{g}: true positive rate = {rate:.3f}")
print("\n")

print("Wrt Age")
# Ages outside the brackets have a NaN age_group and are left out of the breakdown.
for a in qualified["age_group"].dropna().unique():
    group_pred = qualified_pred[qualified["age_group"] == a]
    rate = group_pred.sum() / len(group_pred)
    print(f"{a}: true positive rate = {rate:.3f}")


Wrt Gender
male: true positive rate = 1.000
female: true positive rate = 1.000


Wrt Age
25–35: true positive rate = 1.000
35+: true positive rate = 1.000
<25: true positive rate = 1.000


Every candidate who truly should have been hired was hired by the model, regardless of gender or age.

**Are outcomes the same for everyone?**
- It hires fewer women and older candidates overall.
  
**Are qualified people treated equally?**
- But when they are qualified, it never rejects them.


**Conclusion**

This analysis demonstrates that a model can satisfy equal opportunity while still violating demographic parity, emphasizing the importance of evaluating multiple fairness metrics when deploying machine learning systems in high-stakes domains such as recruitment.