In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found beneath /kaggle/input,
# in the order os.walk visits the directories, then echo one path per line.
input_listing = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_listing:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/resume-cleaned-dataset/df_cleaned.csv


In [2]:
# Read the cleaned CSV into `df` and preview its first rows as the cell output.
DATA_PATH = '/kaggle/input/resume-cleaned-dataset/df_cleaned.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,matched_score,education_similarity,experience_years,cosine_similarity_skills,highest_degree,ed_req_encoded,exp_req_encoded
0,0.85,0.318784,5.5,0.0,4,4,1
1,0.75,0.375598,5.66,0.0,5,5,5
2,0.416667,0.093495,6.92,0.0,4,5,3
3,0.76,0.0,13.83,0.0,5,4,1
4,0.65,0.312103,17.33,0.0,4,4,4


In [4]:
from sklearn.model_selection import train_test_split

# `matched_score` is the regression target; every other column of `df` is a feature.
TARGET = 'matched_score'
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.tree import DecisionTreeRegressor

# Baseline regressor: a tree capped at depth 5, seeded so refits are repeatable.
baseline_params = {'max_depth': 5, 'random_state': 42}
model = DecisionTreeRegressor(**baseline_params)
model.fit(X_train, y_train)


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out rows with the currently fitted `model`.
y_pred = model.predict(X_test)

# Every scorer is called as scorer(y_true, y_pred); results unpack in tuple order.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)

for caption, score in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R² Score:", r2),
):
    print(caption, score)


Mean Squared Error: 0.02383618557900226
Mean Absolute Error: 0.12186299721435206
R² Score: 0.14517424381872346


In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 6 * 5 * 6 = 180 combinations,
# each scored with 5-fold cross-validation on the training split.
param_grid = dict(
    max_depth=[3, 5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 2, 4, 6, 8, 10],
)

# Seed the template estimator so every candidate tree is built deterministically.
base_tree = DecisionTreeRegressor(random_state=42)
grid = GridSearchCV(estimator=base_tree, param_grid=param_grid, scoring='r2', cv=5)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best R² Score:", grid.best_score_)


Best parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best R² Score: 0.19659805315685838


In [11]:
from sklearn.tree import DecisionTreeRegressor

# Build the tuned tree straight from the grid-search winner rather than
# re-typing the printed values, so this cell cannot drift out of sync if the
# grid or the data changes. Requires the grid-search cell (`grid`) to have run.
model = DecisionTreeRegressor(random_state=42, **grid.best_params_)
model.fit(X_train, y_train)


In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out rows with the currently fitted `model`.
y_pred = model.predict(X_test)

# Keyed by the caption printed for each score; insertion order is the print order.
scores = {
    "Mean Squared Error:": mean_squared_error(y_test, y_pred),
    "Mean Absolute Error:": mean_absolute_error(y_test, y_pred),
    "R² Score:": r2_score(y_test, y_pred),
}
mse, mae, r2 = scores.values()

for caption, score in scores.items():
    print(caption, score)


Mean Squared Error: 0.022996389251256952
Mean Absolute Error: 0.11688559515034734
R² Score: 0.17529145903018473


# Final Model

In [15]:
# Final model: same seed as before, hyperparameters taken from the grid-search
# result instead of a hand-copied literal, so it always matches the search
# that was actually run. Requires the grid-search cell (`grid`) to have run.
final_model = DecisionTreeRegressor(random_state=42, **grid.best_params_)
final_model.fit(X_train, y_train)

# AI Fairness 360 (AIF360)


In [16]:
!pip install aif360
!pip install folktables


Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1
Collecting folktables
  Downloading folktables-0.0.12-py3-none-any.whl.metadata (533 bytes)
Downloading folktables-0.0.12-py3-none-any.whl (17 kB)
Installing collected packages: folktables
Successfully installed folktables-0.0.12


In [17]:
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
import pandas as pd

def check_fairness(df, protected_attr, group_pairs, label_column='matched_score', threshold=0.7):
    """
    Check fairness metrics (SPD, Disparate Impact, Mean Diff) using AIF360 for multiple group pairs.

    The label column is binarized on a copy of ``df`` (the caller's frame is
    left untouched), a single AIF360 dataset is built from that copy, and each
    (unprivileged, privileged) pair is then measured against it. The metrics
    for every pair are printed as they are computed and also collected.

    Parameters:
    - df: pandas DataFrame (input data); all of its columns are handed to AIF360
    - protected_attr: str, the column to check for fairness
    - group_pairs: iterable of (unprivileged_value, privileged_value)
    - label_column: str, column with the outcome (default = 'matched_score')
    - threshold: float, value to binarize label (1 if >= threshold, else 0)

    Returns:
    - pandas DataFrame with one row per pair and the columns 'protected_attr',
      'unprivileged', 'privileged', 'SPD', 'Disparate_Impact', 'Mean_Difference'.
      An empty DataFrame is returned when ``group_pairs`` is empty.

    Raises:
    - KeyError: if ``label_column`` or ``protected_attr`` is not a column of ``df``.
    """
    # Fail early with a clear message instead of an error from deep inside AIF360.
    missing = [col for col in (label_column, protected_attr) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in df: {missing}")

    pairs = list(group_pairs)
    if not pairs:
        # Nothing to compare, so there is no need to build the AIF360 dataset.
        return pd.DataFrame([])

    # Step 1: Copy & binarize label.
    # Comparisons against NaN are False, so missing labels become 0.0 (unfavorable).
    df_fair = df.copy()
    df_fair[label_column] = (df_fair[label_column] >= threshold).astype(float)

    # Step 2: Build the dataset once; it does not depend on which pair is compared.
    dataset = BinaryLabelDataset(
        df=df_fair,
        label_names=[label_column],
        protected_attribute_names=[protected_attr],
        favorable_label=1.0,
        unfavorable_label=0.0
    )

    results = []

    for unpriv, priv in pairs:
        print(f"\n🔍 Checking Fairness on '{protected_attr}' | Unprivileged = {unpriv}, Privileged = {priv}")

        # The groups are passed to the metric explicitly, so the dataset's own
        # privileged/unprivileged metadata is left as AIF360 derived it.
        metric = BinaryLabelDatasetMetric(dataset,
                                          unprivileged_groups=[{protected_attr: unpriv}],
                                          privileged_groups=[{protected_attr: priv}])

        spd = metric.statistical_parity_difference()
        di = metric.disparate_impact()
        # NOTE(review): AIF360 documents mean_difference as an alias of
        # statistical_parity_difference, so this is expected to equal `spd`.
        md = metric.mean_difference()

        print(f"  📊 Statistical Parity Difference : {spd:.4f}")
        print(f"  📉 Disparate Impact              : {di:.4f}")
        print(f"  🔁 Mean Difference               : {md:.4f}")

        results.append({
            'protected_attr': protected_attr,
            'unprivileged': unpriv,
            'privileged': priv,
            'SPD': spd,
            'Disparate_Impact': di,
            'Mean_Difference': md
        })

    return pd.DataFrame(results)


2025-05-06 16:41:39.960042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746549700.284180      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746549700.379874      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [18]:
# (unprivileged, privileged) values of `highest_degree` to compare:
# 3.0 against each of 4.0, 5.0 and 6.0, then 4.0 against 5.0 and 6.0.
degree_pairs = (
    [(3.0, other) for other in (4.0, 5.0, 6.0)]
    + [(4.0, other) for other in (5.0, 6.0)]
)

fairness_df = check_fairness(df, 'highest_degree', degree_pairs)



🔍 Checking Fairness on 'highest_degree' | Unprivileged = 3.0, Privileged = 4.0
  📊 Statistical Parity Difference : 0.1225
  📉 Disparate Impact              : 1.2745
  🔁 Mean Difference               : 0.1225

🔍 Checking Fairness on 'highest_degree' | Unprivileged = 3.0, Privileged = 5.0
  📊 Statistical Parity Difference : 0.0917
  📉 Disparate Impact              : 1.1923
  🔁 Mean Difference               : 0.0917

🔍 Checking Fairness on 'highest_degree' | Unprivileged = 3.0, Privileged = 6.0
  📊 Statistical Parity Difference : 0.1473
  📉 Disparate Impact              : 1.3495
  🔁 Mean Difference               : 0.1473

🔍 Checking Fairness on 'highest_degree' | Unprivileged = 4.0, Privileged = 5.0
  📊 Statistical Parity Difference : -0.0308
  📉 Disparate Impact              : 0.9355
  🔁 Mean Difference               : -0.0308

🔍 Checking Fairness on 'highest_degree' | Unprivileged = 4.0, Privileged = 6.0
  📊 Statistical Parity Difference : 0.0248
  📉 Disparate Impact              : 1.0

In [19]:
# Tally how often each distinct value of `ed_req_encoded` occurs.
ed_req_counts = df['ed_req_encoded'].value_counts()
print(ed_req_counts)

ed_req_encoded
4    4889
5    3671
Name: count, dtype: int64


In [20]:
# Tally how often each distinct value of `exp_req_encoded` occurs.
exp_req_counts = df['exp_req_encoded'].value_counts()
print(exp_req_counts)

exp_req_encoded
5     2140
1     1836
3     1527
0     1223
2      917
4      611
15     306
Name: count, dtype: int64


In [21]:
# Tally how often each distinct value of `highest_degree` occurs.
highest_degree_counts = df['highest_degree'].value_counts()
print(highest_degree_counts)


highest_degree
 4    4668
 5    3434
 3     262
 6     140
-1      56
Name: count, dtype: int64
