In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import numpy as np
from fairlearn.metrics import demographic_parity_difference

In [3]:
data=pd.read_csv('class.csv', encoding='utf-8')

In [4]:
###转换格式
###转换日期
month_dict = {
    '1月': 'January', '2月': 'February', '3月': 'March',
    '4月': 'April', '5月': 'May', '6月': 'June',
    '7月': 'July', '8月': 'August', '9月': 'September',
    '10月': 'October', '11月': 'November', '12月': 'December'
}
def convert_to_english_date(chinese_date_str):
    month_chinese = chinese_date_str.split('月')[0] + '月'
    year = chinese_date_str.split('月')[1]
    month_english = month_dict.get(month_chinese, 'Unknown')
    return f"{month_english} {year}"

data['Year.Month'] = data['Year.Month'].apply(convert_to_english_date)
data['Year.Month'] = pd.to_datetime(data['Year.Month'], errors='coerce')
data.rename(columns={'Year.Month': 'Date'}, inplace=True)
data.sort_values(by='Date', inplace=True)

  data['Year.Month'] = pd.to_datetime(data['Year.Month'], errors='coerce')


In [5]:
###转换年龄
def classify_age_group(age_group):
    if age_group in ['0 - 4 years inclusive', '5 - 9 years inclusive', '10 - 14 years inclusive', '15 - 19 years inclusive']:
        return 'Teenagers'
    elif age_group in ['20 - 24 years inclusive', '25 - 29 years inclusive', '30 - 34 years inclusive', '35 - 39 years inclusive']:
        return 'Middle-aged-young'
    elif age_group in ['40 - 44 years inclusive', '45 - 49 years inclusive', '50 - 54 years inclusive', '55 - 59 years inclusive']:
        return 'Middle-aged-old'
    elif age_group in ['60 - 64 years inclusive', '65 - 69 years inclusive', '70 - 74 years inclusive', '75 - 79 years inclusive', '80 years or over']:
        return 'Elderly'
    else:
        return 'Unknown'
data['Age.Group.5Yr.Band'] = data['Age.Group.5Yr.Band'].apply(classify_age_group)

In [6]:
data = data[data['Ethnic.Group'] != 'Not Stated']
data = data[data['SEX'] != 'Not Stated']

In [7]:
data

Unnamed: 0,Date,Victimisations,SEX,Age.Group.5Yr.Band,OOI.Exclusion,Ethnic.Group,ANZSOC.Group,class
18395,2015-06-01,2,Female,Middle-aged-young,Court action,Maori,"Theft (Except Motor Vehicles), N.E.C.",0
23448,2015-06-01,1,Male,Middle-aged-young,Court action,European,"Theft (Except Motor Vehicles), N.E.C.",0
62710,2015-06-01,2,Male,Middle-aged-old,Court action,Pacific Island,Serious Assault Resulting in Injury,0
66759,2015-06-01,1,Male,Elderly,Court action,European,Illegal Use of a Motor Vehicle,0
23211,2015-06-01,2,Male,Middle-aged-young,Court action,European,Aggravated Robbery,0
...,...,...,...,...,...,...,...,...
70614,2023-07-01,1,Female,Middle-aged-young,Court action,Pacific Island,Illegal Use of a Motor Vehicle,0
70613,2023-07-01,2,Female,Middle-aged-young,Court action,Pacific Island,Illegal Use of a Motor Vehicle,0
71770,2023-07-01,1,Female,Middle-aged-old,Court action,Asian,Abduction and Kidnapping,0
71771,2023-07-01,1,Female,Middle-aged-young,Court action,European,Abduction and Kidnapping,0


In [8]:
label_cols = ['SEX', 'Age.Group.5Yr.Band', 'Ethnic.Group', 'ANZSOC.Group']
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [9]:
X = data.drop(columns=['Date', 'class' ,'OOI.Exclusion'])
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
clf = xgb.XGBClassifier(n_estimators=50, learning_rate=0.1,max_depth=10,subsample=0.1, colsample_bytree=0.8)
clf.fit(X_train, y_train)

In [11]:
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

print(X_train)


Accuracy: 0.8796753143402833
       Victimisations  SEX  Age.Group.5Yr.Band  Ethnic.Group  ANZSOC.Group
26971               3    0                   1             1             9
37976               2    1                   3             5            10
66000               1    0                   2             3             8
14161               1    1                   2             5             4
66176               1    0                   3             5             2
...               ...  ...                 ...           ...           ...
71020               1    1                   2             1             3
60613               1    1                   0             1             4
71202               1    0                   1             3             4
40067               2    0                   2             1             1
82540               1    1                   2             3            10

[50264 rows x 5 columns]


In [191]:
sensitive_feature_test1 = X_test['Ethnic.Group']
sensitive_feature_test2 = X_test['Age.Group.5Yr.Band']
sensitive_feature_test3 = X_test['SEX']


Ethnic_Group_unfairness = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature_test1)
print(f"Ethnic_Group demographic parity difference: {Ethnic_Group_unfairness}")


Age_Group_unfairness = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature_test2)
print(f"Age_Group demographic parity difference: {Age_Group_unfairness}")

SEX_unfairness = demographic_parity_difference(y_test, y_pred, sensitive_features=sensitive_feature_test3)
print(f"SEX demographic parity difference: {SEX_unfairness}")





Ethnic_Group demographic parity difference: 0.14369747899159663
Age_Group demographic parity difference: 0.04473406128918633
SEX demographic parity difference: 0.00422577300490902
ANZSOC_Group demographic parity difference: 0.21758569299552907


In [189]:
from sklearn.metrics import roc_curve, auc

def calculate_optimal_thresholds(y_true, y_prob, groups):
    thresholds = {}
    
    # For each group, calculate the ROC curve and pick the threshold that gives
    # the best balance between TPR and FPR
    for group in np.unique(groups):
        fpr, tpr, thresh = roc_curve(y_true[groups == group], y_prob[groups == group])
        optimal_idx = np.argmax(tpr - fpr)
        thresholds[group] = thresh[optimal_idx]*7
        
    return thresholds


# Get the predicted probabilities for the positive class
y_prob = clf.predict_proba(X_test)[:, 1]

# Get the optimal thresholds for each ethnic group
optimal_thresholds = calculate_optimal_thresholds(y_test, y_prob, X_test['Ethnic.Group'])

# Apply the thresholds to make final predictions
y_pred_fair = np.array([int(y_prob[i] > optimal_thresholds[X_test['Ethnic.Group'].iloc[i]]) for i in range(len(y_prob))])

# Calculate the new accuracy
print('Adjusted Accuracy:', accuracy_score(y_test, y_pred_fair))


Adjusted Accuracy: 0.8764921215979627


In [198]:
sensitive_feature_test1 = X_test['Ethnic.Group']

Ethnic_Group_unfairness = demographic_parity_difference(y_test, y_pred_fair, sensitive_features=sensitive_feature_test1)
print(f"Ethnic_Group demographic parity difference: {Ethnic_Group_unfairness}")


Ethnic_Group demographic parity difference: 0.0226890756302521


In [205]:
import numpy as np

# Assuming 'Ethnic.Group' is the protected attribute
groups = data['Ethnic.Group'].unique()

# Dictionaries to store models and predictions for each group
models = {}
predictions = {}

for group in groups:
    # Splitting data based on the group
    X_train_group = X_train[X_train['Ethnic.Group'] == group]
    y_train_group = y_train[X_train['Ethnic.Group'] == group]
    X_test_group = X_test[X_test['Ethnic.Group'] == group]
    
    # Train a model for the group
    clf_group = xgb.XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=10, subsample=0.1, colsample_bytree=0.8)
    clf_group.fit(X_train_group, y_train_group)
    
    # Store the model
    models[group] = clf_group
    
    # Predict probabilities for the positive class
    probas = clf_group.predict_proba(X_test_group)[:, 1]
    
    # Store the predictions
    predictions[group] = probas

# Recalibrate predicted probabilities
average_predicted_proba = np.mean([np.mean(probas) for probas in predictions.values()])
for group, probas in predictions.items():
    predictions[group] = probas + (average_predicted_proba - np.mean(probas))

# Convert recalibrated probabilities to binary predictions using 0.5 as threshold
binary_predictions = {}
for group, probas in predictions.items():
    binary_predictions[group] = (probas > 0.5).astype(int)

# Now, evaluate fairness and performance
for group in groups:
    y_test_group = y_test[X_test['Ethnic.Group'] == group]
    print(f"Accuracy for {group}: {accuracy_score(y_test_group, binary_predictions[group])}")


Accuracy for 3: 0.8997955010224948
Accuracy for 1: 0.8353978978978979
Accuracy for 5: 0.9188069594034797
Accuracy for 4: 0.944547134935305
Accuracy for 2: 0.9033613445378151
Accuracy for 0: 0.9327251995438997
