In [325]:
#load required libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
import sklearn
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, LabelEncoder
from collections import defaultdict



# Fairness metrics
import aif360
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset
from aif360.datasets.multiclass_label_dataset import MulticlassLabelDataset
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr
from aif360.sklearn.metrics import generalized_fnr, difference
# # Explainers
from aif360.explainers import MetricTextExplainer


# # Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# # Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#from sklearn.metrics import precision


In [274]:
#1.Load data to dataframe
df = pd.read_csv("2020-12-metropolitan-stop-and-search.csv")


In [275]:
#2. PREPROCESS
#<---code for preprocessing goes here-->
#columns with high missing values
mising_col = ['Outcome linked to object of search', 'Removal of more than just outer clothing',
                'Policing operation','Self-defined ethnicity', 'Date', 'Legislation']
df= df.drop(axis=1, columns=mising_col)
df.dropna(axis=0, subset=['Age range', 'Latitude', 'Longitude', 'Outcome', 'Gender', 
                          'Officer-defined ethnicity', 'Object of search'], inplace=True)


In [276]:
#df.isnull().sum()

In [277]:
df['Age range'].unique()

array(['18-24', '25-34', 'over 34', 'Oct-17', 'under 10'], dtype=object)

In [278]:
# fix an a bug or an error in the Age range column. Change Oct to 10
df['Age range'] = df['Age range'].replace(['Oct-17'], '10-17')

In [279]:
#df['Age range'].unique()



In [280]:
df.head(10)

Unnamed: 0,Type,Part of a policing operation,Latitude,Longitude,Gender,Age range,Officer-defined ethnicity,Object of search,Outcome
0,Person search,False,51.528724,-0.017482,Male,18-24,Black,Stolen goods,Arrest
1,Person and Vehicle search,False,51.461264,-0.303021,Male,18-24,White,Controlled drugs,A no further action disposal
3,Person search,False,51.461264,-0.303021,Male,25-34,White,Controlled drugs,A no further action disposal
4,Person and Vehicle search,False,51.462786,-0.284063,Male,18-24,White,Controlled drugs,A no further action disposal
5,Person search,False,51.469138,-0.171837,Male,18-24,Other,Offensive weapons,A no further action disposal
6,Person search,False,51.3815,-0.243793,Male,25-34,White,Evidence of offences under the Act,A no further action disposal
7,Person and Vehicle search,False,51.461264,-0.303021,Male,18-24,White,Controlled drugs,Arrest
8,Person search,False,51.561515,0.066196,Male,25-34,Black,Controlled drugs,A no further action disposal
10,Person search,False,51.561515,0.066196,Male,25-34,White,Controlled drugs,A no further action disposal
11,Person and Vehicle search,False,51.475038,-0.404085,Male,25-34,White,Controlled drugs,A no further action disposal


In [281]:
#Apply Encoders to the required variables
le = LabelEncoder()
oe = OrdinalEncoder(categories=[['under 10', '18-24', '25-34', 'over 34', '10-17']])
oeOS = OrdinalEncoder(categories=[['Stolen goods', 'Controlled drugs', 'Offensive weapons',
       'Evidence of offences under the Act',
       'Anything to threaten or harm anyone', 'Firearms', 'Fireworks',
       'Articles for use in criminal damage']])
ohe = OneHotEncoder()
df[["Age range"]] = oe.fit_transform(df[["Age range"]].values.reshape((-1, 1)))
df[['Gender']] = OrdinalEncoder().fit_transform(df[['Gender']].values.reshape((-1, 1)))
df[['Object of search']] = oeOS.fit_transform(df[['Object of search']].values.reshape((-1, 1)))
#print(df.columns)
df[['Officer-defined ethnicity']] = OrdinalEncoder().fit_transform(df[['Officer-defined ethnicity']].values.reshape((-1, 1)))
df.Outcome = LabelEncoder().fit_transform(df.Outcome)
df = pd.get_dummies(df, columns=['Type', 'Part of a policing operation'])

In [282]:
#df.head(10)

In [283]:
# dataset = PoliceDataset(df=df, label_name='Outcome', favorable_classes=[le.transform(['A no further action disposal'])],
#                         protected_attribute_names= ['Age range'],
#                         privileged_classes=[oe.transform(np.asarray(['25-34', '10-17','over 34', 'under 10']).reshape(-1,1))])
#                         #protected_attribute_names= ['Age'],  privileged_classes= [[1.0]])


In [284]:
### Encode Categorical variables

In [285]:
# Encode Male as 1, Female as 0
df.loc[df.Gender == 1.0, 'Gender'] = 1
df.loc[df.Gender == 0.0, 'Gender'] = 0
df.loc[df.Gender == 2.0, 'Gender'] = 0


In [286]:
y= df['Outcome']
y.value_counts()


0    11393
1     2126
3     1509
4      538
5      252
2       11
Name: Outcome, dtype: int64

In [287]:
# drop more columns
c_df = df.copy()
X = df.drop(['Latitude', 'Longitude'], axis = 1)

In [288]:
X

Unnamed: 0,Gender,Age range,Officer-defined ethnicity,Object of search,Outcome,Type_Person and Vehicle search,Type_Person search,Type_Vehicle search,Part of a policing operation_False
0,1.0,1.0,1.0,0.0,1,0,1,0,1
1,1.0,1.0,3.0,1.0,0,1,0,0,1
3,1.0,2.0,3.0,1.0,0,0,1,0,1
4,1.0,1.0,3.0,1.0,0,1,0,0,1
5,1.0,1.0,2.0,2.0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
20366,1.0,1.0,1.0,2.0,0,1,0,0,1
20367,1.0,2.0,3.0,1.0,0,0,1,0,1
20368,1.0,1.0,3.0,1.0,0,1,0,0,1
20369,1.0,1.0,0.0,1.0,0,0,1,0,1


In [289]:
# Use train_test_split function to prepare data analysis
train_x,test_x,train_y,test_y = train_test_split(X, y, test_size=0.2, random_state = 1, stratify=y)

In [290]:
#made a copy of the target column for use in disparate imppact ratio classsification
real_test = test_x.copy()
real_test['Arrest'] = test_y
real_test.shape

(3166, 10)

In [315]:
# Priviliged group: Females (1)
# Unpriviliged group: Males (0)
female_df = real_test[real_test['Gender'] == 1]
num_of_priviliged = female_df.shape[0]
male_df = real_test[real_test['Gender'] == 0]
num_of_unpriviliged = male_df.shape[0]

In [292]:
# getting the ra
priviliged_outcomes = female_df[female_df['Arrest'] == 1].shape[0]
priviliged_ratio_female = priviliged_outcomes/num_of_priviliged
priviliged_ratio_female

0.1358991825613079

In [293]:
unpriviliged_outcomes = male_df[male_df['Arrest'] == 1].shape[0]
unpriviliged_ratio_male = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio_male

0.11304347826086956

In [294]:
priviliged_outcomes = white_df[white_df['Arrest'] == 1].shape[0]
priviliged_ratio_white = priviliged_outcomes/num_of_priviliged
priviliged_ratio_white

0.05790190735694823

In [295]:
unpriviliged_outcomes = black_df[black_df['Arrest'] == 1].shape[0]
unpriviliged_ratio_black = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio_black

0.7043478260869566

In [296]:
### Disparate impact gender

In [297]:
# Calculating disparate impact
disparate_impact = unpriviliged_ratio_male / priviliged_ratio_female
print("Disparate Impact, Gender vs. Outcome : " + str(disparate_impact))

Disparate Impact, Gender vs. Outcome : 0.8318186771276016


In [298]:
# Calculating disparate impact ratio
disparate_impact = unpriviliged_ratio_black / priviliged_ratio_white
print("Disparate Impact, Gender vs. Outcome : " + str(disparate_impact))

Disparate Impact, Gender vs. Outcome : 12.164501278772379


In [299]:
#make a model
# modelRFR = RandomForestRegressor(max_depth=2, random_state=0)

In [300]:
# modelRFR.fit(train_x, train_y)

In [301]:
# # Let's see how well it predicted with a couple values 
# y_pred = pd.Series(modelRFR.predict(test_x))
# y_test = test_y.reset_index(drop=True)
# z = pd.concat([y_test, y_pred], axis=1)
# z.columns = ['True', 'Prediction']
# z.head()
# # Predicts 5/5 correctly in this sample


In [302]:
# Instantiate and fit the RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [328]:
y_pred_test = forest.predict(test_x)
print(y_pred_test)

print("Accurary:", accuracy_score(test_y, y_pred_test ))


[0 1 3 ... 0 0 1]
Accurary: 0.9987365761212887


In [304]:
# Encode target varible Outcome as 1, N Loan_Status as 0
c_df.loc[c_df.Outcome == 1, 'Outcome'] = 1
c_df.loc[c_df.Outcome == 0, 'Outcome'] = 0
c_df.loc[c_df.Outcome == 2, 'Outcome'] = 0
c_df.loc[c_df.Outcome == 3, 'Outcome'] = 0
c_df.loc[c_df.Outcome == 4, 'Outcome'] = 0
c_df.loc[c_df.Outcome == 5, 'Outcome'] = 0
# df.loc[df.Loan_Status == 'N', 'Loan_Status'] = 0
# df

In [305]:
c_df['Outcome'].unique()

array([1, 0])

In [306]:
#c_df.columns

In [307]:
#getting dataframe into the dataset feature and lablels setting
binaryLabelDataset = aif360.datasets.StandardDataset(df=c_df, label_name='Outcome', favorable_classes=[1],
                                                     protected_attribute_names=['Gender', 'Age range', 'Officer-defined ethnicity'], 
                                                     privileged_classes=[[0], [0, 2, 3, 4], [3]])


In [308]:
df['Officer-defined ethnicity'].unique()

array([1., 3., 2., 0.])

In [271]:
# metric explainer to favourable outcome
pdata = BinaryLabelDatasetMetric(
        binaryLabelDataset,
        unprivileged_groups=[{'Gender': 1, 'Age range': 1}],
        privileged_groups=[{'Gender': 0, 'Age range':1}, {'Gender': 0, 'Age range':3}])#, {'Age range': 2}, {'Age range': 3}, {'Age range': 4}, {'Age range': 5}])
explainer = MetricTextExplainer(pdata)

print(explainer.disparate_impact())
print(explainer.statistical_parity_difference())


Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 1.0293233082706768
Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): 0.0030115830115830106


In [314]:
#metric explainer for the officer-defined ethnicity feature 
ethnic_data = BinaryLabelDatasetMetric(
        binaryLabelDataset,
        unprivileged_groups=[{'Officer-defined ethnicity': 0},{'Officer-defined ethnicity': 2},{'Officer-defined ethnicity': 1}],
        privileged_groups=[{'Officer-defined ethnicity': 3}])# {'Gender': 0, 'Age range':3}])#, {'Age range': 2}, {'Age range': 3}, {'Age range': 4}, {'Age range': 5}])
explainer = MetricTextExplainer(ethnic_data)
print(explainer.disparate_impact())
print(explainer.statistical_parity_difference())


Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 1.038273606758414
Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): 0.005030823622966724


In [194]:
#param_fit = forest.fit(train_x, train_y)
dt = modelRFR.fit(binaryLabelDataset.features, binaryLabelDataset.labels.ravel())

In [207]:
#functions that return classification metrics 
def test_generalized_entropy_index(binaryLabelDataset ):
    data = binaryLabelDataset
    #print(data)
    pred = data.copy()
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    bld = BinaryLabelDataset(df=c_df, label_names=['Outcome'],
        protected_attribute_names=['Gender'])
    cm = ClassificationMetric(bld)
    
    assert cm.generalized_entropy_index() == 0.2

    
def test_theil_index(binaryLabelDataset ):
    data = binaryLabelDataset 
    pred = data.copy()
    
    pred[[3, 9], -1] = 0
    pred[[4, 5], -1] = 1
    bld = BinaryLabelDataset(df=c_df, label_names=['Outcome'],
        protected_attribute_names=['Age range'])
    bld2 = BinaryLabelDataset(df=c_df2, label_names=['Outcome'],
                              protected_attribute_names=['Gender'])
    cm = ClassificationMetric(bld, bld2)
    assert cm.theil_index() == 4*np.log(2)/10
    
                     
    #pred = data.copy()
#test_generalized_entropy_index(binaryLabelDataset)