#Imports

In [48]:
import os
import base64
import joblib
import pandas as pd
import numpy as np
import category_encoders
import json
import joblib
import pickle
import math
import requests
from copy import deepcopy
import seaborn as sns
from uuid import uuid4

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin

from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

In [49]:
def load_train_data():
    df = pd.read_csv(os.path.join("data", "train.csv"))
    df = df.set_index('observation_id')
    df['Date'] =  pd.to_datetime(df['Date'])
    df['isnew'] = 'False'
    return df

data_train = load_train_data()

In [50]:
def load_test_data():
    df = pd.read_csv(os.path.join("data", "test_1.csv"))
    df = df.set_index('observation_id')
    df['Date'] =  pd.to_datetime(df['Date'])
    df['isnew'] = 'True'
    return df

data_test1 = load_test_data()

In [51]:
data_train.columns.tolist()

['Type',
 'Date',
 'Part of a policing operation',
 'Latitude',
 'Longitude',
 'Gender',
 'Age range',
 'Self-defined ethnicity',
 'Officer-defined ethnicity',
 'Legislation',
 'Object of search',
 'Outcome',
 'Outcome linked to object of search',
 'Removal of more than just outer clothing',
 'station',
 'Search Outcome',
 'isnew']

In [52]:
data_test1.columns.tolist()

['Type',
 'Date',
 'Part of a policing operation',
 'Latitude',
 'Longitude',
 'Gender',
 'Age range',
 'Self-defined ethnicity',
 'Officer-defined ethnicity',
 'Legislation',
 'Object of search',
 'Removal of more than just outer clothing',
 'station',
 'Search Outcome',
 'isnew']

In [53]:
frames = [data_train, data_test1]
df = pd.concat(frames)

In [54]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42, stratify=df.isnew)

In [55]:
all_features_1 = ['Type',
                'Date',
                'Part of a policing operation',
                'Gender',
                'Age range',
                'Self-defined ethnicity',
                'Officer-defined ethnicity',
                'Legislation',
                'Object of search',
                #'Outcome linked to object of search',
                'Removal of more than just outer clothing',
                'station']

cat_features_1 = ['Type',
                'Part of a policing operation',
                'Gender',
                'Age range',
                'Self-defined ethnicity',
                'Officer-defined ethnicity',
                'Legislation',
                #'Object of search',
                #'Outcome linked to object of search',
                'Removal of more than just outer clothing',
                'station']

all_features_2 = ['Type',
                'Date',
                'Part of a policing operation',
                'Officer-defined ethnicity',
                'Gender',
                'Age range',
                'Self-defined ethnicity',
                'station']

cat_features_2 = ['Type',
                'Part of a policing operation',
                'Officer-defined ethnicity',
                'Gender',
                'Age range',
                'Self-defined ethnicity',
                'station']

attempts = [{"attempt": 1, "all_features":all_features_1, "cat_features":cat_features_1},
            {"attempt": 2, "all_features":all_features_2, "cat_features":cat_features_2}]

target = 'Search Outcome'

In [56]:
def verify_no_discrimination(X_test, y_true, y_pred, sensitive_column, max_diff=0.15, min_samples=50):    
    
    sensitive_classes = X_test[sensitive_column].unique()
    
    is_satisfied = True
    f1_scores = {}
    
    for sensitive_class in sensitive_classes:
        mask = (X_test[sensitive_column] == sensitive_class)
        if mask.sum() > min_samples:
            f1_scores[sensitive_class] = f1_score(y_true[mask], y_pred[mask], pos_label=1)

    diff = np.max(list(f1_scores.values())) - np.min(list(f1_scores.values()))
    
    if diff > max_diff:
        is_satisfied = False
    
    return diff, is_satisfied

In [57]:
def feature_search(attempts, df_train, df_test):

    overall_results = []
    
    for attempt in attempts:
        
        attempt_result = {}
        
        af = attempt['all_features']
        cf = attempt['cat_features']
        
        X_test = df_test[af]
        X_test[cf] = X_test[cf].astype(str)
        y_test = df_test[target]
    
        X_train = df_train[af]
        X_train[cf] = X_train[cf].astype(str)
        y_train = df_train[target]
        
        categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
        
        preprocessor = ColumnTransformer(
        transformers=[('cat', categorical_transformer, cf )])
    
        pipeline = make_pipeline(preprocessor,
                                RandomForestClassifier(max_depth=3, min_samples_leaf=.03, class_weight="balanced", random_state=42, n_jobs=-1),
                        )
                
        pipeline.fit(X_train, y_train)
        
        y_pred = pipeline.predict(X_test)
        y_pred_train = pipeline.predict(X_train)
        
        f1_score_result = f1_score(np.array(y_test), y_pred)
        
        diff_gender, is_satisfied_gender = verify_no_discrimination(X_test, y_test, y_pred, sensitive_column='Gender',max_diff=0.15 )

        diff_race, is_satisfied_race = verify_no_discrimination(X_test, y_test, y_pred, sensitive_column='Self-defined ethnicity',max_diff=0.15 )

        diff_officer, is_satisfied_officer = verify_no_discrimination(X_test, y_test, y_pred, sensitive_column='Officer-defined ethnicity',max_diff=0.15 )
        
        attempt_result['attempt_nr'] = attempt['attempt']
        attempt_result['f1_score'] = f1_score_result
        attempt_result['gender_diff'] = diff_gender
        attempt_result['race_diff'] = diff_race
        attempt_result['officer_diff'] = diff_officer
        max_diff = max(diff_gender, diff_race)
        final_metric = f1_score_result - 0.1 * max_diff
        attempt_result['hacking_score'] = final_metric
        
        overall_results.append(attempt_result)
        
    return overall_results

In [58]:
class DroppingColumns(BaseEstimator, TransformerMixin):
    def __init__(self, cols=[]):
        self.cols = cols
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = X.drop(self.cols,axis=1)
        return X

In [59]:
feature_search(attempts, df_train, df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


[{'attempt_nr': 1,
  'f1_score': 0.38777292576419214,
  'gender_diff': 0.03577782332445162,
  'race_diff': 0.5,
  'officer_diff': 0.1415343915343915,
  'hacking_score': 0.33777292576419216},
 {'attempt_nr': 2,
  'f1_score': 0.40589765828274066,
  'gender_diff': 0.035156249999999944,
  'race_diff': 0.4761904761904762,
  'officer_diff': 0.08142239946826191,
  'hacking_score': 0.35827861066369304}]

In [61]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        #('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, cat_features)])

In [67]:
target = 'Search Outcome'

af = ['Type',
      'Date',
      'Part of a policing operation',
      'Latitude',
      'Longitude',
      'Gender',
      'Age range',
      'Self-defined ethnicity',
      'Officer-defined ethnicity',
      'Legislation',
      'Object of search',
      'Removal of more than just outer clothing',
      'station']

cols_to_drop = ['Latitude',
                'Longitude']

cf = ['Type',
      'Date',
      'Part of a policing operation',
      'Gender',
      'Age range',
      'Officer-defined ethnicity',
      'Legislation',
      'Object of search',
      'Removal of more than just outer clothing',
      'station']


pipeline = make_pipeline(
X_test = df_test[af]
X_test[cf] = X_test[[cf].astype(str)
y_test = df_test[target]
    
X_train = df_train[af]
X_train[cf] = X_train[cf].astype(str)
y_train = df_train[target]
        
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
        
preprocessor = ColumnTransformer(
transformers=[('cat', categorical_transformer, cf )])
    
pipeline = make_pipeline(preprocessor,
                        RandomForestClassifier(max_depth=3, min_samples_leaf=.03, class_weight="balanced", random_state=42, n_jobs=-1),
                )
                
pipeline.fit(X_train, y_train)
        
y_pred = pipeline.predict(X_test)
y_pred_train = pipeline.predict(X_train)
        
f1_score(np.array(y_test), y_pred)

SyntaxError: invalid syntax (<ipython-input-67-fea73804cb2b>, line 34)