**Import dependencies**

In [104]:
import numpy as np
import pandas as pd
import re

import fasttext
import fasttext.util

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

**The next 3 cells install fasttext, download english pre-trained vectors, and unzip them. we use these pretrained vectors later**

In [105]:
!pip install fasttext



In [106]:
# we cant use pretrained fasttext word vectors without crashing the collab notebook

# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.2.vec.gz
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

# !gzip -d 'cc.en.300.vec.gz'
# !gzip -d 'cc.en.300.bin.gz'

**Load data from the narratives csv and crid csv**

crid_map is a dataframe that we use to map the cr_id in a narrative to its corresponding category

In [107]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 999)
narratives_url = 'https://raw.githubusercontent.com/invinst/documentAnalysis/master/data/input/narratives.csv'
crid_map_url = 'https://gist.githubusercontent.com/simon-benigeri/42b708386d460a52d99c82a5cf891770/raw/d226a962b9978cacb3e9bde3aa16bdc12bb9a685/crid_categories.csv'
narratives = pd.read_csv(narratives_url)
crid_map = pd.read_csv(crid_map_url)

Here we create the dataframe narratives, where we have the cr_id, the column name, text, and category for each narrative

In [108]:
crid_dict = pd.Series(crid_map['category'].values, index=crid_map['crid']).to_dict()
narratives['category'] = narratives['cr_id'].apply(lambda id: crid_dict[str(id)] if str(id) in crid_dict else None)
narratives = narratives[narratives['category'].notnull()]
narratives = narratives[['cr_id', 'column_name', 'text', 'category']]
narratives.head(10)

Unnamed: 0,cr_id,column_name,text,category
0,1050727,Initial / Intake Allegation,"It is alleged that the accused officer failed to\nsecure his weapon, police |.D., FOID card\nand vest, which were placed inside his\npersonal vehicle that was then stolen at the\nabove location.",Conduct Unbecoming (Off-Duty)
1,1050727,Finding,(None Entered),Conduct Unbecoming (Off-Duty)
2,1050727,Allegation,"It is alleged by the complainant Sergeant Victoria STANEK #2012\nAssigned to the 008th District , that the ACCUSED, (Off-Duty) Police\nOfficer Ramsey WILLIAMS #9050, Assigned to Unit 012, on 16 December\n2011, in the vicinity of Hayes Park located at 2936 W 85th St, between\n1345-135Shours, failed to properly secure his weapon, namely a Sturm\nand Ruger Model MP15X, serial IMM, blue steel finish, 4.5\ninch barrel, resulting in said weapon being reported stolen under RD\n#HT634801",Conduct Unbecoming (Off-Duty)
3,1050727,Initial / Intake Allegation,"It is alleged that the accused officer failed to\nsecure his weapon, police |.D., FOID card\nand vest, which were placed inside his\npersonal vehicle that was then stolen at the\nabove location.",Conduct Unbecoming (Off-Duty)
10,1050956,Initial / Intake Allegation,"The reporting party alleges that the accused\nofficer, her child's father, filed a false case\nreport against her regarding child visitation.\nRD# HT:",Conduct Unbecoming (Off-Duty)
11,1050956,Finding,(None Entered),Conduct Unbecoming (Off-Duty)
12,1050956,Allegation,"\nP.O. Christine TAYLOR alleges that on 29 DEC 11 between 0845 hours\nand 1120 hours at ., the accused Department member\nfiled a false case report against her regarding child visitation.",Conduct Unbecoming (Off-Duty)
13,1050956,Initial / Intake Allegation,"The reporting party alleges that the accused\nofficer, her child's father, filed a false case\nreport against her regarding child visitation.\nRD#",Conduct Unbecoming (Off-Duty)
20,1051290,Initial / Intake Allegation,"The plaintiff alleges that the accused, during\na traffic stop, violently without provocation\npushed him to the ground while he was.\nhandcuffed. The plaintiff sustained a\nlaceration to his right eye and lost\nconsciousness.",Use Of Force
21,1051290,Finding,NO AFFIDAVIT,Use Of Force


**Not all rows seem to be useful.** 
Let's look into column_name and the corresponding text values

In [109]:
# create new df from the narratives
print(f"column name values are : {narratives['column_name'].unique()}")
print(f"text values for column name == Finding are : {narratives[narratives['column_name'] == 'Finding']['text'].unique()}")

column name values are : ['Initial / Intake Allegation' 'Finding' 'Allegation']
text values for column name == Finding are : ['(None Entered)' 'NO AFFIDAVIT' 'SUSTAINED' 'UNFOUNDED' 'NOT SUSTAINED'
 'EXONERATED' '(None\nEntered)' 'SuU\nSTAINED' 'NON-CPD']


**We can remove column_name == 'Finding' from the df. The information there is irrelevant to the text classification problem**

In [110]:
narratives = narratives[narratives['column_name'] != 'Finding']

**CREATE 2 DATASETS: allegations and intakes**

Given a crid, rows are not duplicates, but very similar. We selected the first instance of each group when grouping by crid to avoid having text samples that are almost the same

In [111]:
intakes = narratives[narratives['column_name'] == 'Initial / Intake Allegation'].drop(columns=['column_name'])
allegations = narratives[narratives['column_name'] == 'Allegation'].drop(columns=['column_name'])

In [112]:
intakes = intakes.groupby(['cr_id'], as_index=False).agg('first').drop_duplicates()
allegations = allegations.groupby(['cr_id'], as_index=False).agg('first').drop_duplicates()

Here we clean the text

In [113]:
def clean(s):
  # re.sub(r'[^\w\s]', '', re.sub(r'\n', ' ', s).lower())
  return re.sub(r'\s+', ' ', s, flags=re.UNICODE).lower()

In [114]:
intakes['text'] = intakes['text'].apply(clean)
allegations['text'] = allegations['text'].apply(clean)

In [115]:
intakes.head()

Unnamed: 0,cr_id,text,category
0,1048962,the victim alleges that an unknown male black off-duty officer threw her against the wall after she did not adhere to his command to get up. the victim alleges that the officer pushed her face against the wall and handcuffed her too tightly.,Use Of Force
1,1048964,"the reporting party alleges that he was a victim of a battery and when the police arrived they failed to file a report on his behalf instead, the officers gave him a ride to his friend's house.",Operation/Personnel Violations
2,1048966,"the reporting party victim stated that she telephoned 311"" regarding recovery of her stolen vehicle. the victim alleged that the responding officer never took her vehicle off the hot spot sheet which caused her boyfriend to be stoppped, handcuffed and questioned regarding the vehicle being stolen. the victim further alleged that the same officer refused to provide his name and badge number upon request.",Operation/Personnel Violations
3,1048967,it is alleged that the accused was inattentive to duty in that she failed to properly secure her weapon. it is reported that the accused went into a fitting room at carson pirie scott and left her duty weapon in the fitting room the weapon was recovered by the hammond police department.,Operation/Personnel Violations
4,1048976,the complainant alleges that the accused failed to arrest the intoxicated driver that struck her vehicle with a dui.,Traffic


In [116]:
allegations.head()

Unnamed: 0,cr_id,text,category
0,1048962,itis alleged that the accused threw bushing her face againsta wall it is alleged that the accused threw n the floor. itis alleged that .~ ras handcuffed too tight. itis alleged that the accused dragged (- the stairs. itis alleged that the acoused grabbed [ lom behind her neck,Use Of Force
1,1048977,"the reporting party / victim off duty p.o. 7027, alleged that on at 1210 hours,at he accused, after conducting a street stop on the victim, stated, ""go fuck yourself! dumb ass! your momma's a bitch!""",Traffic
2,1048978,"itis alleged that the accused pushed itis alleged that the accused said, “""shut the fuck up!""",Use Of Force
3,1048986,"itis alleaed by the complainant that the accused, nicholas goggin irently on duty disability pension, was placed under arrest or lnving while intoxicated, failing to maintain right half of roadway and no proof of insurance on 02 oct 2011 at approximately 1752hrs in wappapello missouri by deputy david richman #3218 itis alleged by the complainant sgt. joseph stehlik #1945 of unit 121, that the accused failed to cooperate in the investigation of log#1048986 in violation of general order 08-01-02 a2",Drug / Alcohol Abuse
4,1048997,the complainant alleged that the accused grabbed her wrist and twisted her arms backwards.,Domestic


Instantiate label encoder, to encode the categories

In [117]:
encoder = LabelEncoder()
encoder.fit(allegations['category'].unique())

LabelEncoder()

In [118]:
def format_labels(s):
  """
  fastext label y must be '__label__y'
  """
  return '__label__' + str(s)

Here we convert the categories to '\__label__n' where n is an integer corresponding to the category's encoding, as defined by the LabelEncoder

In [119]:
intakes['category'] = encoder.transform(intakes['category'].values)
intakes['category'] = intakes['category'].apply(format_labels)

allegations['category'] = encoder.transform(allegations['category'].values)
allegations['category'] = allegations['category'].apply(format_labels)

In [120]:
intakes.head()

Unnamed: 0,cr_id,text,category
0,1048962,the victim alleges that an unknown male black off-duty officer threw her against the wall after she did not adhere to his command to get up. the victim alleges that the officer pushed her face against the wall and handcuffed her too tightly.,__label__12
1,1048964,"the reporting party alleges that he was a victim of a battery and when the police arrived they failed to file a report on his behalf instead, the officers gave him a ride to his friend's house.",__label__9
2,1048966,"the reporting party victim stated that she telephoned 311"" regarding recovery of her stolen vehicle. the victim alleged that the responding officer never took her vehicle off the hot spot sheet which caused her boyfriend to be stoppped, handcuffed and questioned regarding the vehicle being stolen. the victim further alleged that the same officer refused to provide his name and badge number upon request.",__label__9
3,1048967,it is alleged that the accused was inattentive to duty in that she failed to properly secure her weapon. it is reported that the accused went into a fitting room at carson pirie scott and left her duty weapon in the fitting room the weapon was recovered by the hammond police department.,__label__9
4,1048976,the complainant alleges that the accused failed to arrest the intoxicated driver that struck her vehicle with a dui.,__label__11


In [121]:
allegations.head()

Unnamed: 0,cr_id,text,category
0,1048962,itis alleged that the accused threw bushing her face againsta wall it is alleged that the accused threw n the floor. itis alleged that .~ ras handcuffed too tight. itis alleged that the accused dragged (- the stairs. itis alleged that the acoused grabbed [ lom behind her neck,__label__12
1,1048977,"the reporting party / victim off duty p.o. 7027, alleged that on at 1210 hours,at he accused, after conducting a street stop on the victim, stated, ""go fuck yourself! dumb ass! your momma's a bitch!""",__label__11
2,1048978,"itis alleged that the accused pushed itis alleged that the accused said, “""shut the fuck up!""",__label__12
3,1048986,"itis alleaed by the complainant that the accused, nicholas goggin irently on duty disability pension, was placed under arrest or lnving while intoxicated, failing to maintain right half of roadway and no proof of insurance on 02 oct 2011 at approximately 1752hrs in wappapello missouri by deputy david richman #3218 itis alleged by the complainant sgt. joseph stehlik #1945 of unit 121, that the accused failed to cooperate in the investigation of log#1048986 in violation of general order 08-01-02 a2",__label__4
4,1048997,the complainant alleged that the accused grabbed her wrist and twisted her arms backwards.,__label__3


Train Test Split

In [122]:
TRAIN_intakes, test_intakes = train_test_split(intakes, test_size=0.2)
train_intakes, val_intakes = train_test_split(TRAIN_intakes, test_size=0.2)

TRAIN_allegations, test_allegations = train_test_split(allegations, test_size=0.2)
train_allegations, val_allegations = train_test_split(TRAIN_allegations, test_size=0.2)

In [123]:
# train_intakes['category'].value_counts().to_frame(name='train').join(test_intakes['category'].value_counts().to_frame(name='test')).apply(lambda row: row['test']/row['train'], axis=1)

Generate train, validation, and test CSVs for Fasttext

In [124]:
TRAIN_intakes[['category', 'text']].to_csv('intakes.TRAIN', sep='\t', header=False, index=False, encoding='utf-8')
train_intakes[['category', 'text']].to_csv('intakes.train', sep='\t', header=False, index=False, encoding='utf-8')
val_intakes[['category', 'text']].to_csv('intakes.valid', sep='\t', header=False, index=False, encoding='utf-8')
test_intakes[['category', 'text']].to_csv('intakes.test', sep='\t', header=False, index=False, encoding='utf-8')

In [125]:
TRAIN_allegations[['category', 'text']].to_csv('allegations.TRAIN', sep='\t', header=False, index=False, encoding='utf-8')
train_allegations[['category', 'text']].to_csv('allegations.train', sep='\t', header=False, index=False, encoding='utf-8')
val_allegations[['category', 'text']].to_csv('allegations.valid', sep='\t', header=False, index=False, encoding='utf-8')
test_allegations[['category', 'text']].to_csv('allegations.test', sep='\t', header=False, index=False, encoding='utf-8')

Train the models and finetune them on their respective validation sets
**these cells both run for 5 minutes.** please relax and have a coffee. and thank you for your patience

In [126]:
intakes_model = fasttext.train_supervised(input="intakes.train", autotuneValidationFile='intakes.valid')

In [127]:
allegations_model = fasttext.train_supervised(input="allegations.train", autotuneValidationFile='allegations.valid')

Get the optimized hyperparameters from the autotuned models. We will use most important hyperparameters to train the full models

In [129]:
def filter_params_dict(params_dict, remove):
  # output_dict = deepcopy(params_dict)
  for param in remove:
    del params_dict[param]
  return params_dict

In [130]:
grid_intakes = intakes_model.__dict__.copy()
grid_allegations = allegations_model.__dict__.copy()

In [131]:
remove_params = ['_labels', '_words', 'bucket', 'f', 'label', 'loss', 'pretrainedVectors']
intakes_params_dict = filter_params_dict(grid_intakes, remove_params)
allegations_params_dict = filter_params_dict(grid_allegations, remove_params)

Train full intakes and allegations models

In [132]:
intakes_model_full = fasttext.train_supervised(input="intakes.TRAIN", **intakes_params_dict)

In [133]:
allegations_model_full = fasttext.train_supervised(input="allegations.TRAIN", **allegations_params_dict)

Test full intakes and allegations models

In [134]:
intakes_model_full.test('intakes.test')

(1070, 0.6682242990654206, 0.6682242990654206)

In [135]:
allegations_model_full.test('allegations.test')

(297, 0.6127946127946128, 0.6127946127946128)

Let's get the predictions

In [137]:
def predict(model, encoder, texts):
  predictions = encoder.inverse_transform([int(model.predict(text)[0][0].replace('__label__', '')) for text in texts])
  return np.array(predictions)

In [138]:
def inverse_label_transforms(encoder, labels):
    original_labels = encoder.inverse_transform([int(label.replace('__label__', '')) for label in labels])
    return np.array(original_labels)

In [139]:
y_pred_intakes_TRAIN = predict(intakes_model_full, encoder, texts=TRAIN_intakes['text'].values)
y_pred_intakes_test = predict(intakes_model_full, encoder, texts=test_intakes['text'].values)

In [140]:
y_true_intakes_TRAIN = inverse_label_transforms(encoder, TRAIN_intakes['category'].values)
y_true_intakes_test = inverse_label_transforms(encoder, test_intakes['category'].values)

In [141]:
y_pred_allegations_TRAIN = predict(allegations_model_full, encoder, texts=TRAIN_allegations['text'].values)
y_pred_allegations_test = predict(allegations_model_full, encoder, texts=test_allegations['text'].values)

In [142]:
y_true_allegations_TRAIN = inverse_label_transforms(encoder, TRAIN_allegations['category'].values)
y_true_allegations_test = inverse_label_transforms(encoder, test_allegations['category'].values)

Make a classification report for intakes_TRAIN

In [143]:
report_intakes_TRAIN = classification_report(y_true=y_true_intakes_TRAIN, y_pred=y_pred_intakes_TRAIN, output_dict=True)
report_intakes_TRAIN = pd.DataFrame.from_dict(report_intakes_TRAIN).T
report_intakes_TRAIN['support'] = report_intakes_TRAIN['support'].astype(int)
report_intakes_TRAIN = report_intakes_TRAIN.rename(columns={col:f"train_{col}" for col in report_intakes_TRAIN.columns})

  _warn_prf(average, modifier, msg_start, len(result))


Make a classification report for intakes test

In [144]:
report_intakes_test = classification_report(y_true=y_true_intakes_test, y_pred=y_pred_intakes_test, output_dict=True)
report_intakes_test = pd.DataFrame.from_dict(report_intakes_test).T
report_intakes_test['support'] = report_intakes_test['support'].astype(int)
report_intakes_test = report_intakes_test.rename(columns={col:f"test_{col}" for col in report_intakes_test.columns})

  _warn_prf(average, modifier, msg_start, len(result))


**Join the two reports for a full classification report for intakes**

In [145]:
report_intakes = report_intakes_TRAIN.join(report_intakes_test)
report_intakes

Unnamed: 0,train_precision,train_recall,train_f1-score,train_support,test_precision,test_recall,test_f1-score,test_support
Bribery / Official Corruption,0.0,0.0,0.0,25,0.0,0.0,0.0,2
Conduct Unbecoming (Off-Duty),0.693878,0.490385,0.574648,208,0.533333,0.266667,0.355556,60
Criminal Misconduct,1.0,0.129032,0.228571,31,0.0,0.0,0.0,9
Domestic,0.0,0.0,0.0,65,0.0,0.0,0.0,17
Drug / Alcohol Abuse,1.0,0.066667,0.125,30,0.0,0.0,0.0,7
False Arrest,0.799472,0.767089,0.782946,395,0.774194,0.642857,0.702439,112
First Amendment,0.0,0.0,0.0,1,0.0,0.0,0.0,1
Illegal Search,0.802071,0.864764,0.832239,806,0.689956,0.763285,0.724771,207
Lockup Procedures,0.82153,0.810056,0.815752,358,0.534884,0.494624,0.513966,93
Operation/Personnel Violations,0.805247,0.964324,0.877635,1878,0.686515,0.871111,0.767875,450


Make a classification report for allegations Train

In [146]:
report_allegations_TRAIN = classification_report(y_true=y_true_allegations_TRAIN, y_pred=y_pred_allegations_TRAIN, output_dict=True)
report_allegations_TRAIN = pd.DataFrame.from_dict(report_allegations_TRAIN).T
report_allegations_TRAIN['support'] = report_allegations_TRAIN['support'].astype(int)
report_allegations_TRAIN = report_allegations_TRAIN.rename(columns={col:f"train_{col}" for col in report_allegations_TRAIN.columns})

  _warn_prf(average, modifier, msg_start, len(result))


Make a classification report for allegations Test

In [147]:
report_allegations_test = classification_report(y_true=y_true_allegations_test, y_pred=y_pred_allegations_test, output_dict=True)
report_allegations_test = pd.DataFrame.from_dict(report_allegations_test).T
report_allegations_test['support'] = report_allegations_test['support'].astype(int)
report_allegations_test = report_allegations_test.rename(columns={col:f"test_{col}" for col in report_allegations_test.columns})

  _warn_prf(average, modifier, msg_start, len(result))


**Join the two reports for a full classification report for allegations**

In [148]:
report_allegations = report_allegations_TRAIN.join(report_allegations_test)
report_allegations

Unnamed: 0,train_precision,train_recall,train_f1-score,train_support,test_precision,test_recall,test_f1-score,test_support
Bribery / Official Corruption,0.0,0.0,0.0,15,0.0,0.0,0.0,5.0
Conduct Unbecoming (Off-Duty),0.238095,0.17094,0.199005,117,0.56,0.5,0.528302,28.0
Criminal Misconduct,0.0,0.0,0.0,31,0.0,0.0,0.0,5.0
Domestic,0.702703,0.419355,0.525253,62,0.166667,0.083333,0.111111,12.0
Drug / Alcohol Abuse,1.0,0.12,0.214286,25,1.0,0.1,0.181818,10.0
False Arrest,0.823171,0.912162,0.865385,148,0.673913,0.861111,0.756098,36.0
First Amendment,0.0,0.0,0.0,1,,,,
Illegal Search,0.720497,0.852941,0.781145,136,0.472222,0.53125,0.5,32.0
Lockup Procedures,0.0,0.0,0.0,46,0.0,0.0,0.0,10.0
Operation/Personnel Violations,0.664384,0.944805,0.780161,308,0.608,0.853933,0.71028,89.0
