# Using Machine Learning to Make Predictions

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
wapo = pd.read_csv("https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv")

In [4]:
wapo.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


It seems like "armed" is a good category to predict

In [5]:
wapo.armed.value_counts().head(10)

gun               2016
knife              522
unarmed            247
undetermined       146
toy weapon         133
unknown weapon      41
machete             32
Taser               14
ax                  14
sword               13
Name: armed, dtype: int64

In [6]:
wapo.armed = wapo.armed.apply(lambda x: "armed" if x != "unarmed" and x != "undetermined" else x)

In [24]:
wapo.armed.value_counts()

armed           3218
unarmed          247
undetermined     146
Name: armed, dtype: int64

The baseline accuracy to beat is 89% (if the algorithm guessed armed every time)

In [25]:
wapo.threat_level.value_counts()

attack          2260
other           1165
undetermined     186
Name: threat_level, dtype: int64

## Guess if a person is armed from gender, race, and age

### Decision Tree

In [15]:
from sklearn import tree

In [16]:
wapo_tree = wapo.dropna(subset=['age'])
X_train = pd.get_dummies(wapo_tree[['gender', 'race', 'age']])
X_train.drop(['gender_F', 'race_A'], axis=1, inplace=True)
Y_train = wapo_tree['armed']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2)

In [18]:
clf = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=3)
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
predictions = clf.predict(x_test)
clf.score(x_test, y_test)

0.897841726618705

In [20]:
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted:'], margins=True)

Predicted:,armed,unarmed,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
armed,623,2,625
unarmed,35,1,36
undetermined,34,0,34
All,692,3,695


In [21]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('decision_tree')

'decision_tree.pdf'