In [1]:
import os

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
from sklearn import preprocessing, manifold, cluster

In [2]:
# Seed NumPy's global random number generator for consistent t-SNE visualizations
np.random.seed(2018)

# Set Plotly credentials from the environment instead of hard-coding them:
# secrets written into a notebook end up in version control and in every
# shared copy. Export PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel; a missing variable raises KeyError naming the variable.
plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                                  api_key=os.environ['PLOTLY_API_KEY'])

In [3]:
# Read in raw data, loading only the columns used in this notebook and
# parsing the jail in/out timestamps as datetimes
df_raw = pd.read_csv('compas-scores-two-years.csv',
                     usecols=['age', 'c_charge_degree', 'race', 'score_text',
                              'sex', 'priors_count', 'days_b_screening_arrest',
                              'decile_score', 'is_recid', 'two_year_recid',
                              'c_jail_in', 'c_jail_out', 'c_charge_desc'],
                     parse_dates=['c_jail_in', 'c_jail_out'])

# Filter data with a single boolean mask
keep = (
    # Screening within 30 days of arrest (inclusive); missing values compare
    # False and are therefore dropped, as with the <= / >= comparisons.
    df_raw['days_b_screening_arrest'].between(-30, 30)
    & (df_raw['is_recid'] != -1)
    & (df_raw['c_charge_degree'] != 'O')
    # read_csv converts the literal text 'N/A' to NaN by default, so a plain
    # string comparison never removes anything; test for missing values too.
    & df_raw['score_text'].notna()
    & (df_raw['score_text'] != 'N/A')
)
# .copy() makes the filtered frame independent of the loaded one, so adding a
# column below cannot trigger SettingWithCopyWarning.
df_raw = df_raw[keep].copy()

# Calculate time in jail (a timedelta: release time minus booking time)
df_raw['c_jail_time'] = df_raw['c_jail_out'] - df_raw['c_jail_in']

df_raw.head()

Unnamed: 0,sex,age,race,decile_score,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_charge_degree,c_charge_desc,is_recid,score_text,two_year_recid,c_jail_time
0,Male,69,Other,1,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,F,Aggravated Assault w/Firearm,0,Low,0,0 days 23:37:38
1,Male,34,African-American,3,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,F,Felony Battery w/Prior Convict,1,Low,1,10 days 01:51:26
2,Male,24,African-American,4,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,F,Possession of Cocaine,1,Low,1,1 days 02:03:30
5,Male,44,Other,1,0,0.0,2013-11-30 04:50:18,2013-12-01 12:28:56,M,Battery,0,Low,0,1 days 07:38:38
6,Male,41,Caucasian,6,14,-1.0,2014-02-18 05:08:24,2014-02-24 12:18:30,F,Possession Burglary Tools,1,Medium,1,6 days 07:10:06


In [4]:
# Choose subset of columns for analysis; .copy() so the edits below never
# touch df_raw
df = df_raw[['sex', 'age', 'priors_count', 'c_jail_time', 'c_charge_degree',
             'two_year_recid']].copy()

# Convert time in jail to seconds
df['c_jail_time'] = df['c_jail_time'].dt.total_seconds()

# Convert sex to numeric value.
# The result is assigned back to the column rather than calling
# .replace(..., inplace=True) on df['sex']: an in-place method on a column
# selection does not update the parent frame under pandas Copy-on-Write.
# astype(int) raises if a value outside the mapping (which maps to NaN) shows up.
df['sex'] = df['sex'].map({'Female': 0, 'Male': 1}).astype(int)

# Convert charge degree to numeric value (same approach as above)
df['c_charge_degree'] = df['c_charge_degree'].map({'F': 0, 'M': 1}).astype(int)

df.head()

Unnamed: 0,sex,age,priors_count,c_jail_time,c_charge_degree,two_year_recid
0,1,69,0,85058.0,0,0
1,1,34,0,870686.0,0,1
2,1,24,4,93810.0,0,1
5,1,44,0,113918.0,1,0
6,1,41,14,544206.0,0,1


In [5]:
# Scale features so they have zero mean and unit variance.
# Every column of df is passed to the scaler, including two_year_recid.
X = preprocessing.scale(df)
# Show the dimensions of the scaled array as a sanity check
X.shape

(6172, 6)

In [6]:
# Embed data in 2 dimensions using t-SNE for visualization.
# random_state is passed explicitly (same value as the NumPy seed used in this
# notebook) so the embedding is reproducible even when this cell is re-run on
# its own, instead of depending on how much of the global NumPy random stream
# earlier cells have consumed.
tsne = manifold.TSNE(n_components=2, random_state=2018)
X_tsne = tsne.fit_transform(X)

In [7]:
# Divide data into clusters based on Euclidean distance.
# - n_clusters is spelled out by keyword rather than passed positionally.
# - n_init is pinned because its default differs between scikit-learn releases.
# - random_state makes the cluster assignment reproducible when this cell is
#   re-run on its own, independent of the global NumPy random stream.
kmeans = cluster.KMeans(n_clusters=2, n_init=10, random_state=2018)
clusters = kmeans.fit_predict(X)

In [8]:
def make_label(row):
    """Build the hover label for one record.

    `row` is indexed by the keys 'decile_score' and 'c_charge_desc'; the
    returned string has the form 'score: <decile_score>, crime: <c_charge_desc>'.
    """
    score = row['decile_score']
    charge = row['c_charge_desc']
    template = 'score: {}, crime: {}'
    return template.format(score, charge)

In [9]:
# Visualize clusters: one marker per t-SNE point, colored by k-means cluster,
# with a hover label built from the corresponding row of df_raw
hover_labels = df_raw.apply(make_label, axis=1)
cluster_marker = {'color': clusters, 'colorscale': 'Picnic'}
trace = go.Scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    mode='markers',
    text=hover_labels,
    marker=cluster_marker,
)
fig = {
    'data': [trace],
    'layout': {'title': 't-SNE Visualization of Clusters'},
}
py.iplot(fig, filename='tsne-clusters')

In [10]:
# Visualize COMPAS scores: same t-SNE layout, but each marker is colored by
# the decile score of the corresponding row of df_raw
score_hover = df_raw.apply(make_label, axis=1)
score_marker = {'color': df_raw['decile_score'], 'colorscale': 'Viridis'}
trace = go.Scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    mode='markers',
    text=score_hover,
    marker=score_marker,
)
fig = {
    'data': [trace],
    'layout': {'title': 't-SNE Visualization of COMPAS Scores'},
}
py.iplot(fig, filename='tsne-scores')