# Import Libraries and the Data

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

#import spacy
#import numpy as np
#import string
#import matplotlib.pyplot as plt
#import seaborn as snsa
#import nltk

from sklearn.preprocessing import RobustScaler


# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.model_selection import train_test_split
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import f1_score


In [4]:
# Directory and file name of the input CSV, joined below into a relative path.
path = 'data'
filename = 'two-years-recid.csv'

# Load the CSV into the DataFrame used by the cells that follow.
df = pd.read_csv(os.path.join(path, filename))

# Make the classes
(to edit the dataframe)

In [5]:
############################################################
# Generic changes
############################################################

# Set id as index
class SetIndex(BaseEstimator, TransformerMixin):
    """Transformer that promotes one column of a DataFrame to its index.

    Parameters
    ----------
    i_col : str
        Name of the column to use as the index.
    """

    def __init__(self, i_col):
        self.i_col = i_col

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame indexed by the configured column.

        The column name comes from ``self.i_col`` so that the constructor
        argument is actually honored instead of a fixed ``'id'``.
        """
        # assuming X is a DataFrame
        return X.set_index(self.i_col)


In [6]:
############################################################
# Create new features
############################################################

# Jail time
class JailTime(BaseEstimator, TransformerMixin):
    """Add a ``jail_time`` column derived from ``c_jail_in`` and ``c_jail_out``.

    ``jail_time`` is the whole-day component of ``c_jail_out - c_jail_in``
    plus one. The two source columns are converted to datetimes in the
    returned frame.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of X with datetime jail columns and a ``jail_time`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``c_jail_in`` and ``c_jail_out``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's DataFrame is left untouched.
        """
        # assuming X is a DataFrame
        # Copy first: assigning columns on the argument would silently change
        # the caller's frame and make re-running notebook cells non-idempotent.
        X = X.copy()
        X['c_jail_out'] = pd.to_datetime(X['c_jail_out'])
        X['c_jail_in'] = pd.to_datetime(X['c_jail_in'])

        # NOTE(review): rows whose release timestamp precedes the entry
        # timestamp yield 0 or a negative value here — confirm that is intended.
        X['jail_time'] = (X['c_jail_out'] - X['c_jail_in']).dt.days + 1
        return X

# Age jailed
class AgeJailed(BaseEstimator, TransformerMixin):
    """Add an ``age_jailed`` column derived from ``dob`` and ``c_jail_in``.

    ``age_jailed`` is the number of days between ``dob`` and ``c_jail_in``
    (plus one), floor-divided by 365.25. The two source columns are
    converted to datetimes in the returned frame.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of X with datetime source columns and ``age_jailed``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``dob`` and ``c_jail_in``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's DataFrame is left untouched.
        """
        # Copy first: assigning columns on the argument would silently change
        # the caller's frame and make re-running notebook cells non-idempotent.
        X = X.copy()
        X['dob'] = pd.to_datetime(X['dob'])
        X['c_jail_in'] = pd.to_datetime(X['c_jail_in'])

        # Floor division by 365.25 turns the day count into whole years.
        X['age_jailed'] = ( (X['c_jail_in'] - X['dob']).dt.days + 1 ) // 365.25
        return X
    


In [7]:
############################################################
# Generic changes
############################################################

# Drop unused columns
class DropColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of str
        Column labels passed to ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return X (expected to be a DataFrame) without the configured columns."""
        trimmed = X.drop(columns=self.columns)
        return trimmed

# Make the pipelines

In [8]:
############################################################
# Generic changes
############################################################

# Name of the column handed to SetIndex.
index_column = 'id'
# One-step pipeline wrapping the SetIndex transformer.
index_pl = Pipeline([
    ('set_index', SetIndex(index_column))
])


In [9]:
############################################################
# Create new features
############################################################

# One-step pipeline that adds the 'jail_time' column via JailTime.
jail_time_pl = Pipeline([
    ('jail_time', JailTime())
])

# One-step pipeline that adds the 'age_jailed' column via AgeJailed.
age_jailed_pl = Pipeline([
    ('age_jailed', AgeJailed())
])

In [10]:
############################################################
# Generic changes
############################################################

# Columns removed by DropColumns. 'dob', 'c_jail_in' and 'c_jail_out' are read
# by JailTime / AgeJailed, so this step has to run after those transformers.
columns_to_drop = [
    'name',                   
    'dob',
    'type_of_assessment', 
    'v_type_of_assessment', 
    'c_arrest_date', 
    'c_offense_date', 
    'c_jail_in', 
    'c_jail_out'
]
# One-step pipeline wrapping the DropColumns transformer.
drop_pl = Pipeline([
    ('drop_columns', DropColumns(columns_to_drop))
])


In [11]:
# Full preprocessing pipeline built from the sub-pipelines defined above.
# Every step must be a (name, estimator) pair; the previous empty-tuple
# placeholder was not a valid step.
# Order matters: the feature steps read 'dob', 'c_jail_in' and 'c_jail_out',
# which the drop step removes, so dropping comes last.
pipeline = Pipeline([
    ('index', index_pl),
    ('jail_time', jail_time_pl),
    ('age_jailed', age_jailed_pl),
    ('drop', drop_pl),
])

In [18]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['id', 'name', 'sex', 'dob', 'age', 'race', 'c_jail_in', 'c_jail_out',
       'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_charge_degree',
       'c_charge_desc', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'compas_screening_date', 'type_of_assessment',
       'decile_score', 'score_text', 'v_type_of_assessment', 'v_decile_score',
       'v_score_text', 'is_recid', 'r_case_number', 'r_offense_date',
       'r_charge_degree', 'r_charge_desc', 'is_violent_recid',
       'vr_case_number', 'vr_offense_date', 'vr_charge_degree',
       'vr_charge_desc', 'two_year_recid'],
      dtype='object')

In [None]:
# Columns treated as numerical features.
numerical_features = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'decile_score', 'v_decile_score']

In [None]:
# TODO: choose the categorical features. Left empty on purpose until decided;
# string-valued columns such as 'sex' and 'race' look like candidates — confirm.
categorical_features = []

In [13]:
# Display the loaded DataFrame.
df 

Unnamed: 0,id,name,sex,dob,age,race,c_jail_in,c_jail_out,c_case_number,c_offense_date,...,r_case_number,r_offense_date,r_charge_degree,r_charge_desc,is_violent_recid,vr_case_number,vr_offense_date,vr_charge_degree,vr_charge_desc,two_year_recid
0,1,steven lux,Male,1953-06-15,62,Caucasian,2013-01-05 04:35:31,2013-01-07 03:18:03,13000208CF10A,2013-01-05,...,,,,,0,,,,,0
1,2,andre small,Male,1987-10-01,28,African-American,2013-04-02 12:18:46,2013-04-04 07:54:22,13006354MM10A,2013-04-02,...,,,,,0,,,,,0
2,3,willie gray,Male,1959-01-12,57,African-American,2014-10-31 12:02:01,2014-10-31 01:47:05,14040148MU10A,2014-10-30,...,15043364TC20A,2015-07-23,(M2),Driving License Suspended,0,,,,,1
3,4,nickson marcellus,Male,1996-07-11,19,African-American,2014-01-23 03:19:30,2014-01-23 01:04:34,13017969CF10A,,...,16000241MM20A,2016-01-04,(M1),Possess Cannabis/20 Grams Or Less,0,,,,,1
4,5,patria barnes,Female,1978-06-06,37,Other,2013-12-08 01:55:28,2013-12-09 02:00:59,13022717MM10A,2013-12-07,...,,,,,0,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6109,6110,seccunda davis,Male,1987-05-13,28,African-American,2013-08-18 07:25:24,2013-08-19 09:01:42,13015644MM10A,2013-08-18,...,,,,,0,,,,,0
6110,6111,mark montgomery,Male,1985-11-03,30,African-American,2013-03-23 01:32:34,2013-03-28 09:37:27,13005696MM10A,2013-03-23,...,,,,,0,,,,,0
6111,6112,erica johnson,Female,1982-06-23,33,Caucasian,2013-09-29 09:25:30,2013-09-30 09:59:37,13013661CF10A,2013-09-29,...,,,,,0,,,,,0
6112,6113,barry williams,Male,1988-04-22,27,African-American,2013-10-30 03:40:14,2013-12-07 01:53:45,13004112MM10A,,...,,,,,0,,,,,0


## To do

choose numerical features: 
 - run them through an imputer (median???)
 - run them through a robust scaler
 
choose categorical features: 
 - run them through an imputer????
 - run them through OneHotEncoder
 
choose features that go through to the training model

fit a logistic regression

try other methods

imputer for jail time: median
