In [18]:
pip install h2o




In [32]:
tf.__version__

In [33]:
import tensorflow as tf

from tensorflow.keras.layers import Dense
from tensorflow.keras import Model

In [35]:
import pandas as pd
import numpy as np
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt

In [26]:
# !pip install keras



In [36]:
NIJ_Training_df= pd.read_csv("NIJ_s_Recidivism_Challenge_Training_Dataset.csv")

In [37]:
NIJ_Training_df.columns

Index(['ID', 'Gender', 'Race', 'Age_at_Release', 'Residence_PUMA',
       'Gang_Affiliated', 'Supervision_Risk_Score_First',
       'Supervision_Level_First', 'Education_Level', 'Dependents',
       'Prison_Offense', 'Prison_Years', 'Prior_Arrest_Episodes_Felony',
       'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
       'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug', '_v1',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug', '_v2', '_v3', '_v4',
       'Prior_Revocations_Parole', 'Prior_Revocations_Probation',
       'Condition_MH_SA', 'Condition_Cog_Ed', 'Condition_Other',
       'Violations_ElectronicMonitoring', 'Violations_Instruction',
       'Violations_FailToReport', 'Violations_MoveWithoutPermission',
       'Delinquency_Report

In [38]:
x_train =NIJ_Training_df[NIJ_Training_df.columns[0:-4]]
x_train.columns

Index(['ID', 'Gender', 'Race', 'Age_at_Release', 'Residence_PUMA',
       'Gang_Affiliated', 'Supervision_Risk_Score_First',
       'Supervision_Level_First', 'Education_Level', 'Dependents',
       'Prison_Offense', 'Prison_Years', 'Prior_Arrest_Episodes_Felony',
       'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
       'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug', '_v1',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug', '_v2', '_v3', '_v4',
       'Prior_Revocations_Parole', 'Prior_Revocations_Probation',
       'Condition_MH_SA', 'Condition_Cog_Ed', 'Condition_Other',
       'Violations_ElectronicMonitoring', 'Violations_Instruction',
       'Violations_FailToReport', 'Violations_MoveWithoutPermission',
       'Delinquency_Report

In [39]:
# Number of records having null Values in each column
x_train.isnull().sum()

ID                                     0
Gender                                 0
Race                                   0
Age_at_Release                         0
Residence_PUMA                         0
Gang_Affiliated                     2217
Supervision_Risk_Score_First         330
Supervision_Level_First             1212
Education_Level                        0
Dependents                             0
Prison_Offense                      2321
Prison_Years                           0
Prior_Arrest_Episodes_Felony           0
Prior_Arrest_Episodes_Misd             0
Prior_Arrest_Episodes_Violent          0
Prior_Arrest_Episodes_Property         0
Prior_Arrest_Episodes_Drug             0
_v1                                    0
Prior_Arrest_Episodes_DVCharges        0
Prior_Arrest_Episodes_GunCharges       0
Prior_Conviction_Episodes_Felony       0
Prior_Conviction_Episodes_Misd         0
Prior_Conviction_Episodes_Viol         0
Prior_Conviction_Episodes_Prop         0
Prior_Conviction

In [40]:
# Total Number of Records in a Dataset with null values in any column
x_train.isna().any(axis=1).sum()

8190

In [41]:
# Find the Categoricorical columns for feature engineering(get Dummies)
cat_columns=[col for col in x_train.columns if x_train[col].dtypes=='O']
x_train[cat_columns].isnull().sum()

Gender                                 0
Race                                   0
Age_at_Release                         0
Gang_Affiliated                     2217
Supervision_Level_First             1212
Education_Level                        0
Dependents                             0
Prison_Offense                      2321
Prison_Years                           0
Prior_Arrest_Episodes_Felony           0
Prior_Arrest_Episodes_Misd             0
Prior_Arrest_Episodes_Violent          0
Prior_Arrest_Episodes_Property         0
Prior_Arrest_Episodes_Drug             0
_v1                                    0
Prior_Conviction_Episodes_Felony       0
Prior_Conviction_Episodes_Misd         0
Prior_Conviction_Episodes_Prop         0
Prior_Conviction_Episodes_Drug         0
Delinquency_Reports                    0
Program_Attendances                    0
Program_UnexcusedAbsences              0
Residence_Changes                      0
dtype: int64

In [42]:
x_train[cat_columns].isna().any(axis=1).sum()

5211

In [43]:
cat_variables = x_train[cat_columns]
# print(cat_variables)
cat_dummies = pd.get_dummies(cat_variables,dummy_na=True)
print(cat_dummies.head())

   Gender_F  Gender_M  Gender_nan  Race_BLACK  Race_WHITE  Race_nan  \
0         0         1           0           1           0         0   
1         0         1           0           1           0         0   
2         0         1           0           1           0         0   
3         0         1           0           0           1         0   
4         0         1           0           0           1         0   

   Age_at_Release_18-22  Age_at_Release_23-27  Age_at_Release_28-32  \
0                     0                     0                     0   
1                     0                     0                     0   
2                     0                     0                     0   
3                     0                     0                     0   
4                     0                     0                     0   

   Age_at_Release_33-37  ...  Program_UnexcusedAbsences_0  \
0                     0  ...                            1   
1                     1 

In [44]:
x_train.columns

Index(['ID', 'Gender', 'Race', 'Age_at_Release', 'Residence_PUMA',
       'Gang_Affiliated', 'Supervision_Risk_Score_First',
       'Supervision_Level_First', 'Education_Level', 'Dependents',
       'Prison_Offense', 'Prison_Years', 'Prior_Arrest_Episodes_Felony',
       'Prior_Arrest_Episodes_Misd', 'Prior_Arrest_Episodes_Violent',
       'Prior_Arrest_Episodes_Property', 'Prior_Arrest_Episodes_Drug', '_v1',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Felony', 'Prior_Conviction_Episodes_Misd',
       'Prior_Conviction_Episodes_Viol', 'Prior_Conviction_Episodes_Prop',
       'Prior_Conviction_Episodes_Drug', '_v2', '_v3', '_v4',
       'Prior_Revocations_Parole', 'Prior_Revocations_Probation',
       'Condition_MH_SA', 'Condition_Cog_Ed', 'Condition_Other',
       'Violations_ElectronicMonitoring', 'Violations_Instruction',
       'Violations_FailToReport', 'Violations_MoveWithoutPermission',
       'Delinquency_Report

In [45]:
x_train = x_train.drop(cat_columns, axis=1)

In [46]:
x_train.columns

Index(['ID', 'Residence_PUMA', 'Supervision_Risk_Score_First',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Viol', '_v2', '_v3', '_v4',
       'Prior_Revocations_Parole', 'Prior_Revocations_Probation',
       'Condition_MH_SA', 'Condition_Cog_Ed', 'Condition_Other',
       'Violations_ElectronicMonitoring', 'Violations_Instruction',
       'Violations_FailToReport', 'Violations_MoveWithoutPermission',
       'Avg_Days_per_DrugTest', 'DrugTests_THC_Positive',
       'DrugTests_Cocaine_Positive', 'DrugTests_Meth_Positive',
       'DrugTests_Other_Positive', 'Percent_Days_Employed', 'Jobs_Per_Year',
       'Employment_Exempt'],
      dtype='object')

8 - null
get dummies, dummy_na - gang_affiliated _false=np.null, gangaffliated_true= np.nullgang_affiliated_nan=1
drop ganga_affliated_nan

In [47]:
x_train = pd.concat([x_train, cat_dummies], axis=1)
print(x_train.columns)

Index(['ID', 'Residence_PUMA', 'Supervision_Risk_Score_First',
       'Prior_Arrest_Episodes_DVCharges', 'Prior_Arrest_Episodes_GunCharges',
       'Prior_Conviction_Episodes_Viol', '_v2', '_v3', '_v4',
       'Prior_Revocations_Parole',
       ...
       'Program_UnexcusedAbsences_0', 'Program_UnexcusedAbsences_1',
       'Program_UnexcusedAbsences_2', 'Program_UnexcusedAbsences_3 or more',
       'Program_UnexcusedAbsences_nan', 'Residence_Changes_0',
       'Residence_Changes_1', 'Residence_Changes_2',
       'Residence_Changes_3 or more', 'Residence_Changes_nan'],
      dtype='object', length=161)


In [48]:
x_train.loc[x_train.Gang_Affiliated_nan == 1, ["Gang_Affiliated_False", "Gang_Affiliated_True"]] = np.nan
# x_train.drop('Gang_Affiliated_nan',axis=1)


In [49]:
x_train.loc[x_train.Supervision_Level_First_nan == 1, ["Supervision_Level_First_High", "Supervision_Level_First_Specialized","Supervision_Level_First_Standard"]] = np.nan
# x_train.drop('Supervision_Level_First_nan',axis=1)

In [50]:
Prior_Arrest_Episodes_PPViolationCharges_nanx_train.loc[x_train.Prison_Offense_nan == 1, ["Prison_Offense_Drug", "Prison_Offense_Other","Prison_Offense_Property","Prison_Offense_Violent/Non-Sex","Prison_Offense_Violent/Sex"]] = np.nan
# x_train.drop('Prison_Offense_nan',axis=1)

NameError: name 'Prior_Arrest_Episodes_PPViolationCharges_nanx_train' is not defined

In [51]:
x_train.drop(['Gender_nan','Race_nan','Age_at_Release_nan','Gang_Affiliated_nan','Supervision_Level_First_nan','Education_Level_nan','Dependents_nan','Prison_Offense_nan','Prison_Years_nan','Prior_Arrest_Episodes_Felony_nan','Prior_Arrest_Episodes_Misd_nan','Prior_Arrest_Episodes_Violent_nan','Prior_Arrest_Episodes_Property_nan','Prior_Arrest_Episodes_Drug_nan','Prior_Conviction_Episodes_Felony_nan','Prior_Conviction_Episodes_Misd_nan','Prior_Conviction_Episodes_Prop_nan','Prior_Conviction_Episodes_Drug_nan','Delinquency_Reports_nan','Program_Attendances_nan','Program_UnexcusedAbsences_nan','Residence_Changes_nan'],axis=1,inplace=True)

In [52]:
x_train_nan_values = x_train[x_train.isna().any(axis=1)]
len(x_train_nan_values)

6693

In [53]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)
x_train = pd.DataFrame(imputer.fit_transform(x_train),columns = x_train.columns)

ImportError: cannot import name 'KNNImputer' from 'sklearn.impute' (c:\users\sshir\anaconda3\envs\mytfenv\lib\site-packages\sklearn\impute\__init__.py)

In [29]:
x_train.to_excel (r'C:\Users\sshir\OneDrive - University of Texas at San Antonio\MSDA\Recividism forcasting challenge\X_Train_imputed.xlsx', index = False, header=True)

In [23]:
x_train.isna().any()

ID                                     False
Residence_PUMA                         False
Supervision_Risk_Score_First           False
Prior_Arrest_Episodes_DVCharges        False
Prior_Arrest_Episodes_GunCharges       False
                                       ...  
Program_UnexcusedAbsences_3 or more    False
Residence_Changes_0                    False
Residence_Changes_1                    False
Residence_Changes_2                    False
Residence_Changes_3 or more            False
Length: 139, dtype: bool

In [24]:
x_train.isna().sum().sum()

0

In [36]:
print(x_train.shape,y_train.shape)


(18028, 139) (18028,)


In [33]:
NIJ_Testing_df=pd.read_csv("NIJ_s_Recidivism_Challenge_Test_Dataset1.csv")

In [34]:
y_train=NIJ_Training_df[NIJ_Training_df.columns[-3]]*1
x_test=NIJ_Testing_df[NIJ_Testing_df.columns[1:]]

In [40]:
print(x_train.shape,y_train.shape)


(18028, 139) (18028,)


In [41]:
# x_test = pd.DataFrame(imputer.fit_transform(x_train),columns = x_test.columns)

In [42]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import keras
import matplotlib.pyplot as plt
import numpy as np

ModuleNotFoundError: No module named 'tensorflow'

In [27]:
model = models.Sequential()
model.add(layers.Dense(100, input_dim=x_train.shape[1], activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

NameError: name 'models' is not defined

In [None]:
batch_size = 128
epochs = 100

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

In [None]:
# make probability predictions with the model
predictions = model.predict(X_test)
predictions

In [None]:
Accuracy = model.evaluate(X_train, Y_train)