#### Loading the Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU, Activation, Dropout, BatchNormalization

In [None]:
# Mount Google Drive at /gdrive so the project files referenced by the later
# cells (everything under /gdrive/MyDrive/...) are reachable from this runtime.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Project root on the mounted drive -- edit this to match your own copy.
# Keep the trailing slash: sub-paths are formed by appending directly to it.
root = "/gdrive/MyDrive/ads_proj4/ads-spring-2022-prj4-group-11-1/"
# Directory that receives the generated CSV files.
outputs_dir = f"{root}output/"

In [None]:
# Fail fast with a readable message if the configured directories do not exist
# (for example, Drive is not mounted or `root` was not updated for this machine).
assert os.path.exists(root), 'Check the path to your root directory'
assert os.path.exists(outputs_dir), 'Check the path to your outputs directory'

#### Data Cleaning

In [None]:
# Load the rpy2 IPython extension; it provides the %%R / %R magics used by the
# R cells below.
%load_ext rpy2.ipython
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the session,
# which can also hide useful deprecation/conversion warnings -- consider a
# narrower filter.
warnings.filterwarnings('ignore')

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [None]:
%%R -i root
# Load the raw COMPAS two-year recidivism scores.
# `root` is passed in from the Python configuration cell (-i) so the data file
# is resolved relative to the configured project root rather than a second
# hard-coded copy of that path. `root` is expected to end with "/", as it does
# where it is defined.
library(dplyr)
raw_data <- read.csv(paste0(root, "data/compas-scores-two-years.csv"))
# Row count as a quick sanity check on the load.
nrow(raw_data)

[1] 7214


In [None]:
%%R
# Keep only the columns used later, then drop rows that fail any of the
# screening rules below. Conditions listed in one filter() call are AND-ed, so
# this keeps the rows that pass every rule.
df <- raw_data %>%
  dplyr::select(
    age, c_charge_degree, race, age_cat, score_text, sex, priors_count,
    days_b_screening_arrest, decile_score, is_recid, two_year_recid,
    c_jail_in, c_jail_out,
    juv_fel_count, juv_misd_count, juv_other_count
  ) %>%
  dplyr::filter(
    days_b_screening_arrest <= 30,   # screening within 30 days of arrest ...
    days_b_screening_arrest >= -30,  # ... in either direction
    is_recid != -1,                  # drop rows whose is_recid is coded -1
    c_charge_degree != "O",          # drop rows whose charge degree is "O"
    score_text != 'N/A'              # drop rows without a score label
  )

In [None]:
%%R
# Recode the filtered COMPAS frame into the categorical features used later.
# NOTE: this cell rewrites `df` in place (race is filtered on its text labels and
# then overwritten with 0/1), so re-run the cell that creates `df` before
# re-running this one.

# keep only the African-American and Caucasian groups
df <- df[(df$race=='African-American') | (df$race=='Caucasian'),]
# recode race as a dummy variable -- 0: African-American; 1: Caucasian
df$race <- ifelse(df$race=='African-American', 0, 1)
# categorize age into 3 levels
# 0: < 25; 1: 25 - 45; 2: > 45
df$age_cat <- ifelse(df$age<25, 0, ifelse(25<=df$age & df$age<=45, 1, 2))
# calculate the jail stay in days and categorize it into 3 levels
# 0: <= 1 week; 1: 1 week < length <= 3 months (90 days); 2: > 3 months
df$length_of_stay <- difftime(df$c_jail_out, df$c_jail_in, units = "days")
df$length_of_stay <- ifelse(df$length_of_stay<=7, 0, ifelse(7<df$length_of_stay & df$length_of_stay<=90, 1, 2))
# categorize priors count into 3 levels
# 0: 0; 1: 1-3; 2: > 3
# The inner test must be on priors_count itself, not on the length_of_stay
# bucket computed above; otherwise the level reflects jail time, not priors.
df$priors_count <- ifelse(df$priors_count==0, 0, ifelse(1<=df$priors_count & df$priors_count<=3, 1, 2))

In [None]:
%%R
# Three feature-set variants taken from the cleaned frame:
#   ffs_df    -- bucketed age (age_cat)
#   ffs_df_v2 -- raw age in place of age_cat
#   ffs_df_v3 -- the v2 columns plus the three juvenile counts
ffs_df <- df %>%
  dplyr::select(age_cat, c_charge_degree, race, sex, priors_count,
                length_of_stay, two_year_recid)

ffs_df_v2 <- df %>%
  dplyr::select(age, c_charge_degree, race, sex, priors_count,
                length_of_stay, two_year_recid)

ffs_df_v3 <- df %>%
  dplyr::select(age, c_charge_degree, race, sex, priors_count,
                length_of_stay, two_year_recid,
                juv_fel_count, juv_misd_count, juv_other_count)

# Preview the first rows of the categorical variant.
head(ffs_df, n = 5)

  age_cat c_charge_degree race    sex priors_count length_of_stay
2       1               F    0   Male            0              1
3       0               F    0   Male            2              0
5       1               F    1   Male            2              0
7       1               M    1 Female            0              0
8       1               F    1   Male            0              0
  two_year_recid
2              1
3              1
5              1
7              0
8              0


In [None]:
# Copy the three R data frames into the Python namespace: `%R -o NAME` exports
# the R variable NAME to a Python variable of the same name.
# Presumably they arrive as pandas DataFrames, since the next cell calls
# .to_csv on them -- verify if the rpy2 conversion settings change.
%R -o ffs_df
%R -o ffs_df_v2
%R -o ffs_df_v3

In [None]:
# Write each feature set to the configured outputs directory.
# The paths are derived from `outputs_dir` (set in the configuration cell)
# instead of repeating the absolute Drive path, so pointing `root` at another
# location redirects these files as well. index=False omits the row index.
ffs_df.to_csv(os.path.join(outputs_dir, "ffs_data.csv"), index=False)
ffs_df_v2.to_csv(os.path.join(outputs_dir, "ffs_data_v2.csv"), index=False)
ffs_df_v3.to_csv(os.path.join(outputs_dir, "ffs_data_v3.csv"), index=False)