# Imports

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE

# Data

In [6]:
first_counter_OHE = pd.read_csv("../data/processed/first_encounter_OHE.csv")
first_counter_OHE.sample(5)

Unnamed: 0,number_diagnoses_clip,number_outpatient_log1p,number_emergency_log1p,number_inpatient_log1p,num_procedures_log1p,num_medications_log1p,num_lab_procedures_log1p,norm_time_in_hospital,readmitted_rescaled,race_AfricanAmerican,...,grouped_diag_3_E8,grouped_diag_3_E9,grouped_diag_3_V0,grouped_diag_3_V1,grouped_diag_3_V2,grouped_diag_3_V4,grouped_diag_3_V5,grouped_diag_3_V6,grouped_diag_3_V7,grouped_diag_3_V8
60882,4,0.0,0.0,0.0,0.0,1.609438,1.386294,-1.115266,0,0,...,0,0,0,0,0,0,0,0,0,0
62169,9,0.0,0.0,0.0,0.693147,2.397895,2.639057,-0.437113,0,1,...,0,0,0,0,0,0,0,0,0,0
63416,9,0.0,0.0,0.0,0.693147,2.772589,3.713572,-0.437113,0,0,...,0,0,0,0,0,0,0,0,0,0
69138,9,1.098612,0.0,0.0,0.0,2.484907,1.386294,-0.098037,1,0,...,0,0,0,0,0,0,0,0,0,0
37191,9,0.0,0.0,0.0,0.693147,3.178054,4.174387,-0.437113,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
first_counter_OHE.columns.tolist()

['number_diagnoses_clip',
 'number_outpatient_log1p',
 'number_emergency_log1p',
 'number_inpatient_log1p',
 'num_procedures_log1p',
 'num_medications_log1p',
 'num_lab_procedures_log1p',
 'norm_time_in_hospital',
 'readmitted_rescaled',
 'race_AfricanAmerican',
 'race_Asian',
 'race_Caucasian',
 'race_Hispanic',
 'race_Other',
 'gender_Male',
 'gender_Unknown/Invalid',
 'age_[10-20)',
 'age_[20-30)',
 'age_[30-40)',
 'age_[40-50)',
 'age_[50-60)',
 'age_[60-70)',
 'age_[70-80)',
 'age_[80-90)',
 'age_[90-100)',
 'metformin_No',
 'metformin_Steady',
 'metformin_Up',
 'repaglinide_No',
 'repaglinide_Steady',
 'repaglinide_Up',
 'nateglinide_No',
 'nateglinide_Steady',
 'nateglinide_Up',
 'chlorpropamide_No',
 'chlorpropamide_Steady',
 'chlorpropamide_Up',
 'glimepiride_No',
 'glimepiride_Steady',
 'glimepiride_Up',
 'glipizide_No',
 'glipizide_Steady',
 'glipizide_Up',
 'glyburide_No',
 'glyburide_Steady',
 'glyburide_Up',
 'tolbutamide_Steady',
 'pioglitazone_No',
 'pioglitazone_Steady

# Examine the class label imbalance

In [9]:
neg, pos = np.bincount(first_counter_OHE['readmitted_rescaled'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 70434
    Positive: 6293 (8.93% of total)



# Split Data

In [11]:
# Use a utility from sklearn to split and shuffle our dataset.
train_df, test_df = train_test_split(first_counter_OHE, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)

In [12]:
print(test_df.shape)
print(train_df.shape)
print(val_df.shape)

(14087, 382)
(45077, 382)
(11270, 382)


# Distribution of Data

In [37]:
neg, pos = np.bincount(train_df['readmitted_rescaled'])
total = neg + pos
print('Train Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

neg, pos = np.bincount(val_df['readmitted_rescaled'])
total = neg + pos
print('Validation Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))


neg, pos = np.bincount(test_df['readmitted_rescaled'])
total = neg + pos
print('Test Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Train Examples:
    Total: 45077
    Positive: 4042 (8.97% of total)

Validation Examples:
    Total: 11270
    Positive: 961 (8.53% of total)

Test Examples:
    Total: 14087
    Positive: 1290 (9.16% of total)



In [39]:
val_df.to_csv("../data/processed/validation.csv", index=False)
test_df.to_csv("../data/processed/test.csv", index=False)

# Upsampling-SMOTE

In [16]:
y_train = train_df["readmitted_rescaled"]
X_train = train_df.drop("readmitted_rescaled", axis=1)

In [25]:
# Instantiate SMOTE and fit
os = SMOTE(random_state=0)
columns = X_train.columns

data_X_smote, data_y_smote = os.fit_sample(X_train, y_train)
data_X_smote = pd.DataFrame(data=data_X_smote, columns=columns)
data_y_smote = pd.DataFrame(data=data_y_smote, columns=['readmitted_rescaled'])

# Check the numbers of our data
print("Length of oversampled data is ", len(data_X_smote))
print("Number of no readmissions in oversampled data",
      len(data_y_smote[data_y_smote['readmitted_rescaled'] == 0]))
print("Number of readmissions", len(
    data_y_smote[data_y_smote['readmitted_rescaled'] == 1]))
print("Proportion of no readmissions data in oversampled data is ", len(
    data_y_smote[data_y_smote['readmitted_rescaled'] == 0])/len(data_y_smote))
print("Proportion of readmissions data in oversampled data is ", len(
    data_y_smote[data_y_smote['readmitted_rescaled'] == 1])/len(data_y_smote))

Length of oversampled data is  82070
Number of no readmissions in oversampled data 41035
Number of readmissions 41035
Proportion of no readmissions data in oversampled data is  0.5
Proportion of readmissions data in oversampled data is  0.5


In [34]:
SMOTE_df = data_X_smote.join(data_y_smote)
SMOTE_df.head(5)

SMOTE_df.to_csv("../data/processed/SMOTE_df.csv", index=False)

# Downsampling

In [26]:
pos_samples = train_df[train_df['readmitted_rescaled'] == 1]
neg_samples = train_df[train_df['readmitted_rescaled'] == 0]
neg_samples = neg_samples.sample(n=len(pos_samples))

DS_train_df = pd.concat([pos_samples, neg_samples])
DS_train_df.shape

(8084, 382)

In [27]:
# Check the numbers of our data
print("Length of downsampled data is ", len(DS_train_df))
print("Number of no readmissions in oversampled data",
      len(DS_train_df[DS_train_df['readmitted_rescaled'] == 0]))
print("Number of readmissions", len(
    DS_train_df[DS_train_df['readmitted_rescaled'] == 1]))
print("Proportion of no readmissions data in oversampled data is ", len(
    DS_train_df[DS_train_df['readmitted_rescaled'] == 0])/len(DS_train_df))
print("Proportion of readmissions data in oversampled data is ", len(
    DS_train_df[DS_train_df['readmitted_rescaled'] == 1])/len(DS_train_df))

Length of downsampled data is  8084
Number of no readmissions in oversampled data 4042
Number of readmissions 4042
Proportion of no readmissions data in oversampled data is  0.5
Proportion of readmissions data in oversampled data is  0.5


In [35]:
DS_train_df.to_csv("../data/processed/DS_train_df.csv", index=False)