# Connect to drive

In [None]:
# Connecting to drive
# Mounts Google Drive at /gdrive (this import only exists on Google Colab) and then
# changes the working directory to the project folder, so that the relative CSV
# paths read and written further down resolve against that folder.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Colab Notebooks/Statistical Learning for Healthcare Data/Project/Scripts/Final code

Mounted at /gdrive
/gdrive/My Drive/Colab Notebooks/Statistical Learning for Healthcare Data/Project/Scripts/Final code


# Import modules

In [None]:
import random

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt

# Splitting dataset in training set and test set
from sklearn.model_selection import train_test_split

# For the data preparation before training the model
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Set seed
# Seed both Python's and NumPy's global generators. The stochastic steps below pass
# explicit random_state values, but any step that falls back to NumPy's global
# generator would not be covered by random.seed() alone.
SEED = 562023
random.seed(SEED)
np.random.seed(SEED)

# Data loading

In [None]:
# Loading the clean dataset
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("Final data clean manual.csv")
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,ID,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,...,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,Target
0,0,1,77.0,1,2.0,1.0,1.0,2.0,3.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,1,2,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0
2,2,3,52.0,1,0.0,0.0,0.0,2.0,2.0,0.0,...,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0
3,3,4,68.0,0,0.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
4,4,5,60.0,1,0.0,0.0,0.0,2.0,3.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0


In [None]:
# Dataframe dimensions, as (number of rows, number of columns)
df.shape

(1554, 105)

In [None]:
# Discard the 'Unnamed: 0' and 'ID' columns, which are not needed as features
df = df.drop(columns=['Unnamed: 0', 'ID'])
df.head()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,Target
0,77.0,1,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0
2,52.0,1,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0
3,68.0,0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
4,60.0,1,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0


In [None]:
# Checking for missing values: total count of NaN cells over the whole dataframe
# (first sum is per column, second sum adds the per-column counts together)
df.isna().sum().sum()

0

# Data preprocessing

We decide to merge the following features:
* 'R_AB_1_n', 'R_AB_2_n' and 'R_AB_3_n' in 'R_AB' (relapse of the pain)
* 'NA_R_1_n', 'NA_R_2_n' and 'NA_R_3_n' in 'NA_R' (Use of opioid drugs in the ICU)
* 'NOT_NA_1_n', 'NOT_NA_2_n' and 'NOT_NA_3_n' in 'NOT_NA' (use of NSAIDs in the ICU)

This seems reasonable since we are not interested in the timeline of the hospitalization; we are only interested in knowing whether the patients experienced a relapse of pain or were given these drugs while they were hospitalized.

In [None]:
# Collapse each group of three source columns into one binary flag:
# the flag is 0 only when all three source columns equal 0, and 1 otherwise.
merged_features = {
    'R_AB': ['R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n'],
    'NA_R': ['NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n'],
    'NOT_NA': ['NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n'],
}

for merged_name, source_cols in merged_features.items():
    # "any source column different from 0" is the same rule as "not all equal to 0"
    flag = df[source_cols].ne(0).any(axis=1).astype('int64')
    # Remove the source columns and append the merged flag at the end of the frame
    df = df.drop(columns=source_cols)
    df[merged_name] = flag

# Rearranging the variables orders
# Move the target to the last position. pop() + re-assignment always appends at the
# end, so this does not depend on a hard-coded column count.
df['Target'] = df.pop('Target')

# Print
print(df.head())
print("Dimension of df: ", df.shape)
print("Missing values of df: ", df.isna().sum().sum())

    AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST   GB  SIM_GIPERT  \
0  77.0    1       2.0        1.0        1.0       2.0  3.0         0.0   
1  55.0    1       1.0        0.0        0.0       0.0  0.0         0.0   
2  52.0    1       0.0        0.0        0.0       2.0  2.0         0.0   
3  68.0    0       0.0        0.0        0.0       2.0  2.0         0.0   
4  60.0    1       0.0        0.0        0.0       2.0  3.0         0.0   

   DLIT_AG  ZSN_A  ...  B_BLOK_S_n  ANT_CA_S_n  GEPAR_S_n  ASP_S_n  TIKL_S_n  \
0      7.0    0.0  ...         0.0         0.0        1.0      1.0       0.0   
1      0.0    0.0  ...         0.0         1.0        1.0      1.0       0.0   
2      2.0    0.0  ...         1.0         0.0        1.0      1.0       0.0   
3      3.0    1.0  ...         0.0         1.0        1.0      1.0       0.0   
4      7.0    0.0  ...         0.0         1.0        0.0      1.0       0.0   

   TRENT_S_n  R_AB  NA_R  NOT_NA  Target  
0        0.0     1     0 

We need to separate the variables that will be the input of our classifier from the target variable. Then we proceed by splitting the dataset into a training set and a test set.

In [None]:
# Target vector (y) and feature matrix (X, every column except 'Target')
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
# Split the data into training and test sets
# 30% of the rows are held out as the test set; random_state fixes the shuffle so the
# same split is obtained on every run.
# NOTE(review): the split is not stratified on y — confirm this is intended, given the
# class imbalance reported further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

If a feature is normally distributed, it should be standardized; otherwise, it should be normalized (rescaled to a fixed range). This applies regardless of whether the range of the feature's values is large or small. Standardization yields values, both positive and negative, centered around zero. It may be desirable to normalize data after it has been standardized.

In [None]:
# Normality check on the training split for each continuous feature.
# One line is printed per feature, in the order of cont_feats.
cont_feats = [
    'AGE', 'S_AD_ORIT', 'D_AD_ORIT',
    'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD',
    'AST_BLOOD', 'L_BLOOD', 'ROE',
]
alpha = 0.05  # significance level

for feature in cont_feats:
    # normaltest returns (statistic, p-value); only the p-value is used here
    _, p_value = stats.normaltest(X_train[feature])
    message = (
        "The null hypothesis can be rejected"
        if p_value < alpha
        else "The null hypothesis cannot be rejected"
    )
    print(message)

The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected
The null hypothesis can be rejected


We fit the scaler using the training data and then apply the scaler on the testing data before the prediction.

In [None]:
# Min-max normalization, fitted on the training split only.
# The whole X_train frame is passed to the scaler, so every column is rescaled
# (not just the continuous ones):
#   X_std    = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#   X_scaled = X_std * (max - min) + min
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)

# Wrap the scaled array back into a dataframe with the original column names
# (the row index restarts from 0)
X_train_s = pd.DataFrame(train_scaled, columns=X.columns)
X_train = X_train_s

X_train.head()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,R_AB,NA_R,NOT_NA
0,0.553846,1.0,0.0,0.0,0.0,0.0,0.666667,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
1,0.276923,1.0,0.0,0.166667,1.0,0.5,0.666667,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2,0.630769,1.0,0.0,0.833333,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.507692,1.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.676923,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Rescale the test set with the scaler fitted on the training data (transform only, no re-fit)
test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(test_scaled, columns=X.columns)
X_test.head()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,R_AB,NA_R,NOT_NA
0,0.784615,0.0,0.333333,1.0,0.5,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
1,0.723077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.384615,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.923077,0.0,0.0,0.0,0.0,1.0,0.666667,0.0,1.0,0.25,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.153846,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


Let's have a look at the class distribution

In [None]:
# Mean of the target (the positive-class share, assuming 0/1 labels) in the full data,
# the training split and the test split
tuple(labels.sum() / len(labels) for labels in (y, y_train, y_test))

(0.15701415701415702, 0.16007359705611776, 0.14989293361884368)

The challenge of working with imbalanced datasets is that most machine learning techniques will ignore, and in turn have poor performance on, the minority class, although typically it is performance on the minority class that is most important.

One approach to addressing imbalanced datasets is to oversample the minority class. The simplest approach involves duplicating examples in the minority class, although these examples don’t add any new information to the model. Instead, new examples can be synthesized from the existing examples. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE for short.

In [None]:
# SMOTE for oversampling
# Only the training split is resampled; the test split is not passed to SMOTE.
# random_state makes the resampling reproducible. After this call X_train / y_train
# refer to the resampled data, no longer to the original training split.
oversample = SMOTE(random_state=45)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Randomly shuffle the rows of the dataframe
# Features and target are put side by side first so that they are shuffled together;
# random_state keeps the permutation reproducible.
df = pd.concat([X_train, y_train], axis=1).sample(frac=1, random_state=0)
# Renumber the shuffled rows 0..n-1
df.index = np.arange(len(df))
df.head()

In [None]:
# Separate the shuffled training frame back into target and features
y_train = df['Target']
X_train = df.drop(columns='Target')

# Data saving

In [None]:
# Saving the train set for future models
# Features and target are written side by side. to_csv is called with its defaults,
# so the dataframe index is written too, as the first (unnamed) CSV column.
train_set = pd.concat([X_train, y_train], axis=1)
train_set.to_csv('train_set.csv')

In [None]:
# Save the scaled training features as they were BEFORE SMOTE.
# By this point X_train has been replaced by the oversampled and shuffled data, so
# writing X_train here would put synthetic rows into a file named "no_smote".
# X_train_s still refers to the scaled, pre-SMOTE training frame built in the
# normalization cell.
# NOTE(review): as before, only the features are written (no Target column) —
# confirm whether the pre-SMOTE labels should be saved alongside them.
X_train_s.to_csv('train_set_no_smote.csv')

In [None]:
# Saving the test set for future models
# Give features and labels the same 0..n-1 index so that concat pairs them row by row
n_test = X_test.shape[0]
X_test.index = np.arange(n_test)
y_test.index = np.arange(n_test)

test_set = pd.concat([X_test, y_test], axis=1)
test_set.to_csv('test_set.csv')

In [None]:
# Also save the test features on their own (X_test only, so no Target column)
X_test.to_csv('test_set_no_smote.csv')