In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [13]:
#import pandas as pd # Importing the pandas library and aliasing it as 'pd'
data = pd.read_csv('sample_data/diabetic_data.csv') # Reading data from 'diabetic_data.csv' into a pandas DataFrame called 'data'

In [3]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
non_numeric_columns = data.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)
print(len(non_numeric_columns))

Index(['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty',
       'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')
37


In [5]:
# Count the number of null values in each column
null_counts = data.isnull().sum()

# Get the names of columns with null values
columns_with_null = null_counts[null_counts > 0].index.tolist()

# Check for NaN values in each column
nan_counts = data.isna().sum()

# Get the names of columns with NaN values
columns_with_nan = nan_counts[nan_counts > 0].index.tolist()

# Combine the columns with null and NaN values
columns_with_missing = list(set(columns_with_null + columns_with_nan))

# Print the column name
print("Columns with missing values:")
for column in columns_with_missing:
    print(column)

Columns with missing values:
A1Cresult
max_glu_serum


In [6]:
data.shape

(101766, 50)

In [7]:
data.dtypes

Unnamed: 0,0
encounter_id,int64
patient_nbr,int64
race,object
gender,object
age,object
weight,object
admission_type_id,int64
discharge_disposition_id,int64
admission_source_id,int64
time_in_hospital,int64


In [60]:
X = data.copy()
y = X['readmitted']
X.drop(['weight','readmitted'], axis = 1, inplace = True)

# get drop list with columns consisted with 80% or more of same values. Mostly null values.
drop_list = []
for col in X.columns:
    # Use .loc for safer inplace replacement
    X.loc[:, col] = X.loc[:, col].replace('?', None)
    X.loc[:, col] = X.loc[:, col].replace('No', 0)
    X.loc[:, col] = X.loc[:, col].replace('None', 0)

    val_counts = X[col].value_counts()
    if val_counts.iloc[0] / val_counts.sum() > 0.8:
        drop_list.append(col)

for elem in drop_list:
    X.drop([elem], axis=1, inplace=True)

X


#23!

  X.loc[:, col] = X.loc[:, col].replace('No', 0)
  X.loc[:, col] = X.loc[:, col].replace('No', 0)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,insulin,change,diabetesMed
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,,...,0,250.83,,,1,,,0,0,0
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,,...,0,276,250.01,255,9,,,Up,Ch,Yes
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,,...,1,648,250,V27,6,,,0,0,Yes
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,,...,0,8,250.43,403,7,,,Up,Ch,Yes
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,,...,0,197,157,250,5,,,Steady,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),1,3,7,3,MC,...,0,250.13,291,458,9,,>8,Down,Ch,Yes
101762,443847782,74694222,AfricanAmerican,Female,[80-90),1,4,5,5,MC,...,1,560,276,787,9,,,Steady,0,Yes
101763,443854148,41088789,Caucasian,Male,[70-80),1,1,7,1,MC,...,0,38,590,296,13,,,Down,Ch,Yes
101764,443857166,31693671,Caucasian,Female,[80-90),2,3,7,10,MC,...,1,996,285,998,9,,,Up,Ch,Yes


In [81]:
import warnings

# Ignore specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas") # Ignore FutureWarnings from pandas
warnings.filterwarnings("ignore", message="A value is trying to be set on a copy") # Ignore specific message


#prepare y with int values
y.replace('>30', 2, inplace = True)
y.replace('<30', 1, inplace = True)
y.replace('NO', 0, inplace = True)

#prepare age column by converting object to int values
for val in list(set(X['age'].values)):
    # Check if val is a string before slicing
    if isinstance(val, str):
        X['age'].replace(val, val[-3:-1], inplace = True)
    else:
        # Handle non-string values (e.g., keep as they are)
        pass
X['age'].replace('00', 100, inplace = True)

for val in list(set(X['age'].values)):
    X['age'].replace(val, int(val), inplace = True)

#prepare insulin column converting objects to int
X['insulin'].replace('Down', -1, inplace = True)
X['insulin'].replace('No', 0, inplace = True)
X['insulin'].replace('Steady', 1, inplace = True)
X['insulin'].replace('Up', 2, inplace = True)

#prepare change column converting objects to int
X['change'].replace('Ch', 1, inplace = True)
X['change'].replace('No', 0, inplace = True)

#prepare diabetesMed column
X['diabetesMed'].replace('No', 0, inplace = True)
X['diabetesMed'].replace('Yes', 1, inplace = True)

for val in list(set(X['diag_1'].values)):
    # Check if val is a string before accessing its elements
    if isinstance(val, str) and val != None and val[0].isalpha():
        X['diag_1'].replace(val, None, inplace = True)
    # Handle cases where val is None or a number
    # No need for an explicit elif for None since it's already covered
    # in the isinstance check
    else:
        # Do nothing if val is not a string or is None
        # or if it does not start with an alphabetic character
        pass
# Similar modifications for diag_2 and diag_3 columns
for val in list(set(X['diag_2'].values)):
    if isinstance(val, str) and val != None and val[0].isalpha():
        X['diag_2'].replace(val, None, inplace = True)
    else:
        pass

for val in list(set(X['diag_3'].values)):
    if isinstance(val, str) and val != None and val[0].isalpha():
        X['diag_3'].replace(val, None, inplace = True)
    else:
        pass

In [67]:
# Define the mapping of categories to numerical values
# age_mapping = {'[0-10)': 1, '[10-20)': 2, '[20-30)': 3, '[30-40)': 4, '[40-50)': 5,
#                '[50-60)': 6, '[60-70)': 7, '[70-80)': 8, '[80-90)': 9, '[90-100)': 10}

# # Replace the categorical values with numerical values using the mapping
# X['age'] = X['age'].replace(age_mapping)
le = LabelEncoder()
for col in X.columns:
    if X.dtypes[col] == object:
        X[col] = le.fit_transform(X[col])


In [68]:
X.fillna(X.mean(), inplace=True)

In [69]:
X.shape

(101766, 24)

In [70]:
X.dtypes

Unnamed: 0,0
encounter_id,int64
patient_nbr,int64
race,int64
gender,int64
age,int64
admission_type_id,int64
discharge_disposition_id,int64
admission_source_id,int64
time_in_hospital,int64
payer_code,int64


In [72]:
X.drop(['encounter_id','patient_nbr','payer_code'], axis = 1, inplace = True)

In [73]:
X

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,insulin,change,diabetesMed
0,2,0,10,6,25,1,1,37,41,0,...,0,124,650,670,1,3,3,0,0,0
1,2,0,20,1,1,7,3,72,59,0,...,0,143,79,121,9,3,3,2,1,1
2,0,0,30,1,1,7,2,72,11,5,...,1,454,78,670,6,3,3,0,0,1
3,2,1,40,1,1,7,2,72,44,1,...,0,554,97,248,7,3,3,2,1,1
4,2,1,50,1,1,7,1,72,51,0,...,0,54,24,86,5,3,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0,1,80,1,3,7,3,72,51,0,...,0,102,148,295,9,3,1,-1,1,1
101762,0,0,90,1,4,5,5,72,33,3,...,1,381,133,536,9,3,3,1,0,1
101763,2,1,80,1,1,7,1,72,53,0,...,0,236,389,159,13,3,3,-1,1,1
101764,2,0,90,2,3,7,10,62,45,2,...,1,693,142,668,9,3,3,2,1,1


In [75]:
from sklearn.model_selection import train_test_split

# Split the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
X_train.shape

(81412, 21)

In [80]:
y_train.shape

(81412,)