In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!pip install -q xlrd

In [None]:
# Read the 'Subject Data' worksheet of the workbook into a DataFrame.
df = pd.read_excel(
    "Warfarin_Dose_Prediction_Modified.xls",
    sheet_name='Subject Data',
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Gender,Race (Reported),Age,Height (cm),Weight (kg),Diabetes,Medications,Simvastatin (Zocor),Atorvastatin (Lipitor),Amiodarone (Cordarone),Target INR,Therapeutic Dose of Warfarin,INR on Reported Therapeutic Dose of Warfarin,Cyp2C9 genotypes,VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
0,male,White,60 - 69,193.04,115.7,,aspirin; not simvastatin; not amiodarone,0.0,,0.0,2.5,49.0,2.6,*1/*1,A/G
1,female,White,50 - 59,176.53,144.2,,not aspirin; not simvastatin; not amiodarone,0.0,,0.0,2.5,42.0,2.15,*1/*1,A/A
2,female,White,40 - 49,162.56,77.1,,not aspirin; not simvastatin; not amiodarone,0.0,,0.0,2.5,53.0,1.9,*1/*1,G/G
3,male,White,60 - 69,182.245,90.7,,not aspirin; not simvastatin; not amiodarone,0.0,,0.0,2.5,28.0,2.4,*1/*1,A/G
4,male,White,50 - 59,167.64,72.6,,not aspirin; not simvastatin; not amiodarone,0.0,,0.0,2.5,42.0,1.9,*1/*3,A/G


In [None]:
# Display the column labels of the frame.
df.columns

Index(['Gender', 'Race (Reported)', 'Age', 'Height (cm)', 'Weight (kg)',
       'Diabetes', 'Medications', 'Simvastatin (Zocor)',
       'Atorvastatin (Lipitor)', 'Amiodarone (Cordarone)', 'Target INR',
       'Therapeutic Dose of Warfarin',
       'INR on Reported Therapeutic Dose of Warfarin', 'Cyp2C9 genotypes',
       'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'],
      dtype='object')

In [None]:
#Converting Categorical text data into numerical
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# The single encoder is refit for every column, which overwrites the classes it
# learned for the previous one.  Keep a copy of each column's classes so the
# integer codes stay decodable: code k in `column` stands for
# category_labels[column][k].
category_labels = {}

# Convert the categorical columns with the LabelEncoder
for column in df.columns:
    if df[column].dtype == 'object':
        # NOTE: astype(str) turns NaN into the literal string 'nan', which then
        # receives its own integer code.  Such cells are therefore treated as a
        # regular category and are no longer reported by isnull() afterwards.
        df[column] = le.fit_transform(df[column].astype(str))
        category_labels[column] = le.classes_.copy()

df.head()

Unnamed: 0,Gender,Race (Reported),Age,Height (cm),Weight (kg),Diabetes,Medications,Simvastatin (Zocor),Atorvastatin (Lipitor),Amiodarone (Cordarone),Target INR,Therapeutic Dose of Warfarin,INR on Reported Therapeutic Dose of Warfarin,Cyp2C9 genotypes,VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
0,1,20,5,193.04,115.7,,1029,0.0,,0.0,2.5,49.0,2.6,0,1
1,0,20,4,176.53,144.2,,1791,0.0,,0.0,2.5,42.0,2.15,0,0
2,0,20,3,162.56,77.1,,1791,0.0,,0.0,2.5,53.0,1.9,0,2
3,1,20,5,182.245,90.7,,1791,0.0,,0.0,2.5,28.0,2.4,0,1
4,1,20,4,167.64,72.6,,1791,0.0,,0.0,2.5,42.0,1.9,5,1


In [None]:
# Tally the null entries in every column and print the per-column totals.
missing_values = df.isna().sum()
print(missing_values)

Gender                                                                  0
Race (Reported)                                                         0
Age                                                                     0
Height (cm)                                                          1146
Weight (kg)                                                           287
Diabetes                                                             2417
Medications                                                             0
Simvastatin (Zocor)                                                  1839
Atorvastatin (Lipitor)                                               3245
Amiodarone (Cordarone)                                               1518
Target INR                                                           4441
Therapeutic Dose of Warfarin                                          172
INR on Reported Therapeutic Dose of Warfarin                          732
Cyp2C9 genotypes                      

In [None]:
#Missing value treatment
from sklearn.impute import SimpleImputer, MissingIndicator

# Technique 2 (fitted first): Missingness Indicator Variables (MIV).
# The indicator must be fitted BEFORE any value is imputed: once the gaps are
# filled there is nothing left to flag, and features='missing-only' would
# select no columns at all.
indicator = MissingIndicator(features='missing-only')
mis_ind = indicator.fit_transform(df)
# `features_` holds positional column indices, not labels, so map them back to
# the column names before building the '<column>_missing' labels.
columns_with_missing = df.columns[indicator.features_]

# Technique 1: Impute the mean for all missing values in a variable
# For numerical columns
# Assign the result back instead of calling fillna(inplace=True) on df[column]:
# the chained in-place form warns on recent pandas and leaves df unchanged when
# Copy-on-Write is enabled.
# NOTE(review): this also mean-imputes 'Therapeutic Dose of Warfarin' if it has
# gaps; consider dropping rows with a missing dose instead - confirm intent.
for column in df.select_dtypes(include=['int64', 'float64']).columns:
    df[column] = df[column].fillna(df[column].mean())

# For categorical columns
for column in df.select_dtypes(include=['object']).columns:
    modes = df[column].mode()
    # mode() is empty when the whole column is missing; nothing to fill with.
    if not modes.empty:
        df[column] = df[column].fillna(modes.iloc[0])

# Add the indicator to the dataframe
# Stored as 0/1 integers (not bool) so that a later df.values on the otherwise
# numeric frame does not fall back to an object array.
for i, column in enumerate(columns_with_missing):
    df[column + '_missing'] = mis_ind[:, i].astype('int64')

df.head()

Unnamed: 0,Gender,Race (Reported),Age,Height (cm),Weight (kg),Diabetes,Medications,Simvastatin (Zocor),Atorvastatin (Lipitor),Amiodarone (Cordarone),Target INR,Therapeutic Dose of Warfarin,INR on Reported Therapeutic Dose of Warfarin,Cyp2C9 genotypes,VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
0,1,20,5,193.04,115.7,0.187024,1029,0.0,0.089613,0.0,2.5,49.0,2.6,0,1
1,0,20,4,176.53,144.2,0.187024,1791,0.0,0.089613,0.0,2.5,42.0,2.15,0,0
2,0,20,3,162.56,77.1,0.187024,1791,0.0,0.089613,0.0,2.5,53.0,1.9,0,2
3,1,20,5,182.245,90.7,0.187024,1791,0.0,0.089613,0.0,2.5,28.0,2.4,0,1
4,1,20,4,167.64,72.6,0.187024,1791,0.0,0.089613,0.0,2.5,42.0,1.9,5,1


In [None]:
# Re-count the null entries per column after the missing-value treatment.
missing_values = df.isnull().sum(axis=0)
print(missing_values)

Gender                                                               0
Race (Reported)                                                      0
Age                                                                  0
Height (cm)                                                          0
Weight (kg)                                                          0
Diabetes                                                             0
Medications                                                          0
Simvastatin (Zocor)                                                  0
Atorvastatin (Lipitor)                                               0
Amiodarone (Cordarone)                                               0
Target INR                                                           0
Therapeutic Dose of Warfarin                                         0
INR on Reported Therapeutic Dose of Warfarin                         0
Cyp2C9 genotypes                                                     0
VKORC1

In [None]:
import numpy as np

# Split the frame into the feature matrix X (every column except the dose)
# and the label vector Y (the dose column itself).
target_column = 'Therapeutic Dose of Warfarin'
X = df.drop(columns=[target_column]).to_numpy()
Y = df[target_column].to_numpy()

# Persist both arrays to disk in NumPy's .npy format.
np.save('X.npy', X)
np.save('Y.npy', Y)