## DATA MINING PROJECT

### PART 1


In [1]:
# required libraries - pandas, numpy and scikit-learn (sklearn)

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")


In [2]:
# loading prepared data
data = pd.read_excel('diseases_hormonalmeasurements - Prepared Data.xlsx', sheet_name = 'Data')

In [3]:
# observing first few rows of data
data.head(10)

Unnamed: 0,PatientID,Gender,aldosterone,cortisol,dheas,corticosterone,11-Deoxycortisol,androstenedion,estrone,estradiol,...,dht,progesterone,androsterone,pregnenolone,cortisone,21-Deoxycortisol,11OHA4,11OHT,Disease,Sick
0,20001,F,0.275,11.508,8.458,1.075,0.007,0.191,,,...,,0.094,0.905,0.0,,,,,21OHD-C Disease,Yes
1,20002,M,0.015,0.459,0.375,0.044,0.016,0.106,,,...,,1.891,0.669,0.067,,,,,21OHD-C Disease,Yes
2,20003,F,0.633,4.398,93.912,14.022,0.433,1.004,,,...,,22.79,0.035,1.188,,,,,21OHD-C Disease,Yes
3,20004,F,0.03,19.652,172.925,1.123,0.125,0.644,,,...,,3.613,1.731,2.613,,,,,21OHD-C Disease,Yes
4,20005,F,0.059,3.663,30.385,0.776,0.154,1.128,,,...,,0.821,11.202,0.851,,,,,21OHD-C Disease,Yes
5,20006,F,0.055,4.267,13.123,0.458,0.084,0.563,,,...,,0.285,2.272,0.244,,,,,21OHD-C Disease,Yes
6,20007,M,0.132,5.243,5.773,47.685,0.27,0.434,,,...,,8.264,0.228,7.185,,,,,21OHD-C Disease,Yes
7,20008,F,0.172,3.228,53.745,1.347,0.07,1.894,,,...,,0.828,3.991,0.297,,,,,21OHD-C Disease,Yes
8,20009,F,0.894,8.769,189.604,0.891,0.617,1.395,,,...,,1.804,5.113,1.645,,,,,21OHD-C Disease,Yes
9,20010,F,0.093,18.104,3.241,1.466,0.036,0.132,,,...,,0.173,0.662,0.658,,,,,21OHD-C Disease,Yes


In [4]:
# gathering information on data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1034 entries, 0 to 1033
Data columns (total 25 columns):
PatientID                  1034 non-null int64
Gender                     1034 non-null object
aldosterone                1006 non-null float64
cortisol                   1029 non-null float64
dheas                      1027 non-null float64
corticosterone             1027 non-null float64
11-Deoxycortisol           1029 non-null float64
androstenedion             1029 non-null float64
estrone                    0 non-null float64
estradiol                  0 non-null float64
11-Deoksicortikosterone    1025 non-null float64
testosterone               1029 non-null float64
dhea                       1029 non-null float64
17-OH-Pregnenolone         1027 non-null float64
17-OH-Progesterone         1027 non-null float64
dht                        96 non-null float64
progesterone               1027 non-null float64
androsterone               1026 non-null float64
pregnenolone          

In [5]:
# identifying fraction of missign values in the dataset
missing_frac = np.round(data.isnull().sum()/len(data), 2)
missing_frac

PatientID                  0.00
Gender                     0.00
aldosterone                0.03
cortisol                   0.00
dheas                      0.01
corticosterone             0.01
11-Deoxycortisol           0.00
androstenedion             0.00
estrone                    1.00
estradiol                  1.00
11-Deoksicortikosterone    0.01
testosterone               0.00
dhea                       0.00
17-OH-Pregnenolone         0.01
17-OH-Progesterone         0.01
dht                        0.91
progesterone               0.01
androsterone               0.01
pregnenolone               0.04
cortisone                  0.24
21-Deoxycortisol           0.24
11OHA4                     0.76
11OHT                      0.76
Disease                    0.00
Sick                       0.00
dtype: float64

Few of data variables have very high fraction of missing values. We will remove all such columns from our data for preparing a Robust data model and therefore threshold for removing values is set at 0.23    

In [6]:
del_columns = list(missing_frac[missing_frac>0.23].index)

# Updating data by removing redundant columns
required_columns = list(set(data.columns) - set(del_columns))
data_2 = data[required_columns]

Writing a function to update missing values in the dataset

In [7]:
# missing values are updated with the average value calculated from other data points within the same disease category
cleaned_data = pd.DataFrame()

# creating a function to update values
def fill_missing(data, disease_var):
    filtered_data = data[data['Disease'] == disease_var]
    for data_column in data.columns:
        if filtered_data[data_column].dtypes.kind != 'O':
            filtered_data[data_column].fillna(filtered_data[data_column].mean(), inplace = True)
        else:
            filtered_data[data_column].fillna(filtered_data[data_column].mode(), inplace = True)
    return filtered_data


for disease_category in list(data_2['Disease'].unique()):
    interim_data = fill_missing(data_2, disease_category)
    cleaned_data = pd.concat([cleaned_data, interim_data])

In [8]:
# Checking whether all missing values have been addressed now
np.round(cleaned_data.isnull().sum()/len(cleaned_data), 2)

PatientID                  0.0
dheas                      0.0
11-Deoxycortisol           0.0
androsterone               0.0
dhea                       0.0
testosterone               0.0
Disease                    0.0
17-OH-Progesterone         0.0
aldosterone                0.0
cortisol                   0.0
Gender                     0.0
17-OH-Pregnenolone         0.0
11-Deoksicortikosterone    0.0
androstenedion             0.0
pregnenolone               0.0
progesterone               0.0
Sick                       0.0
corticosterone             0.0
dtype: float64

After cleaning the data, next step is putting data in correct format for machine learning algorithm. String variables in the data are updated to numeric entries to be easily feed into model fitting process

In [9]:
cleaned_data['Gender'] = cleaned_data['Gender'].map({'F':0, 'M':1})
cleaned_data['Sick'] = cleaned_data['Sick'].map({'No':0, 'Yes':1})

# setting columns in order and removing columns that are not required like Patient ID
column_order = ['Gender', 'aldosterone', 'cortisol', 'dheas','corticosterone', '11-Deoxycortisol',
               'androstenedion', '11-Deoksicortikosterone', 'testosterone', 'dhea', '17-OH-Pregnenolone',
               '17-OH-Progesterone', 'progesterone', 'androsterone', 'pregnenolone','Disease', 'Sick']

cleaned_data = cleaned_data[column_order]

# We do not need 'Disease' column in Part A therefore dropping that column

part_1 = cleaned_data.drop(['Disease'], axis = 1)


Next step is preparing the Random Forest Model. 

Splitting data into training and testing in 80-20% ratio using stratified sampling technique.

In [10]:
train, test = train_test_split(part_1, test_size = 0.20, stratify = part_1['Sick'], random_state = 9)

Normalizing the dataset using StandardScaler available in Scikit-Learn for data preprocessing

In [11]:
X_train = train.iloc[:,:-1].values
Y_train = train.iloc[:,-1].values

X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values

# Normalizing data values for Random Forest Model
norm = StandardScaler()
X_train[:,1:] = norm.fit_transform(X_train[:,1:])
X_test[:,1:] = norm.transform(X_test[:,1:])

Training Random Forest Model

In [12]:
# training the Random Forest model
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

Generating the Confusion Matrix for test data

In [13]:
# Prediction the Test set
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
conf_matrix = confusion_matrix(Y_test, y_pred)
conf_matrix

array([[138,   4],
       [  7,  58]])

In [14]:
# Prediction Accuracy
classifier.score(X_test, Y_test)

0.9468599033816425

Hyper Parameter Tuning using GridSearch and 10-Fold Cross Validation to identify most suitable parameters for Random Forest Model

In [15]:
parameters = {'criterion': ['gini', 'entropy'],
             'max_depth': [5, 10, None],
             'max_features': ['auto', 'sqrt'],
             'min_samples_leaf': [1, 2],
             'min_samples_split': [2, 5],
             'n_estimators': [5, 10, 15, 50, 75]}

grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)

grid_search = grid_search.fit(X_train, Y_train)
best_accuracy = grid_search.best_score_
best_accuracy

0.9649334945586457

In [16]:
best_parameters = grid_search.best_params_
best_parameters

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 50}

In [23]:
# Final Model

best_classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', max_depth=10, max_features= 'auto', min_samples_leaf= 2, min_samples_split= 5, random_state = 13)
best_classifier.fit(X_train, Y_train)

# generating test prediction
y_pred = best_classifier.predict(X_test)

# Making the Confusion Matrix
conf_matrix = confusion_matrix(Y_test, y_pred)
conf_matrix

array([[138,   4],
       [  5,  60]])

In [24]:
# Prediction Accuracy
best_classifier.score(X_test, Y_test)

0.9565217391304348