In [47]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'


### DATA PREPROCESSING

### Load Data

In [71]:
# Load the raw data set from sheet 'Sheet1' of the Excel workbook ALF_Data.xlsx.
df = pd.read_excel('ALF_Data.xlsx', sheet_name='Sheet1')

In [72]:
# Preview the first 10 rows of the raw frame
df.head(10)

Unnamed: 0,Age,Gender,Region,Weight,Height,Body Mass Index,Obesity,Waist,Maximum Blood Pressure,Minimum Blood Pressure,...,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue,ALF
0,65,M,east,56.0,162.1,21.31,0.0,83.6,135.0,71.0,...,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0,0.0
1,36,M,south,60.2,162.2,22.88,0.0,76.6,96.0,52.0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
2,66,M,east,83.9,162.5,31.77,1.0,113.2,115.0,57.0,...,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0,0.0
3,54,M,east,69.4,160.5,26.94,0.0,77.9,110.0,57.0,...,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,63,M,north,73.1,159.2,28.84,0.0,89.3,132.0,73.0,...,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0,0.0
5,26,F,east,119.3,193.2,31.96,1.0,117.9,129.0,70.0,...,0.0,0,0.0,1,0.0,0,0.0,0.0,0.0,0.0
6,66,F,north,85.1,172.1,28.73,0.0,99.2,137.0,92.0,...,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0,0.0
7,59,M,east,69.9,160.9,27.0,0.0,101.5,124.0,73.0,...,0.0,0,0.0,1,1.0,1,0.0,0.0,0.0,0.0
8,53,M,east,75.2,174.1,24.81,0.0,85.6,110.0,74.0,...,0.0,1,1.0,1,0.0,0,0.0,0.0,0.0,0.0
9,78,M,north,47.6,155.3,19.74,0.0,70.3,170.0,78.0,...,0.0,0,1.0,0,1.0,1,1.0,0.0,,0.0


In [73]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

(8785, 30)

### Checking if there are missing values

In [74]:
# True if at least one cell anywhere in the frame is missing (NaN)
df.isnull().values.any()

True

In [75]:
# List the distinct values of the target column ALF;
# a NaN among them means some rows have no label
df.ALF.unique()

array([ 0.,  1., nan])

In [76]:
# True if the target column ALF contains at least one missing value
df.ALF.isnull().values.any()

True

In [77]:
# Count the rows whose target ALF is missing
df.ALF.isnull().sum()

2785

In [78]:
# What are the rows in the Target ALF column that have NaN values
# df[df.ALF.isnull()]

In [79]:
# Keep only the rows whose target (ALF) is present; rows with a missing
# label are dropped.
# .copy() makes `data` an independent frame, so the column assignments in the
# following cells modify it directly instead of a frame derived from `df`
# (the pattern pandas flags with SettingWithCopyWarning).
data = df.dropna(subset=['ALF']).copy()
data.shape

(6000, 30)

In [80]:
# Distinct values of the Gender column, inspected before encoding it
data.Gender.unique()

array(['M', 'F'], dtype=object)

In [81]:
# Encode the nominal Gender column as integers.
# METADATA
# Gender: Male = 1, Female = 0
# Series.map returns NaN for any value that is not a key of the dict.
data['Gender'] = data['Gender'].map({'M': 1, 'F': 0})
data.head()

Unnamed: 0,Age,Gender,Region,Weight,Height,Body Mass Index,Obesity,Waist,Maximum Blood Pressure,Minimum Blood Pressure,...,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue,ALF
0,65,1,east,56.0,162.1,21.31,0.0,83.6,135.0,71.0,...,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0,0.0
1,36,1,south,60.2,162.2,22.88,0.0,76.6,96.0,52.0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
2,66,1,east,83.9,162.5,31.77,1.0,113.2,115.0,57.0,...,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0,0.0
3,54,1,east,69.4,160.5,26.94,0.0,77.9,110.0,57.0,...,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,63,1,north,73.1,159.2,28.84,0.0,89.3,132.0,73.0,...,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0,0.0


In [82]:
# Distinct values of the Region column, inspected before encoding it
data.Region.unique()

array(['east', 'south', 'north', 'west'], dtype=object)

In [83]:
# Encode the nominal Region column as integer codes.
# METADATA
# Region: east = 1, south = 2, north = 3, west = 4
# Series.map with a dict (the same approach used for Gender) produces a numeric
# column directly, instead of relying on replace() to downcast an object column.
# Note: any value that is not a key of the mapping becomes NaN, so this cell
# must run on the un-encoded column (i.e. once per fresh kernel).
data['Region'] = data['Region'].map({'east': 1, 'south': 2, 'north': 3, 'west': 4})
data.head()

Unnamed: 0,Age,Gender,Region,Weight,Height,Body Mass Index,Obesity,Waist,Maximum Blood Pressure,Minimum Blood Pressure,...,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue,ALF
0,65,1,1,56.0,162.1,21.31,0.0,83.6,135.0,71.0,...,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0,0.0
1,36,1,2,60.2,162.2,22.88,0.0,76.6,96.0,52.0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
2,66,1,1,83.9,162.5,31.77,1.0,113.2,115.0,57.0,...,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0,0.0
3,54,1,1,69.4,160.5,26.94,0.0,77.9,110.0,57.0,...,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,63,1,3,73.1,159.2,28.84,0.0,89.3,132.0,73.0,...,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0,0.0


In [84]:
# Distinct values of the 'Source of Care' column, inspected before encoding it
data['Source of Care'].unique()

array(['Governament Hospital', 'Never Counsulted', 'Private Hospital',
       'clinic', ' '], dtype=object)

In [85]:
# Encode the nominal 'Source of Care' column as numeric codes.
# METADATA
# Source of Care: Governament Hospital = 1, Private Hospital = 2, clinic = 3, Never Counsulted = 0
# The blank entry (' ') is deliberately absent from the mapping: Series.map turns
# every value that is not a key into NaN, so blanks become missing values.
# Because of that, this cell must run on the un-encoded column (once per fresh kernel).
data['Source of Care'] = data['Source of Care'].map(
    {'Governament Hospital': 1, 'Private Hospital': 2, 'clinic': 3, 'Never Counsulted': 0}
)
data.head()

Unnamed: 0,Age,Gender,Region,Weight,Height,Body Mass Index,Obesity,Waist,Maximum Blood Pressure,Minimum Blood Pressure,...,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue,ALF
0,65,1,1,56.0,162.1,21.31,0.0,83.6,135.0,71.0,...,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0,0.0
1,36,1,2,60.2,162.2,22.88,0.0,76.6,96.0,52.0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
2,66,1,1,83.9,162.5,31.77,1.0,113.2,115.0,57.0,...,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0,0.0
3,54,1,1,69.4,160.5,26.94,0.0,77.9,110.0,57.0,...,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0,0.0
4,63,1,3,73.1,159.2,28.84,0.0,89.3,132.0,73.0,...,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0,0.0


In [86]:
# Count the missing values (NaN) in each column of the
# label-filtered, encoded frame `data`

total_missingvalues = data.isnull().sum()
total_missingvalues

Age                         0
Gender                      0
Region                      0
Weight                    133
Height                    139
Body Mass Index           206
Obesity                   206
Waist                     215
Maximum Blood Pressure    206
Minimum Blood Pressure    252
Good Cholesterol            8
Bad Cholesterol             8
Total Cholesterol           6
Dyslipidemia                0
PVD                         0
Physical Activity           8
Education                  15
Unmarried                 301
Income                    792
Source of Care              2
PoorVision                376
Alcohol Consumption         0
HyperTension               53
Family  HyperTension        0
Diabetes                    1
Family Diabetes             0
Hepatitis                  13
Family Hepatitis            3
Chronic Fatigue            26
ALF                         0
dtype: int64

In [87]:
# List the column labels of `data`
data.columns

Index(['Age', 'Gender', 'Region', 'Weight', 'Height', 'Body Mass Index',
       'Obesity', 'Waist', 'Maximum Blood Pressure', 'Minimum Blood Pressure',
       'Good Cholesterol', 'Bad Cholesterol', 'Total Cholesterol',
       'Dyslipidemia', 'PVD', 'Physical Activity', 'Education', 'Unmarried',
       'Income', 'Source of Care', 'PoorVision', 'Alcohol Consumption',
       'HyperTension', 'Family  HyperTension', 'Diabetes', 'Family Diabetes',
       'Hepatitis', 'Family Hepatitis', 'Chronic Fatigue', 'ALF'],
      dtype='object')

### Separating Continuous from Categorical features

In [88]:
# Split the predictors into two frames so that each group can be imputed
# with its own strategy in the cells that follow.

# Categorical / discrete predictors
# NOTE(review): 'Age' is grouped with the categorical columns here — confirm this is intended.
cat_features = data[[
    'Age',
    'Gender',
    'Region',
    'Obesity',
    'Dyslipidemia',
    'PVD',
    'Physical Activity',
    'Education',
    'Unmarried',
    'Income',
    'Source of Care',
    'PoorVision',
    'Alcohol Consumption',
    'HyperTension',
    'Family  HyperTension',
    'Diabetes',
    'Family Diabetes',
    'Hepatitis',
    'Family Hepatitis',
    'Chronic Fatigue',
]]

# Continuous measurements
cont_features = data[[
    'Weight',
    'Height',
    'Body Mass Index',
    'Waist',
    'Maximum Blood Pressure',
    'Minimum Blood Pressure',
    'Good Cholesterol',
    'Bad Cholesterol',
    'Total Cholesterol',
]]


### Dealing with Continuous-Data Features: Replacing Missing Values with the Mean

In [89]:
# Preview the continuous features before imputation
cont_features.head()

Unnamed: 0,Weight,Height,Body Mass Index,Waist,Maximum Blood Pressure,Minimum Blood Pressure,Good Cholesterol,Bad Cholesterol,Total Cholesterol
0,56.0,162.1,21.31,83.6,135.0,71.0,48.0,249.0,297.0
1,60.2,162.2,22.88,76.6,96.0,52.0,31.0,135.0,166.0
2,83.9,162.5,31.77,113.2,115.0,57.0,44.0,211.0,255.0
3,69.4,160.5,26.94,77.9,110.0,57.0,74.0,156.0,230.0
4,73.1,159.2,28.84,89.3,132.0,73.0,67.0,154.0,221.0


In [90]:
# Impute every missing value (NaN) in the continuous features with the
# mean of its own column. The means are taken from cont_features itself
# (the previously referenced name `c_features` is not defined anywhere).
cont_features = cont_features.fillna(cont_features.mean())

cont_features.head()

Unnamed: 0,Weight,Height,Body Mass Index,Waist,Maximum Blood Pressure,Minimum Blood Pressure,Good Cholesterol,Bad Cholesterol,Total Cholesterol
0,56.0,162.1,21.31,83.6,135.0,71.0,48.0,249.0,297.0
1,60.2,162.2,22.88,76.6,96.0,52.0,31.0,135.0,166.0
2,83.9,162.5,31.77,113.2,115.0,57.0,44.0,211.0,255.0
3,69.4,160.5,26.94,77.9,110.0,57.0,74.0,156.0,230.0
4,73.1,159.2,28.84,89.3,132.0,73.0,67.0,154.0,221.0


In [91]:
# Confirm that no missing values remain in the continuous features
cont_features.isnull().values.any()

False

In [100]:
# (rows, columns) of the continuous-feature frame
cont_features.shape

(6000, 9)

### Dealing with Categorical-Data Features: Replacing Missing Values with Most Frequent (Mode)

In [92]:
# Preview the categorical features before imputation
cat_features.head()

Unnamed: 0,Age,Gender,Region,Obesity,Dyslipidemia,PVD,Physical Activity,Education,Unmarried,Income,Source of Care,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue
0,65,1,1,0.0,0,0,3.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0
1,36,1,2,0.0,0,0,3.0,0.0,,1.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0
2,66,1,1,1.0,1,0,1.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0
3,54,1,1,0.0,0,0,2.0,1.0,0.0,0.0,2.0,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0
4,63,1,3,0.0,0,0,1.0,0.0,0.0,,3.0,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0


In [94]:
# Impute missing values in each categorical column with that column's most
# frequent value: mode() computes the modes per column and .iloc[0] takes its first row.
cat_features = cat_features.fillna(cat_features.mode().iloc[0])

In [96]:
# Preview the first 10 rows after mode imputation
cat_features.head(10)

Unnamed: 0,Age,Gender,Region,Obesity,Dyslipidemia,PVD,Physical Activity,Education,Unmarried,Income,Source of Care,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue
0,65,1,1,0.0,0,0,3.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0
1,36,1,2,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0
2,66,1,1,1.0,1,0,1.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0
3,54,1,1,0.0,0,0,2.0,1.0,0.0,0.0,2.0,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0
4,63,1,3,0.0,0,0,1.0,0.0,0.0,0.0,3.0,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0
5,26,0,1,1.0,0,0,2.0,1.0,0.0,0.0,2.0,0.0,0,0.0,1,0.0,0,0.0,0.0,0.0
6,66,0,3,0.0,0,0,3.0,1.0,0.0,0.0,2.0,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0
7,59,1,1,0.0,0,0,2.0,1.0,1.0,0.0,2.0,0.0,0,0.0,1,1.0,1,0.0,0.0,0.0
8,53,1,1,0.0,0,0,1.0,1.0,0.0,1.0,2.0,0.0,1,1.0,1,0.0,0,0.0,0.0,0.0
9,78,1,3,0.0,0,0,1.0,0.0,1.0,0.0,2.0,0.0,0,1.0,0,1.0,1,1.0,0.0,0.0


In [97]:
# Confirm that no missing values remain in the categorical features
cat_features.isnull().values.any()

False

In [99]:
# (rows, columns) of the categorical-feature frame
cat_features.shape

(6000, 20)

In [102]:
# Recombine the imputed continuous and categorical features column-wise (axis=1).
# Neither input frame contains the ALF target, so data2 holds predictors only.
data2 = pd.concat([cont_features, cat_features], axis=1)
data2.head()

Unnamed: 0,Weight,Height,Body Mass Index,Waist,Maximum Blood Pressure,Minimum Blood Pressure,Good Cholesterol,Bad Cholesterol,Total Cholesterol,Age,...,Source of Care,PoorVision,Alcohol Consumption,HyperTension,Family HyperTension,Diabetes,Family Diabetes,Hepatitis,Family Hepatitis,Chronic Fatigue
0,56.0,162.1,21.31,83.6,135.0,71.0,48.0,249.0,297.0,65,...,1.0,0.0,1,0.0,0,0.0,1,1.0,0.0,0.0
1,60.2,162.2,22.88,76.6,96.0,52.0,31.0,135.0,166.0,36,...,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0
2,83.9,162.5,31.77,113.2,115.0,57.0,44.0,211.0,255.0,66,...,0.0,0.0,1,0.0,0,1.0,0,0.0,0.0,0.0
3,69.4,160.5,26.94,77.9,110.0,57.0,74.0,156.0,230.0,54,...,2.0,0.0,1,0.0,0,0.0,0,0.0,0.0,0.0
4,73.1,159.2,28.84,89.3,132.0,73.0,67.0,154.0,221.0,63,...,3.0,0.0,0,1.0,0,0.0,0,0.0,0.0,0.0


### Train / Test / Validation Split

In [None]:
# Features and target for the split.
# data2 holds the imputed predictors; the ALF label is read from `data`, the
# frame data2 was built from, so both share the same row index.
dataX = data2
dataY = data['ALF']

train_ratio = 0.70
validation_ratio = 0.10
test_ratio = 0.20
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs

# First split: 70% of the rows go to train, the remaining 30% are held out
# for validation + test.
x_train, x_test, y_train, y_test = train_test_split(
    dataX, dataY, test_size=1 - train_ratio, random_state=SPLIT_SEED
)

# Second split of the 30% hold-out: test receives
# test_ratio / (test_ratio + validation_ratio) = 2/3 of it, so overall
# test is 20% and validation is 10% of the initial data set.
x_val, x_test, y_val, y_test = train_test_split(
    x_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=SPLIT_SEED,
)

# Every row must land in exactly one of the three subsets.
assert len(x_train) + len(x_val) + len(x_test) == len(dataX)

# Report the subset sizes instead of printing the full frames.
print(x_train.shape, x_val.shape, x_test.shape)