In [27]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
#Seed NumPy's global random number generator so the stochastic steps further
#down (row shuffling for the train/test split, network weight initialisation)
#can be repeated from a fresh kernel. This relies on the scikit-learn calls
#below being made without a random_state, in which case they draw from this
#global generator.
np.random.seed(0)

In [28]:
#Read in the labelled health score data used to train and evaluate the model
healthScores = pd.read_csv('HealthScores.csv')

#Add a column (feature) for BMI, a value calculated from weight and height:
#BMI = weight (in kg) / (height (in m))^2
#See https://www.nhs.uk/chq/Pages/how-can-i-work-out-my-bmi.aspx

#The unit conversions are held in standalone Series rather than in a copy of
#the whole data frame, so only the finished BMI column is added to healthScores.
#NB: the weight column name contains a double space.
weightKg = healthScores['Weight  in lbs'] * 0.453592   #lbs -> kg
heightM = healthScores['Height in Inch'] * 0.0254      #inches -> metres

#calculate BMIs and store them in a new column titled "BMI"
healthScores['BMI'] = weightKg / (heightM ** 2)

healthScores.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
0,32,Female,101,67,110,0,0,Inactive,251,15.818661
1,47,Female,138,67,84,2,0,Active,193,21.613616
2,25,Male,166,73,106,2,5,Inactive,167,21.900817
3,75,Male,184,73,99,2,0,Inactive,93,24.275605
4,72,Female,105,65,99,2,0,Active,150,17.472721


In [29]:
#Dimensions of the health score table as (rows, columns); the column count
#includes the BMI column added above
healthScores.shape

(4999, 10)

In [30]:
#Encode the columns holding text labels as integers so they can be fed to the model
#
#Sex:    Male = 0, Female = 1
#Active: Inactive = 0, Active = 1, Very Active = 2
#
#A label that is missing from its mapping becomes NaN.

healthScores = healthScores.assign(
    Sex=healthScores['Sex'].map({'Male': 0, 'Female': 1}),
    Active=healthScores['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2}),
)

#Min-max normalisation is switched off in this cell: the scaler object, the
#column index and the health score range are set up, but no column is rescaled.

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

min_max_scaler = preprocessing.MinMaxScaler()

columns = healthScores.columns
#columns = columns.delete(8)

healthScoreColumn = healthScores['Health Score (high is good)']
minHealthScore, maxHealthScore = healthScoreColumn.min(), healthScoreColumn.max()

#for i in range(0, len(columns)):
 #   x_scaled = min_max_scaler.fit_transform(healthScores[[columns[i]]].values.astype(float))
 #   healthScores[columns[i]] = pd.DataFrame(x_scaled)

healthScores.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
0,32,1,101,67,110,0,0,0,251,15.818661
1,47,1,138,67,84,2,0,1,193,21.613616
2,25,0,166,73,106,2,5,0,167,21.900817
3,75,0,184,73,99,2,0,0,93,24.275605
4,72,1,105,65,99,2,0,1,150,17.472721


In [31]:
from sklearn.model_selection import train_test_split

In [32]:
#Separate the value being predicted from the values used to predict it:
#y - the health score column by itself (the regression target)
#x - every remaining column (the model inputs)

y = healthScores['Health Score (high is good)']
x = healthScores.drop(labels='Health Score (high is good)', axis='columns')

y

0       251
1       193
2       167
3        93
4       150
5       120
6       167
7        90
8       181
9       242
10      217
11      124
12      139
13      139
14      264
15      294
16       96
17      109
18       89
19      187
20      183
21       81
22      286
23      109
24      117
25      150
26      137
27      279
28      210
29      127
       ... 
4969    183
4970    228
4971    241
4972     85
4973     94
4974    180
4975    184
4976    137
4977    143
4978     97
4979    177
4980    135
4981    179
4982    156
4983    132
4984    121
4985    130
4986    267
4987    273
4988    190
4989    176
4990    130
4991    109
4992    162
4993    101
4994    133
4995    100
4996    258
4997    125
4998    378
Name: Health Score (high is good), Length: 4999, dtype: int64

In [33]:
#Preview the first rows of the feature table (health score column removed)
x.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI
0,32,1,101,67,110,0,0,0,15.818661
1,47,1,138,67,84,2,0,1,21.613616
2,25,0,166,73,106,2,5,0,21.900817
3,75,0,184,73,99,2,0,0,24.275605
4,72,1,105,65,99,2,0,1,17.472721


In [34]:
from sklearn.model_selection import train_test_split

#Randomly partition the rows into a training set and a held-out test set.
#No test_size is passed, so scikit-learn's default proportion applies
#(the y_test output further down holds 1250 of the 4999 rows).
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [35]:
from sklearn.preprocessing import StandardScaler
#Fit the standardiser on the training rows only, so the test rows play no part
#in the statistics it learns
scaler = StandardScaler()
#fit() returns the scaler itself; that return value is what this cell displays
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [36]:
#Standardise both partitions with the scaler fitted on the training data.
#Both names are rebound to the scaled arrays, so this cell must not be re-run
#on its own: that would scale values that have already been scaled.
x_train, x_test = [scaler.transform(part) for part in (x_train, x_test)]

In [37]:
#Inspect the standardised training features (now a NumPy array rather than a data frame)
x_train

array([[ 0.22193333,  0.78393649, -1.2830367 , ..., -0.41264493,
         0.42434502, -1.15948915],
       [ 1.04188082,  0.78393649, -0.96707354, ...,  3.60445376,
         0.42434502, -1.2611544 ],
       [-0.32469833, -1.27561354,  1.91609025, ..., -0.41264493,
         1.74566517,  1.82366074],
       ...,
       [ 1.69783881,  0.78393649, -1.16455052, ..., -0.41264493,
        -0.89697514,  0.40725153],
       [ 1.69783881,  0.78393649, -1.40152288, ..., -0.41264493,
        -0.89697514, -2.28383396],
       [-1.36329848, -1.27561354,  0.8497146 , ..., -0.41264493,
         1.74566517, -0.31128721]])

In [38]:
from sklearn.neural_network import MLPRegressor

#Number of input features, i.e. columns of the 2-D training array
x_train.shape[1]

9

In [39]:
#Multi-layer perceptron regressor with 1 hidden layer of 100 nodes,
#maximum training iterations allowed = 1500
mlp = MLPRegressor(hidden_layer_sizes=(100, ), max_iter=1500)
#Train on the standardised training features; fit() returns the estimator,
#and that return value is what this cell displays
mlp.fit(x_train, y_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [40]:
#Predict health scores for the held-out, standardised test features
predictions = mlp.predict(x_test)

In [41]:
#from sklearn.metrics import classification_report, confusion_matrix

In [42]:
#pd.crosstab(y_test, predictions, rownames=['True'], colnames=['Predicted'], margins=True)

In [43]:
#print(classification_report(y_test, predictions))

In [44]:
#True health scores of the test rows, to set against the model's predictions
y_test

4585    228
1593    137
562      82
825     134
3777    157
446     136
3889    232
3442    439
2016    151
1161    152
4430    145
979     138
3925    186
3199    163
214      89
2585    137
4910    134
4691    160
1091    154
453     161
2009    113
1515    231
883     149
138     111
4439    159
817      95
4717    136
2075    133
1011    174
2284     94
       ... 
1220    198
4823    335
3104    172
462      89
2171    120
4716    283
2357    129
2964    154
4847    154
3953    136
4426    151
1738    389
553     118
2826    116
4934    111
1281    114
4485     99
4256    408
2613     95
2451    209
286     120
465     118
3076    204
37      115
3079    139
2285    128
3397    125
4819    118
1202    107
3756    115
Name: Health Score (high is good), Length: 1250, dtype: int64

In [45]:
#Health scores the model predicted for the test rows
predictions

array([242.49641048, 115.82130058,  80.08368756, ..., 118.18776187,
       103.19073626, 117.14706824])

In [46]:
#Determine the percentage of accurate predictions on the test set.
#
#A prediction is deemed accurate when its absolute error is strictly less than
#10% of the LARGEST health score in the data set. This is one fixed tolerance
#for every row, not 10% of each row's own true score.
#NOTE(review): if a per-row relative tolerance was intended, compare against
#0.1 * actualValues instead; that test is stricter and cannot give a higher figure.

actualValues = y_test.values
totalNumValues = len(y_test)

#Computed once up front: the tolerance does not change from row to row
tolerance = 0.1 * healthScores['Health Score (high is good)'].max()

#Element-wise comparison over the whole array in place of a Python loop;
#int() keeps the count (and so the percentage) a plain Python number
numAccurateResults = int(np.count_nonzero(np.abs(predictions - actualValues) < tolerance))

percentAccurateResults = (numAccurateResults / totalNumValues) * 100
percentAccurateResults

99.44

In [47]:
#mlp.score(predictions, actualValues)

In [48]:
#Load in and prepare data for 20 individuals for which health score predictions will be made
#Columns need to be made to match healthScores data, and BMI attribute needs to be added

population = pd.read_csv('Population.csv')

#Add a column (feature) to the population data set for BMI:
#BMI = weight (in kg) / (height (in m))^2
#The conversions are held in standalone Series, so no copy of the frame is needed.
#NB: the weight column name contains a double space.
weightKg = population['Weight  in lbs'] * 0.453592   #lbs -> kg
heightM = population['Height in Inch'] * 0.0254      #inches -> metres

#calculate BMIs and store them in a new column titled "BMI"
population['BMI'] = weightKg / (heightM ** 2)

#Remove the columns that are not model inputs. The axis is passed by keyword
#because pandas 2.0 no longer accepts it positionally: drop('Person Id', 1)
#raises a TypeError there.
population = population.drop(['Person Id', 'Health Score (high is good)'], axis=1)

In [49]:
#Encode the columns holding text labels as integers, using the same codes as
#for the healthScores data
#
#Sex:    Male = 0, Female = 1
#Active: Inactive = 0, Active = 1, Very Active = 2
#
#A label that is missing from its mapping becomes NaN.

population = population.assign(
    Sex=population['Sex'].map({'Male': 0, 'Female': 1}),
    Active=population['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2}),
)

In [50]:
#Scale the population features with the StandardScaler that was fitted on the
#training data, so the model receives inputs on the scale it was trained with

#The min-max normalisation code in this cell is disabled (commented out)
#min_max_scaler = preprocessing.MinMaxScaler()

columns = population.columns
#columns = columns.delete(8)

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

#for i in range(0, len(columns)):
 #   x_scaled = min_max_scaler.fit_transform(population[[columns[i]]].values.astype(float))
 #   population[columns[i]] = pd.DataFrame(x_scaled)

#Reuses the already-fitted scaler: transform only, no re-fitting on the population
inputs = scaler.transform(population)

In [51]:
#Perform the predictions for the 20 individuals with unknown health scores,
#using the scaled inputs

actualPredictions = mlp.predict(inputs)
actualPredictions

array([122.15031849, 241.66058012, 134.56091708, 257.33488732,
       161.3497996 , 105.83044711, 206.33660161, 143.542603  ,
       187.54844253, 240.78067458,  96.43946583, 181.24663692,
       115.13143664, 364.43637397, 298.99245853, 146.90654987,
       154.52892333,  75.65797156, 118.95438172, 309.13987183])

In [52]:
#Attach the predictions to the population table as a new column and show the result
population = population.assign(**{'Predicted health scores': actualPredictions})
population

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI,Predicted health scores
0,65,0,191,69,104,0,0,1,28.205457,122.150318
1,46,1,99,66,103,0,0,1,15.978841,241.66058
2,51,1,135,71,105,1,0,0,18.82847,134.560917
3,37,1,126,66,96,1,0,1,20.336707,257.334887
4,77,1,114,67,99,0,0,1,17.854726,161.3498
5,79,1,100,69,97,1,0,0,14.767255,105.830447
6,51,1,119,67,103,1,0,1,18.637828,206.336602
7,41,0,177,70,103,5,0,1,25.396574,143.542603
8,33,0,185,68,94,0,0,1,28.128842,187.548443
9,56,1,105,64,95,0,0,2,18.023009,240.780675
