In [1]:
from sklearn.ensemble import RandomForestRegressor
#from sklearn.metrics import f1_score
from sklearn import preprocessing
import pandas as pd
import numpy as np
np.random.seed(0)

In [2]:
#Read in health score data
healthScores = pd.read_csv('HealthScores.csv')

#Add a column (feature) to the health score data set for BMI (BMI is a value calculated from weight and height)

#BMI = weight (in kg) / (height (in m))^2
#See https://www.nhs.uk/chq/Pages/how-can-i-work-out-my-bmi.aspx

#copy healthScores data frame into a temporary data frame
temp = healthScores.copy()

#convert weights from lbs to kg
temp['Weight in kg'] = temp['Weight  in lbs'] * 0.453592

#convert heights from inches to metres
temp['Height in m'] = temp['Height in Inch'] * 0.0254

#calculate BMIs and store them in a new column titled "BMI"
healthScores['BMI'] = temp['Weight in kg'] / (temp['Height in m'] **2)

healthScores.describe()

Unnamed: 0,Age,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Health Score (high is good),BMI
count,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0
mean,48.881376,142.273655,68.215043,100.056611,0.988198,4.001,170.024805,21.482319
std,18.209711,25.283687,4.228134,6.016367,1.003226,9.587125,70.025921,3.216895
min,18.0,77.0,56.0,79.0,0.0,0.0,59.0,10.878671
25%,33.0,123.0,65.0,96.0,0.0,0.0,117.0,19.197431
50%,49.0,139.0,68.0,100.0,1.0,0.0,156.0,21.433951
75%,65.0,163.0,71.0,104.0,2.0,0.0,207.0,23.63093
max,80.0,217.0,83.0,122.0,7.0,40.0,505.0,33.527565


In [3]:
#Convert columns containing non-numerical data into numerical data so they can be processed by the random forest model

#Sex: Male = 0, Female = 1
#Health Score (high is good): Inactive = 0, Active = 1, Very active = 2

healthScores['Sex'] = healthScores['Sex'].map({'Male': 0, 'Female': 1})
healthScores['Active'] = healthScores['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2})

#Also, normalise the data in each column to improve results

min_max_scaler = preprocessing.MinMaxScaler()

columns = healthScores.columns
#columns = columns.delete(8)

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

minHealthScore = healthScores['Health Score (high is good)'].min()
maxHealthScore = healthScores['Health Score (high is good)'].max()

for i in range(0, len(columns)):
    x_scaled = min_max_scaler.fit_transform(healthScores[[columns[i]]].values.astype(float))
    healthScores[columns[i]] = pd.DataFrame(x_scaled)

In [4]:
healthScores.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
0,0.225806,1.0,0.171429,0.407407,0.72093,0.0,0.0,0.0,0.430493,0.218112
1,0.467742,1.0,0.435714,0.407407,0.116279,0.285714,0.0,0.5,0.300448,0.473972
2,0.112903,0.0,0.635714,0.62963,0.627907,0.285714,0.125,0.0,0.242152,0.486653
3,0.919355,0.0,0.764286,0.62963,0.465116,0.285714,0.0,0.0,0.076233,0.591505
4,0.870968,1.0,0.2,0.333333,0.465116,0.285714,0.0,0.5,0.204036,0.291142


In [5]:
healthScores.tail()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
4994,0.612903,1.0,0.0,0.296296,0.44186,0.428571,0.0,0.0,0.165919,0.103237
4995,0.322581,0.0,0.528571,0.518519,0.55814,0.714286,0.0,0.0,0.091928,0.476285
4996,0.435484,1.0,0.414286,0.222222,0.465116,0.0,0.0,1.0,0.446188,0.60987
4997,0.870968,1.0,0.378571,0.222222,0.534884,0.0,0.0,0.0,0.147982,0.569493
4998,0.129032,1.0,0.464286,0.444444,0.465116,0.0,0.0,1.0,0.715247,0.472964


In [6]:
healthScores['is_train'] = np.random.uniform(0, 1, len(healthScores)) <= .75
healthScores.head(50)

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI,is_train
0,0.225806,1.0,0.171429,0.407407,0.72093,0.0,0.0,0.0,0.430493,0.218112,True
1,0.467742,1.0,0.435714,0.407407,0.116279,0.285714,0.0,0.5,0.300448,0.473972,True
2,0.112903,0.0,0.635714,0.62963,0.627907,0.285714,0.125,0.0,0.242152,0.486653,True
3,0.919355,0.0,0.764286,0.62963,0.465116,0.285714,0.0,0.0,0.076233,0.591505,True
4,0.870968,1.0,0.2,0.333333,0.465116,0.285714,0.0,0.5,0.204036,0.291142,True
5,0.66129,1.0,0.4,0.222222,0.651163,0.0,0.0,0.0,0.136771,0.593719,True
6,0.209677,1.0,0.607143,0.333333,0.372093,0.142857,0.0,0.0,0.242152,0.709935,True
7,0.870968,0.0,0.642857,0.518519,0.348837,0.142857,0.0,0.0,0.069507,0.577647,False
8,0.564516,1.0,0.535714,0.37037,0.465116,0.0,0.0,0.5,0.273543,0.602877,False
9,0.112903,0.0,0.714286,0.555556,0.27907,0.142857,0.0,1.0,0.410314,0.609634,True


In [7]:
train, test = healthScores[healthScores['is_train'] == True], healthScores[healthScores['is_train'] == False]

In [8]:
train

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI,is_train
0,0.225806,1.0,0.171429,0.407407,0.720930,0.000000,0.000,0.0,0.430493,0.218112,True
1,0.467742,1.0,0.435714,0.407407,0.116279,0.285714,0.000,0.5,0.300448,0.473972,True
2,0.112903,0.0,0.635714,0.629630,0.627907,0.285714,0.125,0.0,0.242152,0.486653,True
3,0.919355,0.0,0.764286,0.629630,0.465116,0.285714,0.000,0.0,0.076233,0.591505,True
4,0.870968,1.0,0.200000,0.333333,0.465116,0.285714,0.000,0.5,0.204036,0.291142,True
5,0.661290,1.0,0.400000,0.222222,0.651163,0.000000,0.000,0.0,0.136771,0.593719,True
6,0.209677,1.0,0.607143,0.333333,0.372093,0.142857,0.000,0.0,0.242152,0.709935,True
9,0.112903,0.0,0.714286,0.555556,0.279070,0.142857,0.000,1.0,0.410314,0.609634,True
11,0.935484,1.0,0.464286,0.296296,0.302326,0.000000,0.725,0.5,0.145740,0.595848,True
12,0.193548,1.0,0.500000,0.333333,0.511628,0.142857,0.750,0.0,0.179372,0.599726,True


In [9]:
test

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI,is_train
7,0.870968,0.0,0.642857,0.518519,0.348837,0.142857,0.000,0.0,0.069507,0.577647,False
8,0.564516,1.0,0.535714,0.370370,0.465116,0.000000,0.000,0.5,0.273543,0.602877,False
10,0.596774,1.0,0.335714,0.444444,0.627907,0.000000,0.000,0.5,0.354260,0.352126,False
13,0.774194,1.0,0.314286,0.333333,0.627907,0.000000,0.000,0.0,0.179372,0.408698,False
17,0.467742,1.0,0.364286,0.185185,0.558140,0.142857,0.000,0.0,0.112108,0.587510,False
18,0.838710,0.0,0.742857,0.666667,0.441860,0.285714,0.000,0.0,0.067265,0.545726,False
19,0.548387,1.0,0.342857,0.333333,0.418605,0.142857,0.000,0.5,0.286996,0.438087,False
20,0.725806,1.0,0.457143,0.296296,0.279070,0.000000,0.000,0.5,0.278027,0.588270,False
21,0.500000,0.0,0.742857,0.555556,0.232558,0.285714,0.725,0.0,0.049327,0.634266,False
23,0.387097,1.0,0.421429,0.111111,0.558140,0.285714,0.000,0.0,0.112108,0.732473,False


In [10]:
train.shape

(3763, 11)

In [11]:
test.shape

(1236, 11)

In [12]:
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:', len(test))

Number of observations in the training data: 3763
Number of observations in the test data: 1236


In [13]:
#Select features to use for training the model (all bar "health score" and "is_train" columns)
features = healthScores.columns[0:-1]
features = features.delete(8)
features

Index(['Age', 'Sex', 'Weight  in lbs', 'Height in Inch', 'IQ',
       'Units of alcohol per day', 'Cigarettes per day', 'Active', 'BMI'],
      dtype='object')

In [14]:
y = train['Health Score (high is good)']
y

0       0.430493
1       0.300448
2       0.242152
3       0.076233
4       0.204036
5       0.136771
6       0.242152
9       0.410314
11      0.145740
12      0.179372
14      0.459641
15      0.526906
16      0.082960
22      0.508969
24      0.130045
25      0.204036
26      0.174888
28      0.338565
29      0.152466
30      0.107623
32      0.130045
33      0.143498
34      0.251121
35      0.302691
36      0.065022
37      0.125561
39      0.396861
40      0.100897
41      0.096413
42      0.170404
          ...   
4961    0.123318
4962    0.087444
4964    0.112108
4965    0.170404
4966    0.390135
4967    0.147982
4968    0.278027
4969    0.278027
4970    0.378924
4971    0.408072
4974    0.271300
4975    0.280269
4977    0.188341
4978    0.085202
4981    0.269058
4982    0.217489
4984    0.139013
4985    0.159193
4986    0.466368
4988    0.293722
4989    0.262332
4990    0.159193
4991    0.112108
4992    0.230942
4993    0.094170
4994    0.165919
4995    0.091928
4996    0.4461

In [15]:
RFModel = RandomForestRegressor(n_jobs=2, random_state=0)

RFModel.fit(train[features], y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
RFModel.predict(test[features])

array([0.05313901, 0.27713004, 0.30426009, ..., 0.25717489, 0.16524664,
       0.45627803])

In [17]:
features

Index(['Age', 'Sex', 'Weight  in lbs', 'Height in Inch', 'IQ',
       'Units of alcohol per day', 'Cigarettes per day', 'Active', 'BMI'],
      dtype='object')

In [18]:
test

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI,is_train
7,0.870968,0.0,0.642857,0.518519,0.348837,0.142857,0.000,0.0,0.069507,0.577647,False
8,0.564516,1.0,0.535714,0.370370,0.465116,0.000000,0.000,0.5,0.273543,0.602877,False
10,0.596774,1.0,0.335714,0.444444,0.627907,0.000000,0.000,0.5,0.354260,0.352126,False
13,0.774194,1.0,0.314286,0.333333,0.627907,0.000000,0.000,0.0,0.179372,0.408698,False
17,0.467742,1.0,0.364286,0.185185,0.558140,0.142857,0.000,0.0,0.112108,0.587510,False
18,0.838710,0.0,0.742857,0.666667,0.441860,0.285714,0.000,0.0,0.067265,0.545726,False
19,0.548387,1.0,0.342857,0.333333,0.418605,0.142857,0.000,0.5,0.286996,0.438087,False
20,0.725806,1.0,0.457143,0.296296,0.279070,0.000000,0.000,0.5,0.278027,0.588270,False
21,0.500000,0.0,0.742857,0.555556,0.232558,0.285714,0.725,0.0,0.049327,0.634266,False
23,0.387097,1.0,0.421429,0.111111,0.558140,0.285714,0.000,0.0,0.112108,0.732473,False


In [19]:
preds = RFModel.predict(test[features])

In [20]:
preds

array([0.05313901, 0.27713004, 0.30426009, ..., 0.25717489, 0.16524664,
       0.45627803])

In [21]:
#pd.crosstab(test['Health Score (high is good)'], preds, rownames=['Actual health score'], colnames=['Predicted health score'])

In [22]:
#Determine number of accurate predictions - a prediction is deemed to be accurate if it is within 10% of the true health score

actualValues = test['Health Score (high is good)'].values
totalNumValues = len(test)

numAccurateResults = 0

for i in range(0, len(preds)):
    if abs(preds[i] - actualValues[i]) < (0.1 * healthScores['Health Score (high is good)'].max()):
        numAccurateResults += 1
        
percentAccurateResults = (numAccurateResults / totalNumValues) * 100
percentAccurateResults

95.63106796116504

In [23]:
list(zip(train[features], RFModel.feature_importances_))

[('Age', 0.4329210433701089),
 ('Sex', 0.11383291267302029),
 ('Weight  in lbs', 0.024064325007814132),
 ('Height in Inch', 0.009580099071805978),
 ('IQ', 0.01196722833842886),
 ('Units of alcohol per day', 0.011848794075056573),
 ('Cigarettes per day', 0.08176462144318318),
 ('Active', 0.2841385612321855),
 ('BMI', 0.02988241478839649)]

In [24]:
#Load in and prepare data for 20 individuals for which health score predictions will be made
#Columns need to be made to match healthScores data, and BMI attribute needs to be added

population = pd.read_csv('Population.csv')

#Add a column (feature) to the health score data set for BMI (BMI is a value calculated from weight and height)
temp = population.copy()

#convert weights from lbs to kg
temp['Weight in kg'] = temp['Weight  in lbs'] * 0.453592

#convert heights from inches to metres
temp['Height in m'] = temp['Height in Inch'] * 0.0254

#calculate BMIs and store them in a new column titled "BMI"
population['BMI'] = temp['Weight in kg'] / (temp['Height in m'] **2)

population = population.drop('Person Id', 1)
population = population.drop('Health Score (high is good)', 1)

In [25]:
#Convert columns containing non-numerical data into numerical data so they can be processed by the random forest model

#Sex: Male = 0, Female = 1
#Health Score (high is good): Inactive = 0, Active = 1, Very active = 2

population['Sex'] = population['Sex'].map({'Male': 0, 'Female': 1})
population['Active'] = population['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2})

In [26]:
#Also, normalise the data in each column to improve results

min_max_scaler = preprocessing.MinMaxScaler()

columns = population.columns
#columns = columns.delete(8)

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

for i in range(0, len(columns)):
    x_scaled = min_max_scaler.fit_transform(population[[columns[i]]].values.astype(float))
    population[columns[i]] = pd.DataFrame(x_scaled)

population

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI
0,0.770492,0.0,1.0,0.454545,0.842105,0.0,0.0,0.5,1.0
1,0.459016,1.0,0.010753,0.181818,0.789474,0.0,0.0,0.5,0.09016
2,0.540984,1.0,0.397849,0.636364,0.894737,0.2,0.0,0.0,0.302214
3,0.311475,1.0,0.301075,0.181818,0.421053,0.2,0.0,0.5,0.414449
4,0.967213,1.0,0.172043,0.272727,0.578947,0.0,0.0,0.5,0.229753
5,1.0,1.0,0.021505,0.454545,0.473684,0.2,0.0,0.0,0.0
6,0.540984,1.0,0.225806,0.272727,0.789474,0.2,0.0,0.5,0.288028
7,0.377049,0.0,0.849462,0.545455,0.789474,1.0,0.0,0.5,0.790978
8,0.245902,0.0,0.935484,0.363636,0.315789,0.0,0.0,0.5,0.994299
9,0.622951,1.0,0.075269,0.0,0.368421,0.0,0.0,1.0,0.242276


In [27]:
#Perform the predictions for the 20 individuals with unknown health scores, and un-normalise the results

actualPredictions = RFModel.predict(population[features])

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

for i in range(0, len(actualPredictions)):
    actualPredictions[i] = (actualPredictions[i] * (maxHealthScore - minHealthScore)) + minHealthScore
    
actualPredictions

array([125.2, 238.8, 149.5, 244.1, 153.9,  99.2, 212.1, 150.2, 185.6,
       237.4,  87.7, 166.9, 113.4, 368.8, 279.1, 133.3, 154.2,  64.9,
       105. , 313.2])

In [28]:
population['Predicted health scores'] = actualPredictions
population

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI,Predicted health scores
0,0.770492,0.0,1.0,0.454545,0.842105,0.0,0.0,0.5,1.0,125.2
1,0.459016,1.0,0.010753,0.181818,0.789474,0.0,0.0,0.5,0.09016,238.8
2,0.540984,1.0,0.397849,0.636364,0.894737,0.2,0.0,0.0,0.302214,149.5
3,0.311475,1.0,0.301075,0.181818,0.421053,0.2,0.0,0.5,0.414449,244.1
4,0.967213,1.0,0.172043,0.272727,0.578947,0.0,0.0,0.5,0.229753,153.9
5,1.0,1.0,0.021505,0.454545,0.473684,0.2,0.0,0.0,0.0,99.2
6,0.540984,1.0,0.225806,0.272727,0.789474,0.2,0.0,0.5,0.288028,212.1
7,0.377049,0.0,0.849462,0.545455,0.789474,1.0,0.0,0.5,0.790978,150.2
8,0.245902,0.0,0.935484,0.363636,0.315789,0.0,0.0,0.5,0.994299,185.6
9,0.622951,1.0,0.075269,0.0,0.368421,0.0,0.0,1.0,0.242276,237.4


In [29]:
#preds.reshape(1, -1)
#actualValues.reshape(1, -1)
#RFModel.score(preds, actualValues)