In [18]:
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
#Read in health score data
healthScores = pd.read_csv('HealthScores.csv')

#Add a column (feature) to the health score data set for BMI (BMI is a value calculated from weight and height)

#BMI = weight (in kg) / (height (in m))^2
#See https://www.nhs.uk/chq/Pages/how-can-i-work-out-my-bmi.aspx

#copy healthScores data frame into a temporary data frame
temp = healthScores.copy()

#convert weights from lbs to kg
temp['Weight in kg'] = temp['Weight  in lbs'] * 0.453592

#convert heights from inches to metres
temp['Height in m'] = temp['Height in Inch'] * 0.0254

#calculate BMIs and store them in a new column titled "BMI"
healthScores['BMI'] = temp['Weight in kg'] / (temp['Height in m'] **2)

healthScores.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
0,32,Female,101,67,110,0,0,Inactive,251,15.818661
1,47,Female,138,67,84,2,0,Active,193,21.613616
2,25,Male,166,73,106,2,5,Inactive,167,21.900817
3,75,Male,184,73,99,2,0,Inactive,93,24.275605
4,72,Female,105,65,99,2,0,Active,150,17.472721


In [20]:
#rows, columns of table
healthScores.shape

(4999, 10)

In [21]:
#Convert columns containing non-numerical data into numerical data so they can be processed by the random forest model

#Sex: Male = 0, Female = 1
#Health Score (high is good): Inactive = 0, Active = 1, Very active = 2

healthScores['Sex'] = healthScores['Sex'].map({'Male': 0, 'Female': 1})
healthScores['Active'] = healthScores['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2})

#Also, normalise the data in each column to improve results

min_max_scaler = preprocessing.MinMaxScaler()

columns = healthScores.columns
#columns = columns.delete(8)

#Formulae for normalising and un-normalising values:
#normalisedVal = (x - min) / (max - min)
#x = (normalisedVal * (max - min)) + min

#minHealthScore = healthScores['Health Score (high is good)'].min()
#maxHealthScore = healthScores['Health Score (high is good)'].max()

#for i in range(0, len(columns)):
    #x_scaled = min_max_scaler.fit_transform(healthScores[[columns[i]]].values.astype(float))
    #healthScores[columns[i]] = pd.DataFrame(x_scaled)
    
healthScores.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,Health Score (high is good),BMI
0,32,1,101,67,110,0,0,0,251,15.818661
1,47,1,138,67,84,2,0,1,193,21.613616
2,25,0,166,73,106,2,5,0,167,21.900817
3,75,0,184,73,99,2,0,0,93,24.275605
4,72,1,105,65,99,2,0,1,150,17.472721


In [22]:
#create a new data frame (x) from the old one, but drop the health score column
#create a new data frame (y) from the health score column

x = healthScores.drop('Health Score (high is good)', axis=1)
y = healthScores['Health Score (high is good)']

y

0       251
1       193
2       167
3        93
4       150
5       120
6       167
7        90
8       181
9       242
10      217
11      124
12      139
13      139
14      264
15      294
16       96
17      109
18       89
19      187
20      183
21       81
22      286
23      109
24      117
25      150
26      137
27      279
28      210
29      127
       ... 
4969    183
4970    228
4971    241
4972     85
4973     94
4974    180
4975    184
4976    137
4977    143
4978     97
4979    177
4980    135
4981    179
4982    156
4983    132
4984    121
4985    130
4986    267
4987    273
4988    190
4989    176
4990    130
4991    109
4992    162
4993    101
4994    133
4995    100
4996    258
4997    125
4998    378
Name: Health Score (high is good), Length: 4999, dtype: int64

In [23]:
x.head()

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI
0,32,1,101,67,110,0,0,0,15.818661
1,47,1,138,67,84,2,0,1,21.613616
2,25,0,166,73,106,2,5,0,21.900817
3,75,0,184,73,99,2,0,0,24.275605
4,72,1,105,65,99,2,0,1,17.472721


In [24]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [25]:
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

x_train

array([[ 1.70051926,  0.7755336 ,  0.81954636, ..., -0.41618013,
        -0.89851619,  2.0282514 ],
       [-0.06367499,  0.7755336 , -0.2083428 , ..., -0.41618013,
        -0.89851619, -0.57246071],
       [-0.72524783,  0.7755336 , -1.82924493, ..., -0.41618013,
         0.44033007, -2.64689425],
       ...,
       [-0.50472355,  0.7755336 , -1.43390295, ..., -0.41618013,
         0.44033007, -1.96261353],
       [-0.55985462,  0.7755336 , -0.95949257, ..., -0.41618013,
        -0.89851619, -0.18326222],
       [-0.94577212,  0.7755336 , -0.72228738, ..., -0.41618013,
        -0.89851619, -1.15542802]])

In [26]:
reg = linear_model.LinearRegression()
reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
predictions = reg.predict(x_test)
predictions

array([143.35464229, 129.42002909, 174.20412097, ..., 294.9793056 ,
       162.3835891 ,  70.09234292])

In [28]:
#Determine number of accurate predictions - a prediction is deemed to be accurate if it is within 10% of the true health score

actualValues = y_test.values
totalNumValues = len(y_test)

numAccurateResults = 0

for i in range(0, len(predictions)):
    if abs(predictions[i] - actualValues[i]) < (0.1 * healthScores['Health Score (high is good)'].max()):
        numAccurateResults += 1
    
percentAccurateResults = (numAccurateResults / totalNumValues) * 100
percentAccurateResults

91.52

In [29]:
#Load in and prepare data for 20 individuals for which health score predictions will be made
#Columns need to be made to match healthScores data, and BMI attribute needs to be added

population = pd.read_csv('Population.csv')

#Add a column (feature) to the health score data set for BMI (BMI is a value calculated from weight and height)
temp = population.copy()

#convert weights from lbs to kg
temp['Weight in kg'] = temp['Weight  in lbs'] * 0.453592

#convert heights from inches to metres
temp['Height in m'] = temp['Height in Inch'] * 0.0254

#calculate BMIs and store them in a new column titled "BMI"
population['BMI'] = temp['Weight in kg'] / (temp['Height in m'] **2)

population = population.drop('Person Id', 1)
population = population.drop('Health Score (high is good)', 1)

In [30]:
#Convert columns containing non-numerical data into numerical data so they can be processed by the random forest model

#Sex: Male = 0, Female = 1
#Health Score (high is good): Inactive = 0, Active = 1, Very active = 2

population['Sex'] = population['Sex'].map({'Male': 0, 'Female': 1})
population['Active'] = population['Active'].map({'Inactive': 0, 'Active': 1, 'Very Active': 2})

In [31]:
#Also, normalise the data in each column to improve results

min_max_scaler = preprocessing.MinMaxScaler()

columns = population.columns

inputs = scaler.transform(population)
population

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI
0,65,0,191,69,104,0,0,1,28.205457
1,46,1,99,66,103,0,0,1,15.978841
2,51,1,135,71,105,1,0,0,18.82847
3,37,1,126,66,96,1,0,1,20.336707
4,77,1,114,67,99,0,0,1,17.854726
5,79,1,100,69,97,1,0,0,14.767255
6,51,1,119,67,103,1,0,1,18.637828
7,41,0,177,70,103,5,0,1,25.396574
8,33,0,185,68,94,0,0,1,28.128842
9,56,1,105,64,95,0,0,2,18.023009


In [32]:
#Perform the predictions for the 20 individuals with unknown health scores

actualPredictions = reg.predict(inputs)
actualPredictions

array([120.9802755 , 235.13030534, 160.24533515, 238.95513606,
       157.54189   , 106.49477209, 209.6687886 , 152.34992402,
       197.802032  , 255.40433042,  89.4057399 , 182.69671924,
       126.60536721, 286.92627916, 280.09822621, 129.81112645,
       137.61711444,  28.46212125, 151.507361  , 260.16370232])

In [33]:
population['Predicted health scores'] = actualPredictions
population

Unnamed: 0,Age,Sex,Weight in lbs,Height in Inch,IQ,Units of alcohol per day,Cigarettes per day,Active,BMI,Predicted health scores
0,65,0,191,69,104,0,0,1,28.205457,120.980275
1,46,1,99,66,103,0,0,1,15.978841,235.130305
2,51,1,135,71,105,1,0,0,18.82847,160.245335
3,37,1,126,66,96,1,0,1,20.336707,238.955136
4,77,1,114,67,99,0,0,1,17.854726,157.54189
5,79,1,100,69,97,1,0,0,14.767255,106.494772
6,51,1,119,67,103,1,0,1,18.637828,209.668789
7,41,0,177,70,103,5,0,1,25.396574,152.349924
8,33,0,185,68,94,0,0,1,28.128842,197.802032
9,56,1,105,64,95,0,0,2,18.023009,255.40433


In [34]:
m = reg.coef_[0]
b = reg.intercept_

print("slope =", m, "intercept =", b)

slope = -42.66616159380895 intercept = 169.51507068551615
