In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [2]:
pip install pandasql

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the BMI dataset

BMI_Data = pd.read_csv(r"C:\Users\badda\Downloads\BMI_Analysis_V1.1 (1).csv", header=0)

# Copy the file to back-up file

BMI_Data_bk = BMI_Data.copy()

# Display first 5 records

BMI_Data.head()

Unnamed: 0,Height_M,Weight_kg,Fat_age,BMI
0,1.6,49.44,23.9,19.31
1,1.65,62.6,28.8,22.96
2,1.65,75.75,32.4,27.79
3,1.53,48.99,25.8,20.92
4,1.45,43.09,22.5,20.38


In [4]:
# Display last 5 records

BMI_Data.tail()

Unnamed: 0,Height_M,Weight_kg,Fat_age,BMI
87,1.49,39.01,23.3,17.52
88,1.6,41.28,20.1,16.12
89,1.42,38.1,30.3,18.83
90,1.4,30.16,20.6,15.46
91,1.45,38.56,26.0,18.39


In [5]:
# display the records randomly

BMI_Data.sample(5)

Unnamed: 0,Height_M,Weight_kg,Fat_age,BMI
75,1.5,46.95,30.2,20.73
74,1.42,38.78,26.9,19.17
15,1.52,62.37,37.9,26.85
9,1.48,44.45,26.4,20.31
63,1.65,81.99,35.9,30.08


In [6]:
# Display the size of the dataset

BMI_Data.shape

(92, 4)

In [7]:
# Display the dataset information

BMI_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Height_M   92 non-null     float64
 1   Weight_kg  92 non-null     float64
 2   Fat_age    92 non-null     float64
 3   BMI        92 non-null     float64
dtypes: float64(4)
memory usage: 3.0 KB


In [8]:
# Display Descriptive Statistics

BMI_Data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Height_M,92.0,1.511522,0.074035,1.33,1.46,1.5,1.57,1.66
Weight_kg,92.0,47.865,13.077016,29.26,38.9525,45.475,52.705,87.54
Fat_age,92.0,26.961957,7.142888,15.6,21.075,25.5,31.975,46.0
BMI,92.0,20.705435,4.325066,14.99,17.7075,19.48,22.735,34.46


In [9]:
# Display the columns in BMI dataset

BMI_Data.columns

Index(['Height_M', 'Weight_kg', 'Fat_age', 'BMI'], dtype='object')

In [10]:
# Prepare cols1 for scaling     #scalling to make the variables in common scale/normalisation of data

cols1 = ['Height_M', 'Weight_kg', 'Fat_age']

In [11]:
# Identify the independent and Target variables

IndepVar = []
for col in BMI_Data.columns:
    if col != 'BMI':
        IndepVar.append(col)

TargetVar = 'BMI'

x = BMI_Data[IndepVar]
y = BMI_Data[TargetVar]

In [12]:
x.head()

Unnamed: 0,Height_M,Weight_kg,Fat_age
0,1.6,49.44,23.9
1,1.65,62.6,28.8
2,1.65,75.75,32.4
3,1.53,48.99,25.8
4,1.45,43.09,22.5


In [13]:
y.head()

0    19.31
1    22.96
2    27.79
3    20.92
4    20.38
Name: BMI, dtype: float64

In [14]:
# Split the data into train and test

from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=143)

# Display the shape of the train_data and test_data

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((64, 3), (64,), (28, 3), (28,))

In [15]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))   #normalise the data in 0 to 1 range

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)
#cols1 = ['Height_M', 'Weight_kg', 'Fat_age']

In [16]:
x_train.head()

Unnamed: 0,Height_M,Weight_kg,Fat_age
64,0.580645,0.433669,0.680921
36,0.225806,0.014456,0.049342
14,0.354839,0.261299,0.203947
63,0.967742,0.964867,0.667763
54,0.935484,0.87978,0.875


In [17]:
x_test.head()

Unnamed: 0,Height_M,Weight_kg,Fat_age
41,0.363636,0.043569,0.026087
5,0.848485,0.391426,0.2
86,0.484848,0.199721,0.304348
30,0.393939,0.138376,0.008696
56,0.787879,1.0,0.891304


# Multiple Regression Algorithm

In [18]:
# Build the multi regression model

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()
#ModelMLR = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None, positive=False)                   

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
#print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)


Mean Absolute Error (MAE): 0.932
Mean Squared Error (MSE): 1.004
Root Mean Squared Error (RMSE): 1.002
R2_score: 0.943735
Mean Absolute Percentage Error (MAPE): 4.533 %
Adj R Square:  0.941817


In [19]:
Results = pd.DataFrame({'BMI_A':y_test, 'BMI_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = BMI_Data_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Height_M,Weight_kg,Fat_age,BMI,BMI_A,BMI_P
35,1.63,47.17,28.8,17.71,17.71,17.088247
65,1.59,61.23,33.0,24.3,24.3,23.320095
30,1.46,38.1,17.7,17.86,17.86,16.943678
66,1.53,73.37,40.5,31.33,31.33,29.450627
86,1.49,41.62,24.5,18.69,18.69,17.911498


In [20]:
# Compare with all Regression / Regressors

In [21]:
# Load the result dataset

RGRResults = pd.read_csv(r"C:\Users\badda\Downloads\RGRResults.csv", header=0)

RGRResults.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
