In [30]:
import pandas as pd
import numpy as np
from pycaret.regression import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import ExtraTreesRegressor

In [31]:
df = pd.read_csv("./bmi_data.csv")

In [32]:
df

Unnamed: 0,Sex,Age,Height(Inches),Weight(Pounds),BMI
0,Female,21,65.78331,112.9925,18.357646
1,Female,35,71.51521,136.4873,18.762652
2,Female,27,69.39874,153.0269,22.338985
3,Male,24,68.21660,142.3354,21.504612
4,Female,18,67.78781,144.2971,22.077669
...,...,...,...,...,...
24995,Male,35,69.50215,118.0312,17.179051
24996,Male,26,64.54826,120.1932,20.281947
24997,Female,23,64.69855,118.2655,19.864050
24998,Male,20,67.52918,132.2682,20.392540


In [33]:
df.isnull().sum()

Sex                0
Age                0
Height(Inches)    19
Weight(Pounds)    16
BMI               50
dtype: int64

In [34]:
df = df.dropna()
df.reset_index(inplace=True,drop=True)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24950 entries, 0 to 24949
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             24950 non-null  object 
 1   Age             24950 non-null  int64  
 2   Height(Inches)  24950 non-null  float64
 3   Weight(Pounds)  24950 non-null  float64
 4   BMI             24950 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 974.7+ KB


In [36]:
# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
t = (df.dtypes == "object")
object_cols = list(t[t].index)

for i in object_cols:
    df[i] = label_encoder.fit_transform(df[i])
df

Unnamed: 0,Sex,Age,Height(Inches),Weight(Pounds),BMI
0,0,21,65.78331,112.9925,18.357646
1,0,35,71.51521,136.4873,18.762652
2,0,27,69.39874,153.0269,22.338985
3,1,24,68.21660,142.3354,21.504612
4,0,18,67.78781,144.2971,22.077669
...,...,...,...,...,...
24945,1,35,69.50215,118.0312,17.179051
24946,1,26,64.54826,120.1932,20.281947
24947,0,23,64.69855,118.2655,19.864050
24948,1,20,67.52918,132.2682,20.392540


In [37]:
df["Height(Inches)"] = df["Height(Inches)"].apply(lambda hei:hei*2.54)
df["Weight(Pounds)"] = df["Weight(Pounds)"].apply(lambda wei:wei/2.205)

In [38]:
df.rename(columns = {"Height(Inches)":"Height","Weight(Pounds)":"Weight"},inplace = True)
df

Unnamed: 0,Sex,Age,Height,Weight,BMI
0,0,21,167.089607,51.243764,18.357646
1,0,35,181.648633,61.899002,18.762652
2,0,27,176.272800,69.399955,22.338985
3,1,24,173.270164,64.551202,21.504612
4,0,18,172.181037,65.440862,22.077669
...,...,...,...,...,...
24945,1,35,176.535461,53.528889,17.179051
24946,1,26,163.952580,54.509388,20.281947
24947,0,23,164.334317,53.635147,19.864050
24948,1,20,171.524117,59.985578,20.392540


In [39]:
X = df.drop("BMI",axis = 1)
Y = df["BMI"]

In [40]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=27,test_size = 0.2)

In [41]:
reg = ExtraTreesRegressor()
reg.fit(X_train,Y_train)

In [42]:
Y_pred = reg.predict(X_test)
Y_pred

array([18.41773615, 18.40503179, 17.82148122, ..., 15.89173015,
       18.09655906, 18.36460605])

In [43]:
r2_score(Y_pred,Y_test)

0.9996513450306402