In [2]:
# Data Rush: UNT's Ultimate Regression Challenge

import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("bodyfat-comp.csv")
df.head()

Unnamed: 0,Id,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,Person2,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
1,Person3,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
2,Person4,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
3,Person6,1.0502,20.9,24,210.25,74.75,39.0,104.5,94.4,107.8,66.0,42.0,25.6,35.7,30.6,18.8
4,Person7,1.0549,19.2,26,181.0,69.75,36.4,105.1,90.7,100.3,58.4,38.3,22.9,31.9,27.8,17.7


In [4]:
df.isnull().sum()

Id         0
Density    0
BodyFat    0
Age        0
Weight     0
Height     0
Neck       0
Chest      0
Abdomen    0
Hip        0
Thigh      0
Knee       0
Ankle      0
Biceps     0
Forearm    0
Wrist      0
dtype: int64

In [5]:
df.isna().value_counts()

Id     Density  BodyFat  Age    Weight  Height  Neck   Chest  Abdomen  Hip    Thigh  Knee   Ankle  Biceps  Forearm  Wrist
False  False    False    False  False   False   False  False  False    False  False  False  False  False   False    False    168
dtype: int64

# Exploratory Data Analysis

In [6]:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objs as go

In [7]:
fig = px.box(df, x="BodyFat", title="Distribution of BodyFat")

fig.update_traces(marker_color='black', marker_line_color='black')
fig.update_layout(plot_bgcolor='white')
fig.show()

In [8]:
# The data is with people between ages 22 and 72 and has an av.....
fig = px.box(df, x='Age', title='Age Distribution')
fig.update_traces(marker_color='black', marker_line_color='black')
fig.update_layout(plot_bgcolor='white')
fig.show()

In [9]:
# we have an outlier.....
fig = px.scatter(df, x='Height', y='Weight', title='Height vs. Weight', trendline= 'ols')

# Customize marker color
fig.update_traces(marker=dict(color='black'))

# Customize background color
fig.update_layout(plot_bgcolor='white')

# Customize marker size
fig.update_traces(marker=dict(size=10))
fig.show()

In [10]:
for i in ['Age', 'Height', 'Weight', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']:
    #fig = px.scatter(df, x=i, y='BodyFat', title= i + " vs Body Fat")
    #fig.add_trace(px.scatter(df, x= i, y="BodyFat").trendline(line=dict(color="red", width=3, dash="dash")))
    fig = px.scatter(df, x=i, y='BodyFat', trendline="ols")

    # Customize marker color
    fig.update_traces(marker=dict(color='black'))

    # Customize background color
    fig.update_layout(plot_bgcolor='white')
    #fig.add_trace(px.scatter(df, x=i, y="BodyFat").trendline)

    # Customize marker size
    fig.update_traces(marker=dict(size=10))
    fig.show()

In [11]:
for i in ['Age', 'Height', 'Weight', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']:
    fig = px.histogram(df, x = i)
    fig.update_traces(marker_color='black', marker_line_color='black')
    fig.update_layout(plot_bgcolor='white')
    # Show plot
    fig.show()

In [12]:
# Identify outliers based on Z-score method
z_scores = (df - df.mean()) / df.std()
outliers = z_scores.abs() > 3
df = df[~outliers.any(axis=1)]

# Identify outliers based on IQR method
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = (df < lower_bound) | (df > upper_bound)
df = df[~outliers.any(axis=1)]

# Remove outliers based on visual inspection
#df = df[df['column'] < max_value]


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.


Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version.  Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`



In [13]:
for i in ['Age', 'Height', 'Weight', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist']:
    fig = px.histogram(df, x = i)
    fig.update_traces(marker_color='black', marker_line_color='black')
    fig.update_layout(plot_bgcolor='white')
    # Show plot
    fig.show()

# Feature Engineering 

In [14]:
# Feature Engineering (BMI)
df['Height_m'] = df['Height'] / 100

# calculate BMI
df['BMI'] = df['Weight'] / (df['Height_m'] ** 2)

In [15]:
#correlation matrix
corr_matrix = df.corr()
fig = go.Figure(
    data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        colorscale='gray',
        colorbar=dict(title='Correlation')
    )
)
fig.update_layout(
    title='Correlation Matrix',
    xaxis=dict(title='Attributes'),
    yaxis=dict(title='Attributes'),
    height = 1000,
    width = 1000
)
fig.show()


In [16]:
df.head()

Unnamed: 0,Id,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist,Height_m,BMI
0,Person2,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2,0.7225,331.892578
1,Person3,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6,0.6625,350.872197
2,Person4,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2,0.7225,353.922965
3,Person6,1.0502,20.9,24,210.25,74.75,39.0,104.5,94.4,107.8,66.0,42.0,25.6,35.7,30.6,18.8,0.7475,376.282144
4,Person7,1.0549,19.2,26,181.0,69.75,36.4,105.1,90.7,100.3,58.4,38.3,22.9,31.9,27.8,17.7,0.6975,372.040441


# Raw Regression Moedels
-  Linear Regression
- Decision treeregression
- random forest regression


In [18]:
# Import required libraries
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Define a list of regression models to loop through
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]
X = df[['Age', 'Height', 'Weight', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist','BMI']]
y = df['BodyFat']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state = 1)

# Loop through each model and fit the training data
for model in models:
    model.fit(X_train, y_train)
    
    # Predict on the validation data and calculate RMSE
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    # Print the model name and RMSE
    print(type(model).__name__, "RMSE:", rmse)


LinearRegression RMSE: 4.5301645716492525
DecisionTreeRegressor RMSE: 6.907195873776106
RandomForestRegressor RMSE: 4.502543313542091


In [19]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error

lasso_reg = Lasso(alpha=0.1)
ridge_reg = Ridge(alpha=1)

# Create list of models
models = [lasso_reg, ridge_reg]

# Train and evaluate models
for model in models:
    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Predict on the validation data
    y_pred = model.predict(X_val)

    # Evaluate the model using RMSE
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    print(f'{model.__class__.__name__} RMSE: {rmse:.3f}')

Lasso RMSE: 4.365
Ridge RMSE: 4.513


# Scaling


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)

X_test_scaled = scaler.transform(X_test)

In [21]:
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]
X = df[['Age', 'Height', 'Weight', 'Neck', 'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm', 'Wrist','BMI']]
y = df['BodyFat']

for model_1 in models:
    model_1.fit(X_train_scaled, y_train)
    
    # Predict on the validation data and calculate RMSE
    y_pred = model_1.predict(X_test_scaled)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Print the model name and RMSE
    print(type(model_1).__name__, "RMSE:", rmse)


LinearRegression RMSE: 4.530164571649269
DecisionTreeRegressor RMSE: 6.975440788476982
RandomForestRegressor RMSE: 4.434576806372046


In [22]:
lasso_reg = Lasso(alpha=0.1)
ridge_reg = Ridge(alpha=1)

# Create list of models
models = [lasso_reg, ridge_reg]

# Train and evaluate models
for model_2 in models:
    # Fit the model to the training data
    model_2.fit(X_train_scaled, y_train)

    # Predict on the validation data
    y_pred = model_2.predict(X_test_scaled)

    # Evaluate the model using RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f'{model_2.__class__.__name__} RMSE: {rmse:.3f}')

Lasso RMSE: 4.213
Ridge RMSE: 4.398


In [23]:
# Removing some attributes
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# create a linear regression object
lr = LinearRegression()

for k in range (1, 14,1):
    
    # create the RFE model and select the top k attributes
    rfe = RFE(lr, n_features_to_select=k)
    rfe.fit(X_train, y_train)

    # transform the data to include only the selected attributes
    X_train_rfe = rfe.transform(X_train)
    X_val_rfe = rfe.transform(X_val)

    # train the model on the reduced set of attributes
    lr.fit(X_train_rfe, y_train)

    # evaluate the model on the validation set
    y_pred = lr.predict(X_val_rfe)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print('RMSE on validation set:', rmse)


RMSE on validation set: 6.957629485255731
RMSE on validation set: 4.506256830977455
RMSE on validation set: 4.5772136638668774
RMSE on validation set: 4.671898657593116
RMSE on validation set: 4.670168614624029
RMSE on validation set: 4.4838298347275645
RMSE on validation set: 4.43912561430811
RMSE on validation set: 4.533023049088705
RMSE on validation set: 4.556062755454569
RMSE on validation set: 4.539876603555159
RMSE on validation set: 4.515844609900709
RMSE on validation set: 4.505724345404238
RMSE on validation set: 4.543167396587774


In [24]:
'''For BMI, Height, Age, Ankle, Forearm, Wrist: LinearRegression RMSE: 4.472268805028441
DecisionTreeRegressor RMSE: 6.1685727946244295
RandomForestRegressor RMSE: 5.000366709133084'''

"""For BMI, Height, Ankle, Forearm, Wrist: LinearRegression RMSE: 4.418595462214984
DecisionTreeRegressor RMSE: 7.689582395641186
RandomForestRegressor RMSE: 5.307169940202926"""





'For BMI, Height, Ankle, Forearm, Wrist: LinearRegression RMSE: 4.418595462214984\nDecisionTreeRegressor RMSE: 7.689582395641186\nRandomForestRegressor RMSE: 5.307169940202926'

In [25]:
y = df['BodyFat']
X_corr = df.drop(['Id','Density', 'Neck', 'Chest', 'Abdomen','Hip','Thigh','Knee','Biceps','Height_m','BodyFat','Weight','Age'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X_corr, y, test_size = 0.2, random_state = 1)
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]
for model3 in models:
    model3.fit(X_train, y_train)
    
    # Predict on the validation data and calculate RMSE
    y_pred = model3.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Print the model name and RMSE
    print(type(model3).__name__, "RMSE:", rmse)

X_corr

LinearRegression RMSE: 4.179894701627197
DecisionTreeRegressor RMSE: 7.863061867713103
RandomForestRegressor RMSE: 4.853577330508477


Unnamed: 0,Height,Ankle,Forearm,Wrist,BMI
0,72.25,23.4,28.9,18.2,331.892578
1,66.25,24.0,25.2,16.6,350.872197
2,72.25,22.8,29.4,18.2,353.922965
3,74.75,25.6,30.6,18.8,376.282144
4,69.75,22.9,27.8,17.7,372.040441
...,...,...,...,...,...
161,65.75,24.0,30.5,19.1,438.924952
162,65.75,20.1,24.8,16.5,294.929810
164,69.25,22.6,27.3,18.5,324.258103
166,67.00,21.5,25.7,18.5,299.064380


In [26]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)

X_test_scaled = scaler.transform(X_test)

In [27]:
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

for model_3 in models:
    model_3.fit(X_train_scaled, y_train)
    
    # Predict on the validation data and calculate RMSE
    y_pred = model_3.predict(X_test_scaled)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    # Print the model name and RMSE
    print(type(model_3).__name__, "RMSE:", rmse)


LinearRegression RMSE: 4.179894701627196
DecisionTreeRegressor RMSE: 7.802005532078316
RandomForestRegressor RMSE: 4.944271282618808


In [28]:
lasso_reg = Lasso(alpha=0.1)
ridge_reg = Ridge(alpha=1)

# Create list of models
models = [lasso_reg, ridge_reg]

# Train and evaluate models
for model_4 in models:
    # Fit the model to the training data
    model_4.fit(X_train_scaled, y_train)

    # Predict on the validation data
    y_pred = model_4.predict(X_test_scaled)

    # Evaluate the model using RMSE
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f'{model_4.__class__.__name__} RMSE: {rmse:.3f}')

Lasso RMSE: 4.230
Ridge RMSE: 4.193


In [29]:
# Best Model So far: Linear Regression with Scaling and with only few attributes (BMI, Height, Ankle, Forearm, Wrist)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_corr, y, test_size = 0.2, random_state = 1)
best_model = Ridge(alpha=0.1)
best_model.fit(X_train_scaled, y_train)
# Predict on the validation data and calculate RMSE
y_pred = best_model.predict(X_test_scaled)
rmse = mean_squared_error(y_test, y_pred, squared=False)
    
# Print the model name and RMSE
print(type(best_model).__name__, "RMSE:", rmse)

Ridge RMSE: 4.181137570252294


In [46]:
test_data = pd.read_csv('bodyfat-validate.csv')
test_data.head()

Unnamed: 0,Id,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,Person68,55,154.75,71.5,36.9,95.4,86.6,91.8,54.3,35.4,21.5,32.8,27.4,18.7
1,Person252,74,207.5,70.0,40.8,112.4,108.5,107.1,59.3,42.2,24.6,33.7,30.0,20.9
2,Person232,57,182.25,71.75,39.4,103.4,96.7,100.7,59.3,38.6,22.8,31.8,29.1,19.0
3,Person162,33,196.0,73.0,38.5,103.8,95.6,105.1,61.4,40.6,25.0,31.3,29.2,19.1
4,Person92,44,179.75,69.5,39.2,101.9,93.2,100.6,58.9,39.7,23.1,31.4,28.4,18.8


In [47]:
test_data['Height_m'] = test_data['Height'] / 100

# calculate BMI
test_data['BMI'] = test_data['Weight'] / (test_data['Height_m'] ** 2)

In [48]:
test_data.head()
X_test_final = test_data.drop(['Id','Age','Weight','Neck','Chest','Abdomen','Hip','Thigh','Knee','Biceps','Height_m'],axis = 1)

In [49]:
X_test_final_scaled = scaler.transform(X_test_final)

In [50]:
y_pred_final = best_model.predict(X_test_final_scaled)
pd.DataFrame(y_pred_final).head()

Unnamed: 0,0
0,11.418521
1,25.869357
2,17.945758
3,19.177391
4,21.160664


In [51]:
output = pd.DataFrame({'Id': test_data.Id, 'BodyFat': y_pred_final})
output.to_csv('predictions_Ridge.csv', index=False)