# Import libraries and data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv(r"D:\cources\2024\univisity_project\regression_data\Body Fat Prediction Dataset\bodyfat.csv")
df

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.00,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.0340,28.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,1.0736,11.0,70,134.25,67.00,34.9,89.2,83.6,88.8,49.6,34.8,21.5,25.6,25.7,18.5
248,1.0236,33.6,72,201.00,69.75,40.9,108.5,105.0,104.5,59.6,40.8,23.2,35.2,28.6,20.1
249,1.0328,29.3,72,186.75,66.00,38.9,111.1,111.5,101.7,60.3,37.3,21.5,31.3,27.2,18.0
250,1.0399,26.0,72,190.75,70.50,38.9,108.3,101.3,97.8,56.0,41.6,22.7,30.5,29.4,19.8


# Exploratory Data Analysis

In [4]:
df.shape

(252, 15)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Density  252 non-null    float64
 1   BodyFat  252 non-null    float64
 2   Age      252 non-null    int64  
 3   Weight   252 non-null    float64
 4   Height   252 non-null    float64
 5   Neck     252 non-null    float64
 6   Chest    252 non-null    float64
 7   Abdomen  252 non-null    float64
 8   Hip      252 non-null    float64
 9   Thigh    252 non-null    float64
 10  Knee     252 non-null    float64
 11  Ankle    252 non-null    float64
 12  Biceps   252 non-null    float64
 13  Forearm  252 non-null    float64
 14  Wrist    252 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 29.7 KB


In [6]:
# Basic statistics of numeric columns
numeric_stats = df.describe()
numeric_stats

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,1.055574,19.150794,44.884921,178.924405,70.14881,37.992063,100.824206,92.555952,99.904762,59.405952,38.590476,23.102381,32.273413,28.663889,18.229762
std,0.019031,8.36874,12.60204,29.38916,3.662856,2.430913,8.430476,10.783077,7.164058,5.249952,2.411805,1.694893,3.021274,2.020691,0.933585
min,0.995,0.0,22.0,118.5,29.5,31.1,79.3,69.4,85.0,47.2,33.0,19.1,24.8,21.0,15.8
25%,1.0414,12.475,35.75,159.0,68.25,36.4,94.35,84.575,95.5,56.0,36.975,22.0,30.2,27.3,17.6
50%,1.0549,19.2,43.0,176.5,70.0,38.0,99.65,90.95,99.3,59.0,38.5,22.8,32.05,28.7,18.3
75%,1.0704,25.3,54.0,197.0,72.25,39.425,105.375,99.325,103.525,62.35,39.925,24.0,34.325,30.0,18.8
max,1.1089,47.5,81.0,363.15,77.75,51.2,136.2,148.1,147.7,87.3,49.1,33.9,45.0,34.9,21.4


In [7]:
# Columns with missing values and their counts
missing_values = df.isnull().sum()
print(missing_values)


Density    0
BodyFat    0
Age        0
Weight     0
Height     0
Neck       0
Chest      0
Abdomen    0
Hip        0
Thigh      0
Knee       0
Ankle      0
Biceps     0
Forearm    0
Wrist      0
dtype: int64


In [10]:
# Calculate the minimum and maximum values for each column
min_values = df.min()
max_values = df.max()

# Print the results
print("Minimum values:")
print(min_values)
print("\nMaximum values:")
print(max_values)

Minimum values:
Density      0.995
BodyFat      0.000
Age         22.000
Weight     118.500
Height      29.500
Neck        31.100
Chest       79.300
Abdomen     69.400
Hip         85.000
Thigh       47.200
Knee        33.000
Ankle       19.100
Biceps      24.800
Forearm     21.000
Wrist       15.800
dtype: float64

Maximum values:
Density      1.1089
BodyFat     47.5000
Age         81.0000
Weight     363.1500
Height      77.7500
Neck        51.2000
Chest      136.2000
Abdomen    148.1000
Hip        147.7000
Thigh       87.3000
Knee        49.1000
Ankle       33.9000
Biceps      45.0000
Forearm     34.9000
Wrist       21.4000
dtype: float64


In [8]:
# Finding duplicate rows
duplicate_rows = df[df.duplicated(keep='first')]

# Number of duplicate rows
num_duplicates = duplicate_rows.shape[0]

# Displaying the duplicate rows
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

Number of duplicate rows: 0


Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist


In [9]:
# Get the number of unique values in each column
unique_values_count = df.nunique()

unique_values_count

Density    218
BodyFat    176
Age         51
Weight     197
Height      48
Neck        90
Chest      174
Abdomen    185
Hip        152
Thigh      139
Knee        90
Ankle       61
Biceps     104
Forearm     77
Wrist       44
dtype: int64

# Supervised Models

In [12]:
column_names = df.columns
print(column_names)

Index(['Density', 'BodyFat', 'Age', 'Weight', 'Height', 'Neck', 'Chest',
       'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'Biceps', 'Forearm',
       'Wrist'],
      dtype='object')


### Random Forest Regressor

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Create and train the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_rf = rf_model.predict(X_test)

# Model evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
r_squared_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - Mean Squared Error (MSE): {mse_rf}")
print(f"Random Forest - R-squared (R2): {r_squared_rf}")

# Train set predictions
y_train_pred_rf = rf_model.predict(X_train)
train_mse_rf = mean_squared_error(y_train, y_train_pred_rf)
train_r_squared_rf = r2_score(y_train, y_train_pred_rf)

# Validation set predictions
y_val_pred_rf = rf_model.predict(X_val)
val_mse_rf = mean_squared_error(y_val, y_val_pred_rf)
val_r_squared_rf = r2_score(y_val, y_val_pred_rf)

# Test set predictions
y_test_pred_rf = rf_model.predict(X_test)
test_mse_rf = mean_squared_error(y_test, y_test_pred_rf)
test_r_squared_rf = r2_score(y_test, y_test_pred_rf)

print("\nRandom Forest - Training Set:")
print(f"Mean Squared Error (MSE): {train_mse_rf}")
print(f"R-squared (R2): {train_r_squared_rf}")
print("\nRandom Forest - Validation Set:")
print(f"Mean Squared Error (MSE): {val_mse_rf}")
print(f"R-squared (R2): {val_r_squared_rf}")
print("\nRandom Forest - Test Set:")
print(f"Mean Squared Error (MSE): {test_mse_rf}")
print(f"R-squared (R2): {test_r_squared_rf}")

# Training Set Accuracy
training_accuracy_rf = train_r_squared_rf * 100
print("Random Forest - Training Set Accuracy: {:.2f}%".format(training_accuracy_rf))

# Validation Set Accuracy
validation_accuracy_rf = val_r_squared_rf * 100
print("Random Forest - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_rf))

# Test Set Accuracy
test_accuracy_rf = test_r_squared_rf * 100
print("Random Forest - Test Set Accuracy: {:.2f}%".format(test_accuracy_rf))


Random Forest - Mean Squared Error (MSE): 0.1283416078431395
Random Forest - R-squared (R2): 0.9972410375598688

Random Forest - Training Set:
Mean Squared Error (MSE): 0.5053235000000016
R-squared (R2): 0.9934852357155197

Random Forest - Validation Set:
Mean Squared Error (MSE): 0.35368247058823726
R-squared (R2): 0.9948329560384997

Random Forest - Test Set:
Mean Squared Error (MSE): 0.1283416078431395
R-squared (R2): 0.9972410375598688
Random Forest - Training Set Accuracy: 99.35%
Random Forest - Validation Set Accuracy: 99.48%
Random Forest - Test Set Accuracy: 99.72%


### Gradient Boosting Regressor

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

# Create and train the Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_gb = gb_model.predict(X_test)

# Model evaluation
mse_gb = mean_squared_error(y_test, y_pred_gb)
r_squared_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting - Mean Squared Error (MSE): {mse_gb}")
print(f"Gradient Boosting - R-squared (R2): {r_squared_gb}")

# Train set predictions
y_train_pred = gb_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r_squared = r2_score(y_train, y_train_pred)

# Validation set predictions
y_val_pred = gb_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r_squared = r2_score(y_val, y_val_pred)

# Test set predictions
y_test_pred = gb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r_squared = r2_score(y_test, y_test_pred)

print("Training Set:")
print(f"Mean Squared Error (MSE): {train_mse}")
print(f"R-squared (R2): {train_r_squared}")
print("\nValidation Set:")
print(f"Mean Squared Error (MSE): {val_mse}")
print(f"R-squared (R2): {val_r_squared}")
print("\nTest Set:")
print(f"Mean Squared Error (MSE): {test_mse}")
print(f"R-squared (R2): {test_r_squared}")

# Training Set Accuracy
training_accuracy = train_r_squared * 100
print("Training Set Accuracy: {:.2f}%".format(training_accuracy))

# Validation Set Accuracy
validation_accuracy = val_r_squared * 100
print("Validation Set Accuracy: {:.2f}%".format(validation_accuracy))

# Test Set Accuracy
test_accuracy = test_r_squared * 100
print("Test Set Accuracy: {:.2f}%".format(test_accuracy))


Gradient Boosting - Mean Squared Error (MSE): 0.2980091847993537
Gradient Boosting - R-squared (R2): 0.9935936898290972
Training Set:
Mean Squared Error (MSE): 0.005432102620788033
R-squared (R2): 0.9999299678955292

Validation Set:
Mean Squared Error (MSE): 0.898131644295757
R-squared (R2): 0.9868789491275255

Test Set:
Mean Squared Error (MSE): 0.2980091847993537
R-squared (R2): 0.9935936898290972
Training Set Accuracy: 99.99%
Validation Set Accuracy: 98.69%
Test Set Accuracy: 99.36%


### Support Vector Regression 

In [27]:
from sklearn.svm import SVR

# Create and train the Support Vector Machine Regressor
svm_model = SVR()
svm_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_svm = svm_model.predict(X_test)

# Model evaluation
mse_svm = mean_squared_error(y_test, y_pred_svm)
r_squared_svm = r2_score(y_test, y_pred_svm)

print(f"SVM - Mean Squared Error (MSE): {mse_svm}")
print(f"SVM - R-squared (R2): {r_squared_svm}")

# Train set predictions
y_train_pred_svm = svm_model.predict(X_train)
train_mse_svm = mean_squared_error(y_train, y_train_pred_svm)
train_r_squared_svm = r2_score(y_train, y_train_pred_svm)

# Validation set predictions
y_val_pred_svm = svm_model.predict(X_val)
val_mse_svm = mean_squared_error(y_val, y_val_pred_svm)
val_r_squared_svm = r2_score(y_val, y_val_pred_svm)

# Test set predictions
y_test_pred_svm = svm_model.predict(X_test)
test_mse_svm = mean_squared_error(y_test, y_test_pred_svm)
test_r_squared_svm = r2_score(y_test, y_test_pred_svm)

print("\nSVM - Training Set:")
print(f"Mean Squared Error (MSE): {train_mse_svm}")
print(f"R-squared (R2): {train_r_squared_svm}")
print("\nSVM - Validation Set:")
print(f"Mean Squared Error (MSE): {val_mse_svm}")
print(f"R-squared (R2): {val_r_squared_svm}")
print("\nSVM - Test Set:")
print(f"Mean Squared Error (MSE): {test_mse_svm}")
print(f"R-squared (R2): {test_r_squared_svm}")

# Training Set Accuracy
training_accuracy_svm = train_r_squared_svm * 100
print("SVM - Training Set Accuracy: {:.2f}%".format(training_accuracy_svm))

# Validation Set Accuracy
validation_accuracy_svm = val_r_squared_svm * 100
print("SVM - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_svm))

# Test Set Accuracy
test_accuracy_svm = test_r_squared_svm * 100
print("SVM - Test Set Accuracy: {:.2f}%".format(test_accuracy_svm))


SVM - Mean Squared Error (MSE): 31.253968766107008
SVM - R-squared (R2): 0.32813272811643335

SVM - Training Set:
Mean Squared Error (MSE): 45.73880474111219
R-squared (R2): 0.41032322553332123

SVM - Validation Set:
Mean Squared Error (MSE): 41.969147018360786
R-squared (R2): 0.3868612506866166

SVM - Test Set:
Mean Squared Error (MSE): 31.253968766107008
R-squared (R2): 0.32813272811643335
SVM - Training Set Accuracy: 41.03%
SVM - Validation Set Accuracy: 38.69%
SVM - Test Set Accuracy: 32.81%


# XGBRegressor

In [30]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Create and train the XGBoost Regressor
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Model evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r_squared_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost - Mean Squared Error (MSE): {mse_xgb}")
print(f"XGBoost - R-squared (R2): {r_squared_xgb}")

# Train set predictions
y_train_pred_xgb = xgb_model.predict(X_train)
train_mse_xgb = mean_squared_error(y_train, y_train_pred_xgb)
train_r_squared_xgb = r2_score(y_train, y_train_pred_xgb)

# Validation set predictions
y_val_pred_xgb = xgb_model.predict(X_val)
val_mse_xgb = mean_squared_error(y_val, y_val_pred_xgb)
val_r_squared_xgb = r2_score(y_val, y_val_pred_xgb)

# Test set predictions
y_test_pred_xgb = xgb_model.predict(X_test)
test_mse_xgb = mean_squared_error(y_test, y_test_pred_xgb)
test_r_squared_xgb = r2_score(y_test, y_test_pred_xgb)

print("\nXGBoost - Training Set:")
print(f"Mean Squared Error (MSE): {train_mse_xgb}")
print(f"R-squared (R2): {train_r_squared_xgb}")
print("\nXGBoost - Validation Set:")
print(f"Mean Squared Error (MSE): {val_mse_xgb}")
print(f"R-squared (R2): {val_r_squared_xgb}")
print("\nXGBoost - Test Set:")
print(f"Mean Squared Error (MSE): {test_mse_xgb}")
print(f"R-squared (R2): {test_r_squared_xgb}")

# Training Set Accuracy
training_accuracy_xgb = train_r_squared_xgb * 100
print("XGBoost - Training Set Accuracy: {:.2f}%".format(training_accuracy_xgb))

# Validation Set Accuracy
validation_accuracy_xgb = val_r_squared_xgb * 100
print("XGBoost - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_xgb))

# Test Set Accuracy
test_accuracy_xgb = test_r_squared_xgb * 100
print("XGBoost - Test Set Accuracy: {:.2f}%".format(test_accuracy_xgb))


XGBoost - Mean Squared Error (MSE): 0.3846504829476592
XGBoost - R-squared (R2): 0.9917311598875406

XGBoost - Training Set:
Mean Squared Error (MSE): 5.45455129072436e-07
R-squared (R2): 0.9999999929678481

XGBoost - Validation Set:
Mean Squared Error (MSE): 0.39488207181775026
R-squared (R2): 0.9942310597941223

XGBoost - Test Set:
Mean Squared Error (MSE): 0.3846504829476592
R-squared (R2): 0.9917311598875406
XGBoost - Training Set Accuracy: 100.00%
XGBoost - Validation Set Accuracy: 99.42%
XGBoost - Test Set Accuracy: 99.17%


# Ridge

In [31]:
from sklearn.linear_model import Ridge

# Create and train the Ridge Regression model
ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_ridge = ridge_model.predict(X_test)

# Model evaluation
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r_squared_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression - Mean Squared Error (MSE): {mse_ridge}")
print(f"Ridge Regression - R-squared (R2): {r_squared_ridge}")

# Training Set Accuracy
training_accuracy_ridge = ridge_model.score(X_train, y_train) * 100
print("Ridge Regression - Training Set Accuracy: {:.2f}%".format(training_accuracy_ridge))

# Validation Set Accuracy
validation_accuracy_ridge = ridge_model.score(X_val, y_val) * 100
print("Ridge Regression - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_ridge))

# Test Set Accuracy
test_accuracy_ridge = ridge_model.score(X_test, y_test) * 100
print("Ridge Regression - Test Set Accuracy: {:.2f}%".format(test_accuracy_ridge))


Ridge Regression - Mean Squared Error (MSE): 18.728449491982108
Ridge Regression - R-squared (R2): 0.5973940986197981
Ridge Regression - Training Set Accuracy: 77.30%
Ridge Regression - Validation Set Accuracy: 71.79%
Ridge Regression - Test Set Accuracy: 59.74%


# Lasso

In [None]:
# from sklearn.linear_model import Lasso

# Create and train the Lasso Regression model
lasso_model = Lasso(random_state=42)
lasso_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_lasso = lasso_model.predict(X_test)

# Model evaluation
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r_squared_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression - Mean Squared Error (MSE): {mse_lasso}")
print(f"Lasso Regression - R-squared (R2): {r_squared_lasso}")

# Training Set Accuracy
training_accuracy_lasso = lasso_model.score(X_train, y_train) * 100
print("Lasso Regression - Training Set Accuracy: {:.2f}%".format(training_accuracy_lasso))

# Validation Set Accuracy
validation_accuracy_lasso = lasso_model.score(X_val, y_val) * 100
print("Lasso Regression - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_lasso))

# Test Set Accuracy
test_accuracy_lasso = lasso_model.score(X_test, y_test) * 100
print("Lasso Regression - Test Set Accuracy: {:.2f}%".format(test_accuracy_lasso))


# ElasticNet

In [33]:
from sklearn.linear_model import ElasticNet

# Create and train the ElasticNet Regression model
elasticnet_model = ElasticNet(random_state=42)
elasticnet_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_elasticnet = elasticnet_model.predict(X_test)

# Model evaluation
mse_elasticnet = mean_squared_error(y_test, y_pred_elasticnet)
r_squared_elasticnet = r2_score(y_test, y_pred_elasticnet)

print(f"ElasticNet Regression - Mean Squared Error (MSE): {mse_elasticnet}")
print(f"ElasticNet Regression - R-squared (R2): {r_squared_elasticnet}")

# Training Set Accuracy
training_accuracy_elasticnet = elasticnet_model.score(X_train, y_train) * 100
print("ElasticNet Regression - Training Set Accuracy: {:.2f}%".format(training_accuracy_elasticnet))

# Validation Set Accuracy
validation_accuracy_elasticnet = elasticnet_model.score(X_val, y_val) * 100
print("ElasticNet Regression - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_elasticnet))

# Test Set Accuracy
test_accuracy_elasticnet = elasticnet_model.score(X_test, y_test) * 100
print("ElasticNet Regression - Test Set Accuracy: {:.2f}%".format(test_accuracy_elasticnet))


ElasticNet Regression - Mean Squared Error (MSE): 21.544705688496407
ElasticNet Regression - R-squared (R2): 0.536852975607953
ElasticNet Regression - Training Set Accuracy: 74.75%
ElasticNet Regression - Validation Set Accuracy: 71.15%
ElasticNet Regression - Test Set Accuracy: 53.69%


# KNeighborsRegressor

In [34]:
from sklearn.neighbors import KNeighborsRegressor

# Create and train the KNN Regression model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_knn = knn_model.predict(X_test)

# Model evaluation
mse_knn = mean_squared_error(y_test, y_pred_knn)
r_squared_knn = r2_score(y_test, y_pred_knn)

print(f"KNN Regression - Mean Squared Error (MSE): {mse_knn}")
print(f"KNN Regression - R-squared (R2): {r_squared_knn}")

# Training Set Accuracy
training_accuracy_knn = knn_model.score(X_train, y_train) * 100
print("KNN Regression - Training Set Accuracy: {:.2f}%".format(training_accuracy_knn))

# Validation Set Accuracy
validation_accuracy_knn = knn_model.score(X_val, y_val) * 100
print("KNN Regression - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_knn))

# Test Set Accuracy
test_accuracy_knn = knn_model.score(X_test, y_test) * 100
print("KNN Regression - Test Set Accuracy: {:.2f}%".format(test_accuracy_knn))


KNN Regression - Mean Squared Error (MSE): 26.276243137254898
KNN Regression - R-squared (R2): 0.435139008293834
KNN Regression - Training Set Accuracy: 72.58%
KNN Regression - Validation Set Accuracy: 59.48%
KNN Regression - Test Set Accuracy: 43.51%


# DecisionTreeRegressor

In [35]:
from sklearn.tree import DecisionTreeRegressor

# Create and train the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict popularity on the test set
y_pred_dt = dt_model.predict(X_test)

# Model evaluation
mse_dt = mean_squared_error(y_test, y_pred_dt)
r_squared_dt = r2_score(y_test, y_pred_dt)

print(f"Decision Tree Regression - Mean Squared Error (MSE): {mse_dt}")
print(f"Decision Tree Regression - R-squared (R2): {r_squared_dt}")

# Training Set Accuracy
training_accuracy_dt = dt_model.score(X_train, y_train) * 100
print("Decision Tree Regression - Training Set Accuracy: {:.2f}%".format(training_accuracy_dt))

# Validation Set Accuracy
validation_accuracy_dt = dt_model.score(X_val, y_val) * 100
print("Decision Tree Regression - Validation Set Accuracy: {:.2f}%".format(validation_accuracy_dt))

# Test Set Accuracy
test_accuracy_dt = dt_model.score(X_test, y_test) * 100
print("Decision Tree Regression - Test Set Accuracy: {:.2f}%".format(test_accuracy_dt))


Decision Tree Regression - Mean Squared Error (MSE): 1.5788235294117654
Decision Tree Regression - R-squared (R2): 0.9660599949584069
Decision Tree Regression - Training Set Accuracy: 100.00%
Decision Tree Regression - Validation Set Accuracy: 94.46%
Decision Tree Regression - Test Set Accuracy: 96.61%
