# Regression (Linear)

Credit: Kalvin

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Apply the "fivethirtyeight" matplotlib style and set the default figure size
# to (10, 6) for every plot created afterwards.
plt.style.use(style='fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)

# Data Preparation

In [None]:
# Load the dataset from "Calories.csv" (path is relative to the working directory).
data = pd.read_csv("Calories.csv")
# Bare expression: the notebook renders the DataFrame as the cell output.
data

In [None]:
# Move the "Calories" column to the last position: pop() removes it from the
# frame and re-assigning it appends it at the end. Later cells rely on this
# layout (data.columns[:-1] are the features, the last column is the target).
data_out = data.pop("Calories")
data["Calories"] = data_out

In [None]:
# Remove "User_ID" so it is not used as a feature
# (presumably a row identifier without predictive value; confirm against the dataset).
data = data.drop(columns = ["User_ID"])

In [None]:
# Show the remaining column labels.
data.columns

#

# EDA

In [None]:
# Summary statistics of the DataFrame's columns.
data.describe()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# dtype of every column; object-dtype columns are treated as categorical in later cells.
data.dtypes

In [None]:
# One histogram per feature; the last column (the target) is left out.
for feature in data.columns[:-1]:
    sns.histplot(x=feature, data=data)
    plt.show()

Peserta paling banyak berada pada rentang umur 20–30-an.
Tinggi dan berat badan memiliki distribusi yang baik.
Nilai Body_Temp paling banyak berada di kisaran 40–41.

In [None]:
# One box plot per feature to look for outliers.
# Object-dtype (text) columns are skipped, matching the dtype check used by the
# matplotlib scatter cell and the skew/kurtosis cell: a box plot needs a
# numeric variable to compute quartiles.
for column in data.columns[:-1] :
  if(data[column].dtype == "object") :
    continue
  sns.boxplot(data = data, x = column)
  plt.show()

Tidak ada outlier ekstrim, hanya outlier kecil.


In [None]:
# Scatter every feature against the target to inspect how it relates to the
# calories burned.
for column in data.columns[:-1] :
  sns.scatterplot(data = data, x = column, y = "Calories")
  plt.title(column)
  plt.xlabel(column)
  # The y-axis carries the "Calories" column, not a frequency count.
  plt.ylabel("Calories")
  plt.show()

In [None]:
# Plain matplotlib scatter of each non-object feature against "Calories".
for feature in data.columns[:-1]:
    if data[feature].dtypes == "object":
        # Text columns are not plotted here.
        continue
    plt.scatter(x=data[feature], y=data["Calories"])
    plt.show()

Bisa dilihat, semakin lama durasi olahraganya, maka kalori yang terbakar juga semakin banyak.
Semakin tinggi heart_rate-nya, maka kalori yang terbakar juga semakin banyak.
Semakin naik temperatur tubuh, maka kalori yang dibakar semakin banyak.


In [None]:
# Print skewness and kurtosis of every non-object feature as a rough
# normality check.
for feature in data.columns[:-1]:
    if data[feature].dtype == "object":
        continue
    print(f"column : {feature}, skew : {data[feature].skew()}, kurt : {data[feature].kurt()}")

Kolom terdistribusi normal : tidak ada.
Kolom tidak terdistribusi normal : semua.
(Berarti, missing value-nya ditangani menggunakan .median().)

# Feature Engineering

In [None]:
# Percentage of missing entries per column: NaN count divided by the number
# of rows, times 100.
missing_percentage = data.isna().sum().div(len(data)).mul(100)
missing_percentage

In [None]:
# Discard every column in which more than half of the values are missing.
data = data.drop(
    columns=[name for name, percent in missing_percentage.items() if percent > 50.0]
)

In [None]:
# Fill the remaining gaps: median for non-object columns, most frequent value
# (first mode) for object columns. Columns without gaps are left alone.
for name in data.columns:
    if not data[name].isna().any():
        continue
    if data[name].dtype == "object":
        filler = data[name].mode()[0]
    else:
        filler = data[name].median()
    data[name] = data[name].fillna(filler)

In [None]:
# Count of missing values per column after imputation.
data.isna().sum()

In [None]:
# Column labels that survived the missing-value handling.
data.columns

In [None]:
# Replace every object-dtype column with integer codes. The same encoder
# instance is refitted per column, so the code -> original label mapping is
# saved in `encoded` right after each fit.
encoder = LabelEncoder()
encoded = {}
for name in [label for label in data.columns if data[label].dtype == "object"]:
    data[name] = encoder.fit_transform(data[name])
    encoded[name] = dict(enumerate(encoder.classes_))

In [None]:
# Show the stored code -> label mapping of every encoded column.
for column_name, mapping in encoded.items():
    print(f"key : {column_name}, value : {mapping}")

In [None]:
# Features are all columns but the last one; the last column is the target.
x = data.loc[:, data.columns[:-1]]
y = data.loc[:, data.columns[-1]]
print(x)
print(y)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible, so the correlation filter and every metric computed on
# these splits stay the same between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
def correlation(data_df, threshold):
    """Collect columns that are strongly correlated with an earlier column.

    The absolute pairwise correlation from ``data_df.corr()`` is compared
    against ``threshold`` for every pair of columns. Whenever a pair exceeds
    it, the column that comes later in the correlation matrix is collected.

    Args:
        data_df: DataFrame whose columns are compared pairwise.
        threshold: Absolute correlation above which the later column of a
            pair is flagged.

    Returns:
        A set holding the labels of the flagged columns.
    """
    strength = data_df.corr().abs()
    return {
        label
        for position, label in enumerate(strength.columns)
        # Entries above the diagonal in this matrix column are the
        # correlations with all earlier columns.
        if (strength.iloc[:position, position] > threshold).any()
    }

In [None]:
# Features whose absolute correlation with an earlier feature exceeds 0.9,
# determined on the training split only.
corr_col = correlation(x_train, 0.9)
print(corr_col)

In [None]:
# Remove the flagged features from both splits so train and test keep the
# same columns. `columns=` already names the axis, so no `axis` is passed.
x_train = x_train.drop(columns=corr_col)
x_test = x_test.drop(columns=corr_col)

In [None]:
# x_train = x_train.to_numpy()
# x_test = x_test.to_numpy()

In [None]:
# Baseline: fit on the features as they are. No scaler has been applied at
# this point; the scaled variants are trained in later cells.
model = LinearRegression() # LinearRegression on the unscaled data
# fit() returns the fitted estimator, so `history` refers to the same object as `model`.
history = model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# Metrics are in the original units of the target column.
print(f"RMSE : {root_mean_squared_error(y_test, y_pred)}, MSE : {mean_squared_error(y_test, y_pred)}, R2_SCORE : {r2_score(y_test, y_pred)}")

In [None]:
# Turn the targets into column arrays of shape (n_samples, 1); the scalers
# fitted on them afterwards work on 2-D input.
# Converting to an ndarray first avoids calling np.reshape on a pandas Series,
# which hands the 2-D result back to the Series' own array wrapping and may
# fail depending on the numpy/pandas versions. It also keeps the cell safe to
# re-run once the targets are already arrays.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Separate scaler instances for features and target, one pair per scaling method.
scaler_x_minmax = MinMaxScaler()
scaler_y_minmax = MinMaxScaler()
scaler_x_standard = StandardScaler()
scaler_y_standard = StandardScaler()

In [None]:
def scaling(x1, x2, y1, y2, scaler_x, scaler_y):
    """Scale a train/test pair of features and a train/test pair of targets.

    Each scaler is fitted on the first ("train") array of its pair only and
    then applied to the second ("test") array, so nothing is learned from the
    second array.

    Args:
        x1: Feature data used to fit ``scaler_x``.
        x2: Feature data transformed with the fitted ``scaler_x``.
        y1: Target data used to fit ``scaler_y``.
        y2: Target data transformed with the fitted ``scaler_y``.
        scaler_x: Object providing ``fit_transform`` and ``transform``.
        scaler_y: Object providing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(x1_scaled, x2_scaled, y1_scaled, y2_scaled, scaler_x, scaler_y)``.
    """
    scaled = []
    # Features first, then targets: same call order on the scalers as before.
    for fitter, fit_part, apply_part in ((scaler_x, x1, x2), (scaler_y, y1, y2)):
        scaled.append(fitter.fit_transform(fit_part))
        scaled.append(fitter.transform(apply_part))
    return (*scaled, scaler_x, scaler_y)

In [None]:
# Min-max scaled versions of the splits (suffix 1). scaling() fits the
# scalers on the training arrays and only transforms the test arrays.
x_train1, x_test1, y_train1, y_test1, scaler_x_minmax, scaler_y_minmax = scaling(x_train, x_test, y_train, y_test, scaler_x_minmax, scaler_y_minmax)

In [None]:
# Standardized versions of the splits (suffix 2). scaling() fits the scalers
# on the training arrays and only transforms the test arrays.
x_train2, x_test2, y_train2, y_test2, scaler_x_standard, scaler_y_standard = scaling(x_train, x_test, y_train, y_test, scaler_x_standard, scaler_y_standard)

In [None]:
# NOTE: this rebinds `model`, `history` and `y_pred` from the unscaled cell above.
model = LinearRegression() # LinearRegression with MinMaxScaler()
history = model.fit(x_train1, y_train1)
y_pred = model.predict(x_test1)
# y_test1 and y_pred are in min-max scaled target units, so RMSE and MSE are
# not directly comparable with metrics computed on the unscaled target.
print(f"RMSE : {root_mean_squared_error(y_test1, y_pred)}, MSE : {mean_squared_error(y_test1, y_pred)}, R2_SCORE : {r2_score(y_test1, y_pred)}")

In [None]:
model2 = LinearRegression() # LinearRegression with StandardScaler()
history2 = model2.fit(x_train2, y_train2)
y_pred2 = model2.predict(x_test2)
# y_test2 and y_pred2 are in standardized target units, so RMSE and MSE are
# not directly comparable with metrics computed on the unscaled target.
print(f"RMSE : {root_mean_squared_error(y_test2, y_pred2)}, MSE : {mean_squared_error(y_test2, y_pred2)}, R2_SCORE : {r2_score(y_test2, y_pred2)}")

In [None]:
# Min-max scaled test features and the current `y_pred`
# (the MinMaxScaler model's predictions if the cells ran in order).
print (x_test1)
print (y_pred)

# Standardized test features and the StandardScaler model's predictions.
print (x_test2)
print (y_pred2)