In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing function to split sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate

# Importing regressors
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Importing metric
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import auc, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Importing Grid Search
from sklearn.model_selection import GridSearchCV


REGRESSION TASK: Teach with synthetically generated data and test on real data


In [115]:
# Loading dataset
df = pd.read_csv('/content/drive/MyDrive/sample_data/EstimationOfObesityLevels.csv')

In [116]:
# Creating set with only synthetical data
df_synth = df[~df['FAF'].isin([0, 1, 2, 3])]
df_synth = df_synth[df_synth['FCVC'].isin([1, 2, 3])]

# Clearing set with only real data
df = df[df['FAF'].isin([0, 1, 2, 3])]
df = df[df['FCVC'].isin([1, 2, 3])]

df_synth.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
498,Female,25.196214,1.686306,104.572712,yes,yes,3.0,3.0,Sometimes,no,1.152736,no,0.319156,1.0,Sometimes,Public_Transportation,Obesity_Type_III
499,Female,18.503343,1.683124,126.67378,yes,yes,3.0,3.0,Sometimes,no,1.115967,no,1.541072,1.0,Sometimes,Public_Transportation,Obesity_Type_III
501,Female,21.853826,1.755643,137.796884,yes,yes,3.0,3.0,Sometimes,no,2.184707,no,1.978631,0.838957,Sometimes,Public_Transportation,Obesity_Type_III
502,Female,21.90012,1.843419,165.057269,yes,yes,3.0,3.0,Sometimes,no,2.406541,no,0.10032,0.479221,Sometimes,Public_Transportation,Obesity_Type_III
503,Female,18.306615,1.7456,133.03441,yes,yes,3.0,3.0,Sometimes,no,2.984323,no,1.586525,0.62535,Sometimes,Public_Transportation,Obesity_Type_III


In [117]:
# Importing OriginalEncoder and OneHotEncoder to manage categorical and binary variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Identify categorical features
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']

# Using OrdinalEncoder for both datasets:
ordinal_encoder = OrdinalEncoder()
df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])
df_synth[categorical_features] = ordinal_encoder.fit_transform(df_synth[categorical_features])

df_synth.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
498,0.0,25.196214,1.686306,104.572712,1.0,1.0,3.0,3.0,1.0,0.0,1.152736,0.0,0.319156,1.0,1.0,1.0,3.0
499,0.0,18.503343,1.683124,126.67378,1.0,1.0,3.0,3.0,1.0,0.0,1.115967,0.0,1.541072,1.0,1.0,1.0,3.0
501,0.0,21.853826,1.755643,137.796884,1.0,1.0,3.0,3.0,1.0,0.0,2.184707,0.0,1.978631,0.838957,1.0,1.0,3.0
502,0.0,21.90012,1.843419,165.057269,1.0,1.0,3.0,3.0,1.0,0.0,2.406541,0.0,0.10032,0.479221,1.0,1.0,3.0
503,0.0,18.306615,1.7456,133.03441,1.0,1.0,3.0,3.0,1.0,0.0,2.984323,0.0,1.586525,0.62535,1.0,1.0,3.0


In [118]:
# PREPROCESSING

# Creating two sets with synthetical data
X_synth = df_synth.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y_synth = df_synth['FAF']

# Splitting synthetical data into train and validation set
X_train_synth, X_val_synth, y_train_synth, y_val_synth = train_test_split(X_synth, y_synth,
                                                                          test_size=0.8, random_state=42)

# Creating test set with real data
X = df.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y = df['FAF']

In [119]:
# Importing MLP
from sklearn.neural_network import MLPRegressor

# Creating model
reg = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=1000, solver='lbfgs')

# Training model
reg.fit(X_train_synth, y_train_synth)
reg.fit(X_val_synth, y_val_synth)

# Making train and validation predictions
y_pred_train = reg.predict(X_train_synth)
y_pred_val = reg.predict(X_val_synth)

# Calculating RMSE for testing and validation data
rmse_mlp_train = root_mean_squared_error(y_train_synth, y_pred_train)
rmse_mlp_val = root_mean_squared_error(y_val_synth, y_pred_val)
print(f'RMSE train with synthetical data: {rmse_mlp_train}')
print(f'\nRMSE validation with synthetical data: {rmse_mlp_val}')

# Making test predictions
y_pred_test = reg.predict(X)

# Calculating RMSE for test data
rmse_mlp_test = root_mean_squared_error(y, y_pred_test)
print(f'\nRMSE test with real data: {rmse_mlp_test}')

RMSE train with synthetical data: 0.7039924069799015

RMSE validation with synthetical data: 0.6585831336717046

RMSE test with real data: 1.105041998330938


REGRESSION TASK: MLP with chosen features VS MLP with all features


In [120]:
# Importing data
data = pd.read_csv('/content/drive/MyDrive/sample_data/EstimationOfObesityLevels.csv')
data.head(10)


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [121]:
"""
goal: Estimate how often do you have physical activity (FAF)

features: FAVC, FCVC, CAEC, CH2O, SCC, CALC, SMOKE, Weight
"""

'\ngoal: Estimate how often do you have physical activity (FAF)\n\nfeatures: FAVC, FCVC, CAEC, CH2O, SCC, CALC, SMOKE, Weight\n'

In [122]:
# Preprocessing
# Getting rid of synthetically generated data
data = data[data['FAF'].isin([0, 1, 2, 3])]
data = data[data['FCVC'].isin([1, 2, 3])]

# Importing OriginalEncoder and OneHotEncoder to manage categorical and binary variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Identify categorical features
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']

# Using OrdinalEncoder:
ordinal_encoder = OrdinalEncoder()
data[categorical_features] = ordinal_encoder.fit_transform(data[categorical_features])


In [123]:
# Categorical data convertion
# Write it in raport, linear progression of these converted data
"""
Gender: {'Female' : 0, 'Male' : 1}
family_history_with_overweight: {'no' : 0, 'yes' : 1}
FAVC: {'no' : 0, 'yes' : 1}
FCVC: {'never' : 0, 'sometimes' : 1, 'always' : 2}
CAEC: {'always' : 0, 'frequently' : 1, 'sometimes' : 2, 'no' : 3}
SMOKE: {'no' : 0, 'yes' : 1}
SCC: {'no' : 0, 'yes' : 1}
CALC: {'always' : 0, 'frequently' : 1, 'sometimes' : 2, 'no' : 3}
MTRANS: {'Public_Transportation' : 0, 'Walking' : 1, 'Automobile' : 2, 'Motorbike' : 3, 'Bike' : 4}
NObeyesdad: {'Insufficient_Weight' : 0, 'Normal_Weight' : 1, 'Obesity_Type_I' : 2, 'Obesity_Type_II' : 3,
             'Obesity_Type_III' : 4, 'Overweight_Level_I' : 5, 'Overweight_Level_II' : 6}
"""

"\nGender: {'Female' : 0, 'Male' : 1}\nfamily_history_with_overweight: {'no' : 0, 'yes' : 1}\nFAVC: {'no' : 0, 'yes' : 1}\nFCVC: {'never' : 0, 'sometimes' : 1, 'always' : 2}\nCAEC: {'always' : 0, 'frequently' : 1, 'sometimes' : 2, 'no' : 3}\nSMOKE: {'no' : 0, 'yes' : 1}\nSCC: {'no' : 0, 'yes' : 1}\nCALC: {'always' : 0, 'frequently' : 1, 'sometimes' : 2, 'no' : 3}\nMTRANS: {'Public_Transportation' : 0, 'Walking' : 1, 'Automobile' : 2, 'Motorbike' : 3, 'Bike' : 4}\nNObeyesdad: {'Insufficient_Weight' : 0, 'Normal_Weight' : 1, 'Obesity_Type_I' : 2, 'Obesity_Type_II' : 3,\n             'Obesity_Type_III' : 4, 'Overweight_Level_I' : 5, 'Overweight_Level_II' : 6}\n"

In [124]:
# PREPROCESSING

# MLP with all values
# Creating two sets
X_all = data.drop(columns=['FAF'], axis = 1)
y_all = data['FAF']

# Splitting dataset into train, validation and test sets
X_train_all, X_temp_all, y_train_all, y_temp_all = train_test_split(X_all, y_all, test_size=0.4, random_state=42)
X_val_all, X_test_all, y_val_all, y_test_all = train_test_split(X_temp_all, y_temp_all, test_size=0.5, random_state=42)


# MLP with selected values
# Creating two sets
X = data.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y = data['FAF']

# Splitting dataset into train, validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [125]:
# Importing MLP
from sklearn.neural_network import MLPRegressor

# Creating models
reg_all = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')
reg_mlp = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')

# Training models
reg_all.fit(X_train_all, y_train_all)
reg_mlp.fit(X_train, y_train)

# Making predictions
y_pred_train_all = reg_all.predict(X_train_all)
y_pred_all = reg_all.predict(X_val_all)

y_pred_train = reg_mlp.predict(X_train)
y_pred = reg_mlp.predict(X_val)

# Calculating RMSE for training set
rmse_mlp_all_train = root_mean_squared_error(y_train_all, y_pred_train_all)
rmse_mlp_train = root_mean_squared_error(y_train, y_pred_train)
print(f'RMSE train with all features: {rmse_mlp_all_train}')
print(f'\nRMSE train with selected features: {rmse_mlp_train}')

# Calculating RMSE for validation set
rmse_mlp_all = root_mean_squared_error(y_val_all, y_pred_all)
rmse_mlp = root_mean_squared_error(y_val, y_pred)
print(f'\nRMSE with all features: {rmse_mlp_all}')
print(f'\nRMSE with selected features: {rmse_mlp}')


RMSE train with all features: 0.8051454936102976

RMSE train with selected features: 0.9774950904173484

RMSE with all features: 0.9050638751386738

RMSE with selected features: 1.016344460756235


REGRESSION TASK: Comparing to MLP with selected values

*   Linear (and weighted linear) regresor
*   SVM (different kernels)
*   Random forest regressor

In [317]:
def predict(model_name, X_train, X_val, y_train, y_val, X_test=None, y_test=None, sample_weight=None, predict_on_test_set=False, calc_rmse=False, calc_r2=False, calc_mae=False, **kwargs):

  # Creating model
  model = model_name(**kwargs)

  if sample_weight is not None:
    # Training model
    model.fit(X_train, y_train, sample_weight=sample_weight)
    model.fit(X_val, y_val, sample_weight=sample_weight)
  else:
    # Training model
    model.fit(X_train, y_train)
    model.fit(X_val, y_val)

  # Making train and validation predictions
  y_pred_train = model.predict(X_train)
  y_pred_val = model.predict(X_val)

  if predict_on_test_set == True:
    # Retraining model
    model.fit(X_train, y_train)
    model.fit(X_val, y_val)

    # Making test predictions
    y_pred = model.predict(X_test)

  print(f"Calculations for {model}")

  # Calculating metrics
  if calc_rmse == True:
    rmse_train = root_mean_squared_error(y_train, y_pred_train)
    rmse_validation = root_mean_squared_error(y_val, y_pred_val)
    print(f'\nRMSE train: {rmse_train}')
    print(f'RMSE validation: {rmse_validation}')
    if predict_on_test_set == True:
      rmse_test = root_mean_squared_error(y_test, y_pred)
      print(f'RMSE test: {rmse_test}')

  if calc_r2 == True:
    r2_train = r2_score(y_train, y_pred_train)
    r2_validation = r2_score(y_val, y_pred_val)
    print(f'\nR2 train: {r2_train}')
    print(f'R2 validation: {r2_validation}')
    if predict_on_test_set == True:
      r2_test = r2_score(y_test, y_test)
      print(f'R2 test: {r2_test}')

  if calc_mae == True:
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_validation = mean_absolute_error(y_val, y_pred_val)
    print(f'\nMAE train: {mae_train}')
    print(f'MAE validation: {mae_validation}')
    if predict_on_test_set == True:
      mae_test = mean_absolute_error(y_test, y_pred)
      print(f'MAE test: {mae_test}')

  return model

In [305]:
# Linear regressor
lr = predict(model_name=LinearRegression,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   )

Calculations for LinearRegression()

RMSE train: 0.9298824298292969
RMSE validation: 0.9297430117074393


# CO JEST KURWA


In [318]:
# Weighted linear regressor

# Creating sample_weight
sample_weight = np.ones(len(y_train)) * 20
sample_weight[-2:] *= 30

# Creating model and predicting
lr_weighted = predict(model_name=LinearRegression,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   sample_weight=sample_weight,
                   calc_rmse=True,
                   )


ValueError: sample_weight.shape == (448,), expected (150,)!

In [307]:
# SVM with linear kernel
svm_linear = predict(model_name=SVR,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   kernel='linear')


Calculations for SVR(kernel='linear')

RMSE train: 0.9886586607684037
RMSE validation: 0.995411666176261


In [308]:
# SVM with rbf kernel
svm_rbf = predict(model_name=SVR,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   kernel='rbf')


Calculations for SVR()

RMSE train: 0.9432871293019063
RMSE validation: 0.9973582573077979


In [309]:
# SVM with poly kernel
svm_poly = predict(model_name=SVR,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   kernel='poly')


Calculations for SVR(kernel='poly')

RMSE train: 0.9732016172604416
RMSE validation: 1.0136818420570017


In [310]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   random_state=42,
                   n_estimators=100)

Calculations for RandomForestRegressor(random_state=42)

RMSE train: 0.9503945698264773
RMSE validation: 0.3526064806341273


Task 7.1 and 7.2

In [275]:
# Function to print RMSE
def print_scores(scores):
  print(f"Mean: {np.abs(np.mean(scores))}")

# Function to cross-validation
def cross_validation(model, X, y, cv=5, scoring='neg_root_mean_squared_error'):
  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return scores


In [276]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train, y_train, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.951829988989512

Regression scores for 5-fold:
Mean: 0.960207868728477

Regression scores for 10-fold:
Mean: 0.9485737803029316


Task 7.3

In [277]:
# R2 metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train, y_train, cv=iter, scoring='r2')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.06701700730346194

Regression scores for 5-fold:
Mean: 0.0681773135396921

Regression scores for 10-fold:
Mean: 0.0475893437951184


In [278]:
# Mean absolute error metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train, y_train, cv=iter, scoring='neg_mean_absolute_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.8036903765448967

Regression scores for 5-fold:
Mean: 0.7779166311490752

Regression scores for 10-fold:
Mean: 0.7652457101972072


Task 7.4

In [136]:
reg = MLPRegressor()
parameters = {'hidden_layer_sizes': [(5, 5), (5, 10), (10, 10)],
              'activation': ['relu', 'tanh'],
              'max_iter': [3000, 4000],
              'solver': [ 'lbfgs', 'sgd', 'adam'],
              }

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_) # to get the best parameters


{'activation': 'relu', 'hidden_layer_sizes': (10, 10), 'max_iter': 3000, 'solver': 'lbfgs'}


Task 7.5

In [279]:
# Training model with the tuned hypermarameters
reg = predict(model_name=MLPRegressor,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   X_test=X_test,
                   y_test=y_test,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   activation='relu',
                   hidden_layer_sizes=(10, 10),
                   max_iter=3000,
                   solver='lbfgs'
                   )


Calculations for MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=3000, solver='lbfgs')
RMSE train: 0.9298501040300226
RMSE validation: 1.1533733731517104
RMSE test: 1.0797236389024902


Task 7.8

In [280]:
# SVM with linear kernel
svm_linear = predict(model_name=SVR,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   kernel='linear')


Calculations for SVR(kernel='linear')
RMSE train: 0.9886586607684037
RMSE validation: 0.995411666176261


In [281]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train, y_train, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.9335681557339456

Regression scores for 5-fold:
Mean: 0.9331503627135043

Regression scores for 10-fold:
Mean: 0.9300462245948189


In [282]:
# R2 metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train, y_train, cv=iter, scoring='r2')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.07984500188710957

Regression scores for 5-fold:
Mean: 0.07088279298938287

Regression scores for 10-fold:
Mean: 0.06275061719255473


In [283]:
# Mean absolute error metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train, y_train, cv=iter, scoring='neg_mean_absolute_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.7152584982399744

Regression scores for 5-fold:
Mean: 0.7220087652392768

Regression scores for 10-fold:
Mean: 0.7219217683842056


In [219]:
reg = SVR()
parameters = [
              {'kernel':('linear', 'rbf',), 'tol':[1e-1, 1e-2, 1e-3, 1e-4], 'C':[0.01, 0.1, 0.5, 1, 1.5, 10]},
              {'kernel':['poly'], 'tol':[1e-1, 1e-2, 1e-3, 1e-4], 'C':[0.01, 0.1, 0.5, 1, 1.5, 10], 'degree':[2,3,4]}
              ]

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_) # to get the best parameters


{'C': 0.5, 'kernel': 'linear', 'tol': 0.1}


In [284]:
reg = predict(model_name=SVR,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   X_test=X_test,
                   y_test=y_test,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   kernel='linear',
                   tol=0.01,
                   C=0.5
                   )

Calculations for SVR(C=0.5, kernel='linear', tol=0.01)
RMSE train: 0.9653258383240502
RMSE validation: 1.0826251054750125
RMSE test: 1.075860003477112


In [285]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   calc_rmse=True,
                   random_state=42,
                   n_estimators=100)


Calculations for RandomForestRegressor(random_state=42)
RMSE train: 0.9503945698264773
RMSE validation: 0.3526064806341273


In [286]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(rfr, X_train, y_train, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.9211030333884676

Regression scores for 5-fold:
Mean: 0.8846540902118856

Regression scores for 10-fold:
Mean: 0.8634372085306932


In [287]:
reg = RandomForestRegressor()
parameters = {'n_estimators':[10, 100, 1000, 2000],
              'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              }

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_) # to get the best parameters

{'criterion': 'absolute_error', 'n_estimators': 100}


In [288]:
# Creating model
reg = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=42)

# Training model
reg.fit(X_train, y_train)
reg.fit(X_val, y_val)

# Making train and validation predictions
y_pred_train = reg.predict(X_train)
y_pred_val = reg.predict(X_val)
reg.fit(X_train, y_train)

# Predicting value on test set
y_pred = reg.predict(X_test)

# RMSE, R2, MAE score
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)


print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(f"MAE: {mae}")

RMSE: 0.9196432279241047
R2: 0.22595051659616006
MAE: 0.6838


In [312]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train,
                   X_val=X_val,
                   y_train=y_train,
                   y_val=y_val,
                   X_test=X_test,
                   y_test=y_test,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=100,
                   criterion='absolute_error')

Calculations for RandomForestRegressor(criterion='absolute_error', random_state=42)

RMSE train: 0.9556753642245437
RMSE validation: 0.373738322002619
RMSE test: 1.0230912308619728

R2 train: 0.04414588991093593
R2 validation: 0.8639241643574644
R2 test: 1.0

MAE train: 0.72453125
MAE validation: 0.2916666666666667
MAE test: 0.7886000000000001
