In [None]:
# 1. Import the necessary libraries.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# 2. Load the `customer_value_analysis.csv` into the variable `customer_df`.
# Forward slashes work on every OS; the previous backslash form depended on the
# invalid escape sequence `\c` being passed through and only resolved on Windows.
customer_df = pd.read_csv('files_for_lab/customer_value_analysis.csv')

In [None]:
# 3. First look at its main features (`head`, `shape`, `info`).
display(customer_df.head())
display(customer_df.shape)
# info() prints its report itself and returns None, so it must not be wrapped in
# display() (that would render an extra "None" below the report).
customer_df.info()

In [None]:
# 4. Rename the columns so they follow _PEP8_ (snake case: lowercase_with_underscores).
def col_rename (dframe: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise the column labels of a dataframe to lower case with underscores.

    String labels are lower-cased and their spaces replaced by underscores;
    non-string labels are kept untouched. If a label equal to 'st' results,
    its first occurrence is renamed to 'state'.

    Inputs: dframe of type pandas dataframe (its columns are modified in place)
    Outputs: returns the same dataframe object with the renamed columns
    """
    renamed = [
        label.lower().replace(' ', '_') if isinstance(label, str) else label
        for label in dframe.columns
    ]

    # Special case: expand the abbreviated 'st' label (first match only).
    if 'st' in renamed:
        renamed[renamed.index('st')] = 'state'

    dframe.columns = renamed
    return dframe

# col_rename relabels the frame in place and hands the same object back, so the
# manual fix for the one fused label ('employmentstatus') can be chained onto it.
customer_df = col_rename(customer_df).rename(columns={'employmentstatus': 'employment_status'})

customer_df

In [None]:
# 5. Change the type of `effective_to_date` column to DateTime format.
# Parse the date column so the `.dt` accessor can be used on it in later cells.
# NOTE(review): the date format is inferred by pandas; pass `format=` if the source format is known.
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'])
# Re-print the schema to check the new dtype of the column.
customer_df.info()

In [None]:
# 6. Check `NaN` values per column.
# Number of missing values per column; the author observed no NaN values in the dataframe.
customer_df.isna().sum()

In [None]:
# 7. Define a function that given an input dataframe, returns two dataframes: one with numerical columns and another with categorical columns of the input dataframe.
def split_df(df: pd.DataFrame):
    """
    Split a dataframe by dtype.

    Inputs: df of type pandas dataframe
    Outputs: a tuple (numerical, categorical) where the first frame holds the
             columns of numeric dtype and the second the columns of object dtype.
             Columns of any other dtype appear in neither frame.
    """
    return df.select_dtypes(include='number'), df.select_dtypes(include='object')

# Split the renamed customer frame once and show both halves.
numerical_df, categorical_df = split_df(customer_df)
display(numerical_df, categorical_df)

In [None]:
# 8. Drop any ID column.
# Drop the ID column. Rebinding instead of mutating in place, together with
# errors='ignore', lets this cell be re-run without raising a KeyError once the
# column is already gone.
categorical_df = categorical_df.drop(columns='customer', errors='ignore')
display(categorical_df)

In [None]:
# 9. Get the correlation matrix for the numerical variables. What is the pair of numerical variables that have the highest correlation? It makes sense, why?
# Pairwise correlation of the numeric columns, shown as a table and as a heatmap.
correlation_matrix = numerical_df.corr()
display(correlation_matrix)

ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
ax.set_title("Correlation Matrix Heatmap")
plt.show()

# Author's reading: the most strongly correlated pair is monthly_premium_auto and total_claim_amount.
# Author's explanation: the more a customer pays for auto insurance, the more likely that customer is to file claims.

In [None]:
# 10. Define a function that takes a pandas DataFrame as an input and returns two pandas DataFrames: the first containing numerical continuous columns and the second containing numerical discrete columns of the input dataframe. To this end, it might be helpful to count the number of unique values. The function **must have an optional argument set by default to 36** to discriminate between continuous and discrete columns. Then, use it to create two new dataframes: continuous_df and discrete_df. 
def split_continuous_discrete(df: pd.DataFrame, threshhold=36):
    """
    Split the columns of a dataframe by their number of distinct values.

    Inputs: df of type pandas dataframe
            threshhold: a column with more distinct values than this is
                        considered continuous, otherwise discrete (default 36)
    Outputs: a tuple (continuous, discrete) of dataframes, each keeping the
             original column order.
    Side effect: prints every column name with its distinct-value count.
    """
    # Keyed by the outcome of "count > threshhold": True -> continuous, False -> discrete.
    buckets = {True: [], False: []}
    for name in df.columns:
        distinct = df[name].nunique()
        print(name, distinct)
        buckets[bool(distinct > threshhold)].append(name)

    return df[buckets[True]], df[buckets[False]]

# Partition the numeric columns using the default threshold.
continuous_df, discrete_df = split_continuous_discrete(numerical_df)
display(continuous_df, discrete_df)

In [None]:
# 11. Create a function to create a barplot for all the columns of the discrete_df using seaborn, and set the figuresize = (16,16). 
def plot_bar(df: pd.DataFrame, figsize=(16, 16)):
    """
    Draw one seaborn count plot per column of the dataframe.

    Inputs: df of type pandas dataframe
            figsize: size of each figure (default (16, 16))
    Outputs: none; each figure is shown as it is produced
    """
    for column_name in df.columns:
        fig, ax = plt.subplots(figsize=figsize)
        sns.countplot(x=column_name, data=df, ax=ax)
        ax.set_xlabel(column_name)
        ax.set_ylabel('Count')
        ax.set_title(f'Bar Plot for {column_name}')
        plt.show()

plot_bar(discrete_df)

In [None]:
# 12. Create a function to create a histogram for all the columns of the continuous_df using seaborn, and set the figuresize = (16,16)
def plot_hist(df: pd.DataFrame, figsize=(16, 16)):
    """
    Draw one seaborn histogram per column of the dataframe.

    Inputs: df of type pandas dataframe
            figsize: size of each figure (default (16, 16))
    Outputs: none; each figure is shown as it is produced
    """
    for column_name in df.columns:
        fig, ax = plt.subplots(figsize=figsize)
        sns.histplot(df[column_name], bins='auto', ax=ax)
        ax.set_xlabel(column_name)
        ax.set_ylabel('Frequency')
        ax.set_title(f'Histogram for {column_name}')
        plt.show()

plot_hist(continuous_df)

In [None]:
# 13. According to the previous histogram plots, do you think that you will have to apply any transformation?
# Yes, as the histograms are skewed.

In [None]:
# 14. Look for outliers in the continuous variables that you have found. Hint: There was a good plot to do that. Define a function to create this kind of plot for the continuous_df.
def plot_box(df: pd.DataFrame, figsize=(8,4)):
    """
    Draw one horizontal seaborn box plot per column to reveal outliers.

    Inputs: df of type pandas dataframe
            figsize: size of each figure (default (8, 4))
    Outputs: none; each figure is shown as it is produced
    """
    for column_name in df.columns:
        fig, ax = plt.subplots(figsize=figsize)
        sns.boxplot(x=df[column_name], ax=ax)
        ax.set_xlabel(column_name)
        ax.set_ylabel('Values')
        ax.set_title(f'Box Plot for Outliers of {column_name}')
        plt.show()

plot_box(continuous_df)

# Lab Cleaning Categorical Data

1) Define a function that given a pandas DataFrame as input creates a **seaborn countplot** of each categorical column. Make sure to sort the bars by frequency ie: the most frequent values should be placed first. Hint: use .value_counts(). In addition, if the amount of unique values of a categorical column (cardinality) is six or more, the corresponding countplot should have the bars placed on the y-axis instead of the x-axis.

In [None]:
def create_countplot (df: pd.DataFrame, figsize=(8,4)):
    """
    Draw one seaborn count plot per column, bars sorted by descending frequency.

    Columns with six or more distinct values are drawn horizontally (categories
    on the y-axis); the others vertically. The axis labels follow the
    orientation, so the 'Count' label always sits on the count axis.

    Inputs: df of type pandas dataframe whose columns are all to be plotted
            figsize: size of each figure (default (8, 4))
    Outputs: none; each figure is shown as it is produced
    """
    for col in df.columns:
        plt.figure(figsize=figsize)
        # value_counts() is sorted by descending frequency, so its index is the bar order.
        sorted_values = df[col].value_counts().index
        if df[col].nunique() >= 6:
            sns.countplot(y=col, data=df, order=sorted_values)
            plt.xlabel('Count')
            plt.ylabel(col)
        else:
            sns.countplot(x=col, data=df, order=sorted_values)
            plt.xlabel(col)
            plt.ylabel('Count')
        plt.title(f'Count Plot for {col}')
        plt.show()

create_countplot(categorical_df)

2. `policy_type` and `policy` columns are redundant, and what's worse `policy` column has a lot of possible unique values (high cardinality) which will be problematic when they will be dummified with an OneHotEncoder because we will increase a lot the number of columns in the dataframe. Drop the column `policy_type` and transform the column `policy` to three possible values: L1, L2, and L3 using a function.

In [None]:
# `policy_type` duplicates information carried by `policy`. Rebinding with
# errors='ignore' keeps this cell re-runnable after the column has been removed.
categorical_df = categorical_df.drop(columns='policy_type', errors='ignore')

def clean_policy(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce the `policy` column to its level code.

    Any value containing 'L1', 'L2' or 'L3' is replaced by that code alone;
    values matching none of the patterns are left unchanged.

    Inputs: df of type pandas dataframe with a `policy` column (modified in place)
    Outputs: returns the same dataframe object
    """
    level_patterns = {
        r'.*L1.*': 'L1',
        r'.*L2.*': 'L2',
        r'.*L3.*': 'L3',
    }
    collapsed = df['policy'].replace(level_patterns, regex=True)
    df['policy'] = collapsed
    return df

In [None]:
# Collapse `policy` to its level code, preview the frame, then inspect the
# resulting label counts (NaN included) as a sanity check.
categorical_df=clean_policy(categorical_df)
display(categorical_df.head())
categorical_df['policy'].value_counts(dropna=False)

3. Time dependency analysis. Use a seaborn line plot using the column `effective_to_date` to see if `total_claim_amount` is bigger at some specific dates. Use a figsize=(10,10)

In [None]:
# Put the rows in chronological order before plotting the claim amount over time.
customer_df = customer_df.sort_values(by='effective_to_date')

plt.figure(figsize=(10, 10))
ax = sns.lineplot(x='effective_to_date', y='total_claim_amount', data=customer_df)
plt.xticks(rotation=45)
ax.set(
    title='Time Dependency Analysis - Total Claim Amount against Effective to Date',
    xlabel='Effective To Date',
    ylabel='Total Claim Amount',
)
plt.show()

4. To continue the analysis define an empty pandas DataFrame, and add the following new columns:
* `day` with the day number of `effective_to_date`
* `day_name` with the day NAME of `effective_to_date`
* `week` with the week of `effective_to_date`
* `month` with the month NAME of `effective_to_date`
* `total_claim_amount` with `total_claim_amount`

In [None]:
# Calendar features derived from the policy date, plus the target for aggregation.
# All pieces share customer_df's index, so the rows stay aligned with it.
effective_dates = customer_df['effective_to_date']
new_df = pd.DataFrame({
    'day': effective_dates.dt.day,
    'day_name': effective_dates.dt.day_name(),
    'week': effective_dates.dt.isocalendar().week,
    'month': effective_dates.dt.month_name(),
    'total_claim_amount': customer_df['total_claim_amount'].copy(),
})
display(new_df)

5. Compute the total `target` column aggregated by `day_name`, rounded to two decimals, and then reorder the index of the resulting pandas series using `.reindex(index=list_of_correct_days)`

In [None]:
list_of_correct_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Sum of the target per weekday, rounded, put in calendar order and flattened
# into a two-column frame (day_name, target).
target_by_day = (
    new_df.pivot_table(values='total_claim_amount', index='day_name', aggfunc='sum')
    .round(2)
    .reindex(index=list_of_correct_days)
    .rename(columns={'total_claim_amount': 'target'})
    .reset_index()
)
display(target_by_day)

In [None]:
# 6. Use a seaborn line plot to plot the previous series. Do you see some differences by day of the week?
plt.figure(figsize=(10, 10))
ax = sns.lineplot(x='day_name', y='target', data=target_by_day)
ax.set(title='Total Claim Amount  by Day Name', xlabel='Day of the Week', ylabel='Target')
plt.show()
# Author's reading of the plot: most claims are made on Monday, the first day of the week,
# followed by the weekend.

7. Get the total number of claims by day of the week name and then reorder the index of the resulting pandas series using `.reindex(index=list_of_correct_values)`

In [None]:
# seems to me the same requirement as the previous question.

In [None]:
# 8. Get the median "target" by day of the week name and then sort the resulting values in descending order using .sort_values()
# Median of the target per weekday, rounded, sorted from highest to lowest and
# flattened into a two-column frame (day_name, median_target).
median_by_day = (
    new_df.pivot_table(values='total_claim_amount', index='day_name', aggfunc='median')
    .round(2)
    .sort_values(by='total_claim_amount', ascending=False)
    .rename(columns={'total_claim_amount': 'median_target'})
    .reset_index()
)
display(median_by_day)

In [None]:
# 9. Plot the median "target" by day of the week name using a seaborn barplot
plt.figure(figsize=(8,6))
ax = sns.barplot(x='day_name', y='median_target', data=median_by_day, palette='viridis')
ax.set(xlabel='Day of the Week', ylabel='Median Target', title='Median Target by day of the Week')
plt.show()

In [None]:
# 10. What can you conclude from this analysis?
# The median value is similar for all the days of the week

In [None]:
# 11. Compute the total `target` column aggregated `month` rounded to two decimals and then reorder the index of the resulting pandas series using .reindex(index=list_of_correct_values)
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# month counts have to be displayed explicitly to be seen.
display(new_df['month'].value_counts(dropna=False))  # we only have January and February

# Sum of the target per month, rounded, in calendar order, flattened into a
# two-column frame (month, target).
target_by_month = pd.pivot_table(new_df, values='total_claim_amount', index='month', aggfunc='sum').round(2)
list_of_correct_months = ['January', 'February']
target_by_month = target_by_month.reindex(index=list_of_correct_months)
target_by_month.columns = ['target']
target_by_month = target_by_month.reset_index()
display(target_by_month)

In [None]:
# 12. Can you do a monthly analysis given the output of the previous series? Why?
# No, as the output only shows the total target and does not give any insight into the data

In [None]:
# 13. Define a function to remove the outliers of a numerical continuous column depending if a value is bigger or smaller than a given amount of standard deviations of the mean (thr=3)
def remove_outliers (df: pd.DataFrame, thr=3) ->pd.DataFrame:
    """
    Drop the rows holding a value further than `thr` standard deviations from
    the mean of its column.

    Columns are processed one after another, left to right, and the mean and
    standard deviation of each column are computed on the rows that survived
    the previous columns, so the result depends on the column order. Rows with
    a missing value in a column fail both comparisons and are dropped too.

    Inputs: df of type pandas dataframe with numerical columns
            thr: allowed distance from the mean, in standard deviations (default 3)
    Outputs: returns the filtered dataframe (the input object is not modified)
    """
    trimmed = df
    for name in df.columns:
        values = trimmed[name]
        center = values.mean()
        spread = values.std()
        lower_bound = center - thr * spread
        upper_bound = center + thr * spread
        keep = (values >= lower_bound) & (values <= upper_bound)
        trimmed = trimmed[keep]
    return trimmed


In [None]:
# 14. Use the previous function to remove the outliers of continuous data and to generate a continuous_clean_df.
# Filter a copy so that continuous_df itself keeps every row.
continuous_clean_df = remove_outliers(continuous_df.copy())
display(continuous_clean_df)

In [None]:
# 15. Concatenate the `continuous_cleaned_df`, `discrete_df`, `categorical_df`, and the relevant column of `time_df`. After removing outliers the continuous_cleaned dataframe will have fewer rows (when you concat the individual dataframes using `pd.concat()`) the resulting dataframe will have NaN's because of the different sizes of each dataframe. Use `pd.dropna()` and `.reset_index()` to fix the final dataframe.
# From the time features keep only the columns left after dropping the ones below.
time_df = new_df.drop(['day_name', 'week', 'total_claim_amount'], axis=1)

# The frames are aligned on their index. Rows removed as outliers are missing from
# continuous_clean_df, so they come out with NaN in its columns.
concatenated_df = pd.concat([categorical_df, discrete_df, continuous_clean_df, time_df], axis=1)

# Drop those incomplete rows and renumber. reset_index() returns a new frame, so
# its result has to be assigned; drop=True avoids adding the old index as a column.
concatenated_df = concatenated_df.dropna().reset_index(drop=True)
print(concatenated_df.isna().sum())
concatenated_df

In [None]:
# 16. Reorder the columns of the dataframe to place 'total_claim_amount' as the last column.
# Every column except the target, in its current order, followed by the target.
columns_order = concatenated_df.columns[concatenated_df.columns != 'total_claim_amount'].tolist()
new_column_order = [*columns_order, 'total_claim_amount']
concatenated_df = concatenated_df.loc[:, new_column_order]
display(concatenated_df)

In [None]:
# 17. Turn the `response` column values into (Yes=1/No=0).
# The explicit cast makes the column numeric regardless of whether the installed
# pandas still downcasts the result of replace() automatically (that behaviour is
# deprecated). It also fails loudly if an unmapped label is left in the column.
concatenated_df['response'] = concatenated_df['response'].replace({'Yes':1, 'No':0}).astype(int)
display(concatenated_df.tail())

In [None]:
#18. Reduce the class imbalance in `education` by grouping together ["Master","Doctor"] into "Graduate" while keeping the other possible values as they are. In this way, you will reduce a bit the class imbalance at the price of losing a level of detail.
# Assign the result back: calling replace(..., inplace=True) on the selected column
# is chained assignment, which does not update the dataframe under Copy-on-Write.
concatenated_df['education'] = concatenated_df['education'].replace({'Master': 'Graduate', 'Doctor': 'Graduate'})
display(concatenated_df)

In [None]:
# 19. Reduce the class imbalance of the `employmentstatus` column grouping together ["Medical Leave", "Disabled", "Retired"] into "Inactive" while keeping the other possible values as they are. In this way, you will reduce a bit the class imbalance at the price of losing a level of detail.
# Merge the three listed statuses into a single 'Inactive' label.
values_to_replace = ['Medical Leave', 'Disabled' , 'Retired']
concatenated_df['employment_status'] = concatenated_df['employment_status'].replace(
    dict.fromkeys(values_to_replace, 'Inactive')
)
display(concatenated_df)

In [None]:
# 20. Deal with column `Gender` turning the values into (1/0).
# The explicit cast makes the column numeric regardless of whether the installed
# pandas still downcasts the result of replace() automatically (deprecated).
concatenated_df['gender'] = concatenated_df['gender'].replace({'F':1, 'M':0}).astype(int)
display(concatenated_df)

In [None]:
# 21. Now, deal with `vehicle_class` grouping together "Sports Car", "Luxury SUV", and "Luxury Car" into a common group called `Luxury` leaving the other values as they are. In this way, you will reduce a bit the class imbalance at the price of losing a level of detail.
# Merge the three listed classes into a single 'Luxury' label.
values_to_replace = ['Sports Car', 'Luxury SUV', 'Luxury Car']
concatenated_df['vehicle_class'] = concatenated_df['vehicle_class'].replace(
    dict.fromkeys(values_to_replace, 'Luxury')
)
display(concatenated_df)

In [None]:
# 22. Now it's time to deal with the **categorical ordinal columns**, assigning a numerical value to each unique value respecting the `implicit ordering`. Encode the coverage: "Premium" > "Extended" > "Basic".
# Higher number = higher rank in the ordering. The explicit cast keeps the column
# numeric regardless of whether the installed pandas still downcasts the result of
# replace() automatically (deprecated).
numerical_values = {"Basic": 1, "Extended": 2, "Premium": 3}
concatenated_df['coverage'] = concatenated_df['coverage'].replace(numerical_values).astype(int)
display(concatenated_df)

In [None]:
# 23. Encode the column `employmentstatus` as: "Employed" > "Inactive" > "Unemployed".
# Higher number = higher rank in the ordering. The explicit cast keeps the column
# numeric regardless of whether the installed pandas still downcasts the result of
# replace() automatically (deprecated).
numerical_values = {"Unemployed": 1, "Inactive": 2, "Employed": 3}
concatenated_df['employment_status'] = concatenated_df['employment_status'].replace(numerical_values).astype(int)
display(concatenated_df)

In [None]:
# 24. Encode the column `location_code` as: "Urban" > "Suburban" > "Rural".
# Higher number = higher rank in the ordering. The explicit cast keeps the column
# numeric regardless of whether the installed pandas still downcasts the result of
# replace() automatically (deprecated).
numerical_values = {"Rural": 1, "Suburban": 2, "Urban": 3}
concatenated_df['location_code'] = concatenated_df['location_code'].replace(numerical_values).astype(int)
display(concatenated_df)

In [None]:
# 25. Encode the column `vehicle_size` as: "Large" > "Medsize" > "Small".
# Higher number = higher rank in the ordering. The explicit cast keeps the column
# numeric regardless of whether the installed pandas still downcasts the result of
# replace() automatically (deprecated).
numerical_values = {"Small": 1, "Medsize": 2, "Large": 3}
concatenated_df['vehicle_size'] = concatenated_df['vehicle_size'].replace(numerical_values).astype(int)
display(concatenated_df['vehicle_size'].value_counts())

In [None]:
# 26. Get a dataframe with the **categorical nominal columns**
# The columns still of object dtype are taken as the nominal ones, minus `education`.
# NOTE(review): the original note justified dropping `education` "as it is nominal and
# not ordinal", which reads inverted; presumably it is left out because it is ordinal -- confirm.
categorical_nominal_df = concatenated_df.select_dtypes(include='object').drop(columns='education')
display(categorical_nominal_df)

In [None]:
#27. Create a list named `levels` that has as many elements as categorical nominal columns. Each element must be another list with all the possible unique values of the corresponding categorical nominal column.
# One list of distinct values per nominal column, in column order and in order of
# first appearance within each column.
levels = [categorical_nominal_df[col].unique().tolist() for col in categorical_nominal_df]
display(levels)

In [None]:
# 28. Instantiate an [sklearn OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) with drop set to `first` and categories to `levels`
from sklearn.preprocessing import OneHotEncoder

# Pass the explicit category lists built in `levels`, as the task requires. With
# drop='first' the category dropped for each column is the first entry of its list.
encoder = OneHotEncoder(drop='first', categories=levels)
encoder.fit(categorical_nominal_df)

# transform() returns a sparse matrix; densify it and restore column names and index.
categorical_nom_array = encoder.transform(categorical_nominal_df).toarray()
categorical_nominal_encoded = pd.DataFrame(categorical_nom_array, columns=encoder.get_feature_names_out(),
                           index=categorical_nominal_df.index)
display(categorical_nominal_encoded)

# Lab Comparing Regression Models

In [None]:
# Preview the cleaned frame that the modelling cells below start from.
concatenated_df.head()

In [None]:
# Define X and y
# Target: the claim amount. Features: every other column.
y = concatenated_df['total_claim_amount']
X = concatenated_df.drop(columns='total_claim_amount')

In [None]:
# Import sklearn train_test_split and separate the data. Set test_size=0.30 and random_state=31
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=31)

In [None]:
# Separate X_train and X_test into numerical and categorical (X_train_cat , X_train_num , X_test_cat , X_test_num)
# Split each feature set by dtype; split_df returns the pair (numerical, categorical).
X_train_num, X_train_cat = split_df(X_train)
X_test_num, X_test_cat = split_df(X_test)

In [None]:
# Encode the categorical variables X_train_cat and X_test_cat using the OneHotEncoder setup in the previous lab. 
# Remember to use .toarray() after .transform() to endup with a numpy array. Next, cast the resulting numpy arrays into pandas DataFrames. 
# Make sure that the column names of the new dataframes are correctly setup using encoder.get_feature_names_out() and the same indexes of X_train_cat and X_test_cat

from sklearn.preprocessing import OneHotEncoder

# Learn the categories on the training split only, then encode both splits with
# that same fitted encoder.
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(X_train_cat)
encoded_columns = encoder.get_feature_names_out()

X_train_cat_encoded_arr = encoder.transform(X_train_cat).toarray()
X_test_cat_encoded_arr = encoder.transform(X_test_cat).toarray()

# Back to dataframes, keeping the original row index of each split.
X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded_arr, columns=encoded_columns, index=X_train_cat.index)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded_arr, columns=encoded_columns, index=X_test_cat.index)

In [None]:
# Use X_train_num to fit a power transformer. Transform BOTH X_train_num and X_test_num. 
# Next, cast the resulting numpy arrays as pandas dataframes. Make sure to set the correct columns names and to use the same indexes of X_train_num and X_test_num. 
# Name the final resulting dataframes as: X_train_num_transformed_df and X_test_num_transformed_df

from sklearn.preprocessing import PowerTransformer

# Fit on the training split only, then apply the same fitted transformer to both splits.
power_transformer = PowerTransformer().fit(X_train_num)

X_train_num_transformed_arr = power_transformer.transform(X_train_num)
X_test_num_transformed_arr = power_transformer.transform(X_test_num)

# Back to dataframes, keeping the column names and row index of each split.
X_train_num_transformed_df = pd.DataFrame(
    X_train_num_transformed_arr, columns=X_train_num.columns, index=X_train_num.index
)
X_test_num_transformed_df = pd.DataFrame(
    X_test_num_transformed_arr, columns=X_test_num.columns, index=X_test_num.index
)

In [None]:
# Concat X_train_num_transformed_df and X_train_cat_encoded_df into X_train_new 
# and X_test_num_transformed_df and X_test_cat_encoded_df into X_test_new

# Put the transformed numerical and encoded categorical features side by side;
# both parts of each split carry the same row index.
X_train_new = pd.concat([X_train_num_transformed_df, X_train_cat_encoded_df], axis='columns')
X_test_new = pd.concat([X_test_num_transformed_df, X_test_cat_encoded_df], axis='columns')

In [None]:
# Fit a MinMax scaler using X_train_new and transform X_train_new and X_test_new. 
# Create new pandas dataframes from the resulting numpy arrays. Remember to set the correct columns names and indexes. 
# Name the resulting dataframes as: X_train_new_scaled_df and X_test_new_scaled_df

from sklearn.preprocessing import MinMaxScaler

# Fit on the training split only, then apply the same fitted scaler to both splits.
scaler = MinMaxScaler().fit(X_train_new)

X_train_new_scaled_arr = scaler.transform(X_train_new)
X_test_new_scaled_arr = scaler.transform(X_test_new)

# Back to dataframes, keeping the column names and row index of each split.
X_train_new_scaled_df = pd.DataFrame(
    X_train_new_scaled_arr, columns=X_train_new.columns, index=X_train_new.index
)
X_test_new_scaled_df = pd.DataFrame(
    X_test_new_scaled_arr, columns=X_test_new.columns, index=X_test_new.index
)

In [None]:
# Train a simple linear regression model using X_train_new_scaled_df, and get the predictions for the train and test sets
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression with default settings, fitted on the scaled training features.
lm = LinearRegression()
lm.fit(X_train_new_scaled_df,y_train)

In [None]:
# Predict on both splits so that train and test errors can be compared.
y_pred_train = lm.predict(X_train_new_scaled_df)
y_pred_test = lm.predict(X_test_new_scaled_df)

In [None]:
#Create a function that given a model prediction and real values returns a pandas dataframe with the following table:

In [None]:
def calculate_error_metrics (y_true, y_pred) -> pd.DataFrame:
    """
    Summarise the quality of a set of regression predictions.

    Inputs: y_true: the real target values
            y_pred: the values predicted by a model, in the same order
    Outputs: returns a dataframe with the columns `Error_metric` and `Value`
             and one row for each of MAE, MSE, RMSE, MAPE and R2, with the
             values rounded to two decimals
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mse,
        # Take the root of the MSE directly: the `squared=False` argument of
        # mean_squared_error is deprecated and absent from recent scikit-learn releases.
        'RMSE': mse ** 0.5,
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    results = {"Error_metric": list(metrics.keys()),
               "Value": list(metrics.values())}

    results_df = pd.DataFrame(results).round(2)
    return results_df

In [None]:
# Evaluate the linear model predictions using the previous function on the TRAIN and TEST sets
# Error metrics of the linear model on the training split.
calculate_error_metrics(y_train, y_pred_train)

In [None]:
# Error metrics of the linear model on the test split.
calculate_error_metrics(y_test, y_pred_test)

In [None]:
# Now define a function that takes as an input: list of models, X_train and y_train 
# to train several model (with default values) so we can train a lot of them without repeating code. 
# The function must return the list of trained models.

def _default_model(name: str):
    """Return a new default-configured regressor for a known model name, or None if the name is not recognised."""
    # Imports are local to each branch so that only the requested model's module is loaded.
    if name == 'LinearRegression':
        from sklearn.linear_model import LinearRegression
        return LinearRegression()
    if name == 'KNeighborsRegressor':
        from sklearn.neighbors import KNeighborsRegressor
        return KNeighborsRegressor()
    if name == 'MLPRegressor':
        from sklearn.neural_network import MLPRegressor
        return MLPRegressor()
    return None


def model_training (model_list: list, X_train, y_train) -> list:
    """
    Train several models on the same data.

    Inputs: model_list: list whose items are either a model name
                        ('LinearRegression', 'KNeighborsRegressor' or
                        'MLPRegressor', built with default settings) or an
                        already constructed estimator object exposing a
                        callable `fit` method
            X_train, y_train: training features and target, passed as they
                        are to each model's `fit`
    Outputs: returns the list of trained models, in input order. Items that
             are not recognised are reported with a printed warning and
             skipped, so the result can be shorter than `model_list`.
             Estimator objects passed in are fitted in place and returned
             as the same objects.
    """
    trained_models = []

    for item in model_list:
        if isinstance(item, str):
            model = _default_model(item)
        elif not isinstance(item, type) and callable(getattr(item, 'fit', None)):
            # An estimator instance supplied by the caller (classes are not instantiated here).
            model = item
        else:
            model = None

        if model is None:
            print("Warning: Unrecognized model", item)
            continue

        model.fit(X_train, y_train)
        trained_models.append(model)

    return trained_models



In [None]:
# Use the function to train the following models (with default settings):
# LinearRegression, KNeighborsRegressor, MLPRegressor

# Train the three default models on the scaled training features.
lm, knn, mlp = model_training(
    ['LinearRegression', 'KNeighborsRegressor', 'MLPRegressor'],
    X_train_new_scaled_df,
    y_train,
)

# Predictions of every model on the training split ...
y_train_lm_pred, y_train_knn_pred, y_train_mlp_pred = (
    fitted.predict(X_train_new_scaled_df) for fitted in (lm, knn, mlp)
)

# ... and on the test split.
y_test_lm_pred, y_test_knn_pred, y_test_mlp_pred = (
    fitted.predict(X_test_new_scaled_df) for fitted in (lm, knn, mlp)
)

In [None]:
# Evaluate the models with the function created earlier in the TRAIN and TEST sets. 
# Which model performs best with the default options?
# Headings printed above each model's metric table, in reporting order.
model_headers = ["Linear Model:", "\nKnearest neighbours:", "\nMLP Regressor:"]
train_predictions = [y_train_lm_pred, y_train_knn_pred, y_train_mlp_pred]
test_predictions = [y_test_lm_pred, y_test_knn_pred, y_test_mlp_pred]

print("TRAIN SET:\n")
for header, predictions in zip(model_headers, train_predictions):
    print(header)
    display(calculate_error_metrics(y_train, predictions))

print("\nTEST SET:\n")
for header, predictions in zip(model_headers, test_predictions):
    print(header)
    display(calculate_error_metrics(y_test, predictions))

In [None]:
# In the train set, the K Nearest neighbor is showing the best model performance according to all error metrics except the MAPE is showing that the MLP regressor is performing best.
# In the test set, The MLP regressor is showing the best model performance according to all the error metrics