In [2]:
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense
    print("TensorFlow and Keras imports successful.")
except ImportError as e:
    print(f"Error importing TensorFlow/Keras: {e}")


TensorFlow and Keras imports successful.


In [None]:
# A combination of all the functions used in this notebook for creating a pipeline
def combined_preprocessing(df, date_column='date', impute_column='dcoilwtico', impute_strategy='mean', columns_to_standardize=None):
    # Convert date column to datetime
    df[date_column] = pd.to_datetime(df[date_column])

    # Add date parts
    df['day'] = df[date_column].dt.day_name()
    df['month'] = df[date_column].dt.month
    df['week'] = df[date_column].dt.isocalendar().week.astype(int)
    df['year'] = df[date_column].dt.year

    # Analyze missing values and duplicates
    print("Proportion of missing values:")
    print((df.isnull().mean() * 100).round(2))
    print("\nNumber of duplicated rows:", df.duplicated().sum())

    # Impute missing values
    if impute_column in df.columns:
        imputer = SimpleImputer(strategy=impute_strategy)
        df[impute_column] = imputer.fit_transform(df[[impute_column]])
    else:
        print(f"Column '{impute_column}' not found in DataFrame. Skipping imputation step.")

    # Standardize certain columns
    if columns_to_standardize is not None:
        for column in columns_to_standardize:
            if column in df.columns:
                df[column] = df[column].apply(lambda x: 'Yes' if x is True else ('No' if x is False else ('Not Applicable' if pd.isna(x) else x)))
            else:
                print(f"Column '{column}' not found in DataFrame. Skipping standardization step for this column.")
    
     # Print value counts for standardized columns
    if columns_to_standardize is not None:
        for column in columns_to_standardize:
            if column in df.columns:
                print(df[column].value_counts())
                print()

    # Convert numeric columns to optimize memory usage
    float64_cols = df.select_dtypes(include=['float64'])
    df[float64_cols.columns] = float64_cols.astype('float32')
    float_cols = df.select_dtypes(include=['float'])
    df[float_cols.columns] = float_cols.apply(pd.to_numeric, downcast='float')
    int_cols = df.select_dtypes(include=['int'])
    df[int_cols.columns] = int_cols.apply(pd.to_numeric, downcast='integer')

    return df

# Define columns to standardize
columns_to_standardize = ['holiday_type', 'locale', 'locale_name', 'description', 'transferred']

# # Apply combined preprocessing function
# df_test_processed = combined_preprocessing(df_test, date_column='date', impute_column='dcoilwtico', columns_to_standardize=columns_to_standardize)
# print(df_test_processed)


In [None]:
# Creating the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('combined_preprocessing', FunctionTransformer(combined_preprocessing, 
                                                   kw_args={
                                                       'date_column': 'date', 
                                                       'impute_column': 'dcoilwtico', 
                                                       'columns_to_standardize': columns_to_standardize
                                                   }))
])

In [None]:
#  The data has been split already, the test data is different from the train data
# Read the first test data into a dataframe
test_1 = pd.read_csv(r"C:\Users\Safowaa\Documents\Azibiafrica\AzubiPython\The_Regression_Project\Project_data\test.csv")
test_1.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [None]:
# Read the second test data into a dataframe
test_2 = pd.read_csv(r"C:\Users\Safowaa\Documents\Azibiafrica\AzubiPython\The_Regression_Project\Project_data\sample_submission_test.csv")
test_2.head()

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0


In [None]:
# combine all the test data into one for testing
df_test = pd.merge(test_1, test_2 , on='id', how='left')

# Display a info of the dataframe
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           28512 non-null  int64  
 1   date         28512 non-null  object 
 2   store_nbr    28512 non-null  int64  
 3   family       28512 non-null  object 
 4   onpromotion  28512 non-null  int64  
 5   sales        28512 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 1.3+ MB


` Tranforming the test data set `

In [None]:
# Apply the pipeline to the test data
preprocessing_pipeline.fit_transform(df_test)
df_test


Proportion of missing values:
id             0.0
date           0.0
store_nbr      0.0
family         0.0
onpromotion    0.0
sales          0.0
day            0.0
month          0.0
week           0.0
year           0.0
dtype: float64

Number of duplicated rows: 0
Column 'dcoilwtico' not found in DataFrame. Skipping imputation step.
Column 'holiday_type' not found in DataFrame. Skipping standardization step for this column.
Column 'locale' not found in DataFrame. Skipping standardization step for this column.
Column 'locale_name' not found in DataFrame. Skipping standardization step for this column.
Column 'description' not found in DataFrame. Skipping standardization step for this column.
Column 'transferred' not found in DataFrame. Skipping standardization step for this column.


Unnamed: 0,id,date,store_nbr,family,onpromotion,sales,day,month,week,year
0,3000888,2017-08-16,1,AUTOMOTIVE,0,0.0,Wednesday,8,33,2017
1,3000889,2017-08-16,1,BABY CARE,0,0.0,Wednesday,8,33,2017
2,3000890,2017-08-16,1,BEAUTY,2,0.0,Wednesday,8,33,2017
3,3000891,2017-08-16,1,BEVERAGES,20,0.0,Wednesday,8,33,2017
4,3000892,2017-08-16,1,BOOKS,0,0.0,Wednesday,8,33,2017
...,...,...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1,0.0,Thursday,8,35,2017
28508,3029396,2017-08-31,9,PREPARED FOODS,0,0.0,Thursday,8,35,2017
28509,3029397,2017-08-31,9,PRODUCE,1,0.0,Thursday,8,35,2017
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,0.0,Thursday,8,35,2017


`Dataset Splitting`

In [None]:
# Split data for machine learning models
train_ml, val_ml = train_test_split(df_train, test_size=0.2, shuffle=True)

In [None]:
# Create a copy of the dataframe 
df_train_copy1 = df_train.copy()   #RandomForest
df_train_copy2 = df_train.copy()   #XGBoost
df_test_copy1 = df_test.copy()     #RandonForest
df_test_copy2 = df_test.copy()     #XGBoost

In [None]:
# Identify categorical and numerical columns
categorical_columns = ['City', 'family', 'day', 'state', 'store_type ','holiday_type', 'locale', 'locale_name', 'description', 'transferred' ]  

numerical_columns = df_train_copy1.columns.difference(categorical_columns + ['sales'])

In [None]:
# Identify categorical and numerical columns
categorical_columns = ['family', 'day']
numerical_columns = df_test_copy1.columns.difference(categorical_columns + ['sales'])

In [None]:
# Split data for RandomForest and XGBoost
train_ml1, val_ml1 = train_test_split(df_train_copy1, test_size=0.2, shuffle=True)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

In [None]:
# Split the data for machine learning model RandomForest

Xdf_train_copy1 = df_train_copy1.drop("sales", axis= 1)

ydf_train_copy1 = df_train_copy1["sales"]

Xdf_test_copy1 = df_test_copy1.drop("sales", axis= 1)

ydf_test_copy1 = df_test_copy1["sales"]

`Features Creation` **&** `Encoding `

1. For Time Series Models (ARIMA, SARIMA, ETS, Prophet):
- Time-based Features:
- Lag Features: Create features that represent past values of the target variable (e.g., sales) at different time lags.

In [None]:
df_train['lag_1'] = df_train['sales'].shift(1)
df_train['lag_2'] = df_train['sales'].shift(2)


- Lag features are created by shifting the target variable  (sales) by a certain number of time periods (lags). This means you use past values of the variable to predict future values.
- y including past values, the model can learn patterns such as trends and seasonality, improving its predictive accuracy.
- Lag features allow the model to understand how past values of the series affect the current value. This is important in time series data where previous observations can have a significant impact on future outcomes.

- Rolling Statistics: Calculate rolling means, sums, or standard deviations over a window.

In [None]:
df_train['rolling_mean_7'] = df_train['sales'].rolling(window=7).mean()


In [None]:
df_train['rolling_mean_7']

0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
              ...    
2805226    208.194427
2805227    226.701998
2805228    570.806141
2805229    523.916428
2805230    451.630713
Name: rolling_mean_7, Length: 2805231, dtype: float64

- Our  7-day rolling mean calculates the average of the current day and the previous six days.
- It smooths out short-term fluctuations and highlights longer-term trends or cycles in the data.
- It helps in identifying trends over time, making it easier to see patterns or changes in direction.
- Finally, helps to visualize trends in the data over a weekly period, reducing day-to-day variability.

- Seasonality and Trends:
- Seasonal Indicators: For SARIMA and ETS, use indicators for seasons or holidays.

In [None]:
df_train['is_holiday'] = df_train['holiday_type'].apply(lambda x: 1 if x == 'Holiday' else 0)


In [None]:
# Check for invalid dates in the date column
def check_invalid_dates(df, date_column='date'):
    try:
        pd.to_datetime(df[date_column])
        print("All dates are valid.")
    except Exception as e:
        print(f"Invalid date found: {e}")
        
check_invalid_dates(df_train)
check_invalid_dates(df_test)


All dates are valid.
All dates are valid.


In [None]:

# Identify and remove rows with invalid dates
def clean_invalid_dates(df, date_column='date'):
    df = df.reset_index()  # Reset index to ensure 'date' is a column
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
    invalid_dates = df[df[date_column].isna()]
    if not invalid_dates.empty:
        print("Removing rows with invalid dates:")
        print(invalid_dates)
        df = df.dropna(subset=[date_column])
    return df.set_index(date_column)  # Set 'date' column back as index

df_train = clean_invalid_dates(df_train)
df_test = clean_invalid_dates(df_test)

# Ensure the date columns are in the correct format for Prophet
df_train['ds'] = df_train.index
df_train['y'] = df_train['sales']
df_test['ds'] = df_test.index
df_test['y'] = df_test['sales']

In [None]:
def align_columns(df_train, df_test):
    # Get columns in df_train that are not in df_test
    missing_cols = set(df_train.columns) - set(df_test.columns)
    
    # Add missing columns to df_test and fill them with zeros
    for col in missing_cols:
        df_test[col] = 0
    
    return df_test

In [None]:
df_test = align_columns(df_train, df_test)

- For Machine Learning Models (RandomForest, XGBoost):
- Lag Features and Rolling Statistics: As with time series models, these are useful for capturing temporal patterns.

- Product of Features: Create new features by combining existing features to capture interactions.

In [None]:
df_train_copy1['sales_onpromotion'] = df_train_copy1['sales'] * df_train_copy1['onpromotion']
df_test_copy1['sales_onpromotion'] = df_test_copy1['sales'] * df_test_copy1['onpromotion']

In [None]:

# Add missing columns in the test dataset and fill them with zeros
missing_cols = set(df_train_copy1.columns) - set(df_test_copy1.columns)
for col in missing_cols:
    df_test_copy1[col] = 0


In [None]:

# Ensure consistent columns in train and test sets
df_train_copy1, df_test_copy1 = df_train_copy1.align(df_test, join='inner', axis=1, fill_value=0)


In [None]:
# List of categorical columns to be encoded
categorical_columns = ['family', 'city', 'day', 'state', 'store_type', 'holiday_type', 'locale', 'locale_name', 'description', 'transferred']

# Function to create a mapping dictionary for each categorical column
def create_mapping(df_train, df_test, column):
    unique_values = pd.concat([df_train[column], df_test[column]]).unique()
    return {value: idx for idx, value in enumerate(unique_values)}

# Create a dictionary to hold mappings for all categorical columns
mappings = {col: create_mapping(df_train_copy1, df_test_copy1, col) for col in categorical_columns}

# Apply the mappings to both df_train and df_test
for col in categorical_columns:
    df_train_copy1[col] = df_train_copy1[col].map(mappings[col])
    df_test_copy1[col] = df_test_copy1[col].map(mappings[col])


`Features Scaling`


- Feature Scaling: Normalize or standardize numerical features to help improve model performance.

In [None]:
# Convert categorical features using OneHotEncoder
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Ensure numerical_columns do not include datetime columns and are present in both DataFrames
numerical_columns_train = Xdf_train_copy1.select_dtypes(include=['number']).columns
numerical_columns_test = Xdf_test_copy1.select_dtypes(include=['number']).columns
# Find common numerical columns
common_numerical_columns = list(set(numerical_columns_train) & set(numerical_columns_test))

In [None]:
Xdf_train_copy1 = Xdf_train_copy1.drop(columns=['cluster', 'transactions', 'dcoilwtico'])

In [None]:
# Feature Scaling for numerical features
scaler = StandardScaler()
Xdf_train_copy1[common_numerical_columns] = scaler.fit_transform(Xdf_train_copy1[common_numerical_columns])
Xdf_test_copy1[common_numerical_columns] = scaler.transform(Xdf_test_copy1[common_numerical_columns])

# Print the scaled DataFrames to verify
print(Xdf_train_copy1.head())
print(Xdf_test_copy1.head())


         id       date  store_nbr      family  onpromotion      day     month  \
0 -1.791702 2013-01-01  -0.124337  AUTOMOTIVE    -0.223197  Tuesday -1.562782   
1 -1.791701 2013-01-01  -0.124337   BABY CARE    -0.223197  Tuesday -1.562782   
2 -1.791700 2013-01-01  -0.124337      BEAUTY    -0.223197  Tuesday -1.562782   
3 -1.791699 2013-01-01  -0.124337   BEVERAGES    -0.223197  Tuesday -1.562782   
4 -1.791698 2013-01-01  -0.124337       BOOKS    -0.223197  Tuesday -1.562782   

       week      year     city        state store_type holiday_type    locale  \
0 -1.658371 -1.419608  Salinas  Santa Elena          D      Holiday  National   
1 -1.658371 -1.419608  Salinas  Santa Elena          D      Holiday  National   
2 -1.658371 -1.419608  Salinas  Santa Elena          D      Holiday  National   
3 -1.658371 -1.419608  Salinas  Santa Elena          D      Holiday  National   
4 -1.658371 -1.419608  Salinas  Santa Elena          D      Holiday  National   

  locale_name         desc

In [None]:
# Temporarily reset the index to align the data
Xdf_train_copy1_reset = Xdf_train_copy1.reset_index()
Xdf_test_copy1_reset = Xdf_test_copy1.reset_index()
df_train_copy1_reset = df_train_copy1.reset_index()
df_test_copy1_reset = df_test_copy1.reset_index()

# Transfer 'sales' column to Xdf_train_copy1 and Xdf_test_copy1
Xdf_train_copy1_reset['sales'] = df_train_copy1_reset['sales']
Xdf_test_copy1_reset['sales'] = df_test_copy1_reset['sales']

# Restore the original date index
Xdf_train_copy1 = Xdf_train_copy1_reset.set_index('date')
Xdf_test_copy1 = Xdf_test_copy1_reset.set_index('date')

 Categorical Encoding:

- One-Hot Encoding: For categorical variables such as family, city, state

In [None]:
# Convert categorical features using OneHotEncoder
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

Xdf_train_copy1 = encoder.fit_transform(Xdf_train_copy1)
Xdf_test_copy1 = encoder.fit_transform(Xdf_test_copy1)

# Align the train and test sets
# Xdf_train_copy1_encoded, Xdf_test_copy1_encoded = Xdf_train_copy1_encoded.align(Xdf_test_copy1_encoded, join='left', axis=1, fill_value=0)


## Machine Learning Modeling 
### Model Creation

In [None]:
models = {}

models['ARIMA'] = lambda df: ARIMA(df['sales'], order=(5,1,0)).fit()
models['SARIMA'] = lambda df: SARIMAX(df['sales'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12)).fit()
models['ETS'] = lambda df: ExponentialSmoothing(df['sales'], seasonal='add', seasonal_periods=12).fit()
models['Prophet'] = lambda df: Prophet().fit(pd.DataFrame({'ds': df.index, 'y': df['sales']}))

In [None]:
def batch_train_predict(model, X_train, y_train, X_test, batch_size=10000):
    # Shuffle the training data
    X_train, y_train = shuffle(X_train, y_train)

    # Create empty lists to hold predictions
    val_predictions = []
    test_predictions = []

    # Determine the number of batches
    num_batches = int(np.ceil(X_train.shape[0] / batch_size))

    # Train in batches
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, X_train.shape[0])
        
        X_batch = X_train[start_idx:end_idx]
        y_batch = y_train[start_idx:end_idx]
        
        # Fit the model on the batch
        model.fit(X_batch, y_batch)

        # Predict on validation and test data
        val_pred_batch = model.predict(X_test)
        test_pred_batch = model.predict(X_test)

        val_predictions.extend(val_pred_batch)
        test_predictions.extend(test_pred_batch)

    return np.array(val_predictions), np.array(test_predictions)

In [None]:
def train_and_predict(model_name, df_train, val_ml, df_test):
    # Ensure the model name exists in the dictionary
    if model_name not in models:
        raise ValueError(f"Model '{model_name}' is not defined in the models dictionary.")
    
    # Initialize the model
    model = models[model_name](df_train)

    # Check model type and make predictions
    if model_name in ['ARIMA', 'SARIMA', 'ETS']:
        val_predictions = model.forecast(len(val_ml))
        test_predictions = model.forecast(len(df_test))
    elif model_name == 'Prophet':
        future_val = pd.DataFrame({'ds': val_ml.index})
        future_test = pd.DataFrame({'ds': df_test.index})
        val_predictions = model.predict(future_val)['yhat']
        test_predictions = model.predict(future_test)['yhat']
    else:  # For machine learning models
        val_predictions = model.predict(val_ml.drop('sales', axis=1))
        test_predictions = model.predict(df_test.drop('sales', axis=1))
    
    return val_predictions, test_predictions


## Model **1)**   ARIMA



### Call the Model

In [None]:
model_name = 'ARIMA' 

### Evaluate Model on Validation and testing data set

In [None]:
val_predictions, test_predictions = train_and_predict(model_name, df_train, val_ml, df_test)

### View Results

In [None]:
print("Validation Predictions:", val_predictions)
print("Test Predictions:", test_predictions)

Validation Predictions: 2805231    394.162823
2805232    745.195494
2805233    787.659829
2805234    722.388079
2805235    354.675957
              ...    
3366273    576.338300
3366274    576.338300
3366275    576.338300
3366276    576.338300
3366277    576.338300
Name: predicted_mean, Length: 561047, dtype: float64
Test Predictions: 2805231    394.162823
2805232    745.195494
2805233    787.659829
2805234    722.388079
2805235    354.675957
              ...    
2833738    576.338300
2833739    576.338300
2833740    576.338300
2833741    576.338300
2833742    576.338300
Name: predicted_mean, Length: 28512, dtype: float64


## Model **3)** ETS

### Call the Model

In [None]:
model_name = 'ETS'

### Evaluate Model on Validation and testing data set

In [None]:
val_predictions, test_predictions = train_and_predict(model_name, df_train, val_ml, df_test)

### View Results

In [None]:
print("Validation Predictions:", val_predictions)
print("Test Predictions:", test_predictions)

Validation Predictions: 2805231    1092.234801
2805232     220.136296
2805233     172.646969
2805234    1294.874635
2805235     236.930000
              ...     
3366273    1099.836149
3366274     228.045084
3366275     181.546279
3366276    1019.008406
3366277     235.275733
Length: 561047, dtype: float64
Test Predictions: 2805231    1092.234801
2805232     220.136296
2805233     172.646969
2805234    1294.874635
2805235     236.930000
              ...     
2833738     228.045084
2833739     181.546279
2833740    1019.008406
2833741     235.275733
2833742     163.793417
Length: 28512, dtype: float64


## Model **4)** Prophet

### Call the model

In [None]:
model_name ='Prophet'

### Evaluate Model on Validation and testing data set

In [None]:
val_predictions, test_predictions = train_and_predict(model_name, df_train, val_ml, df_test)

### View Results

In [None]:
print("Validation Predictions:", val_predictions)
print("Test Predictions:", test_predictions)

Validation Predictions: 0         259.430920
1         259.430920
2         259.430920
3         259.430920
4         259.430920
             ...    
561042    415.830814
561043    415.830814
561044    415.830814
561045    415.830814
561046    415.830814
Name: yhat, Length: 561047, dtype: float64
Test Predictions: 0        431.278407
1        431.278407
2        431.278407
3        431.278407
4        431.278407
            ...    
28507    421.910667
28508    421.910667
28509    421.910667
28510    421.910667
28511    421.910667
Name: yhat, Length: 28512, dtype: float64


## Model **5)** RandomForest

In [None]:
def batch_train_predict(model, Xdf_train_copy1, val_ml1, Xdf_test_copy1, batch_size=10000):
    # Shuffle the training data
    Xdf_train_copy1 = shuffle(Xdf_train_copy1)

    # Create empty lists to hold predictions
    val_predictions = []
    test_predictions = []

    # Determine the number of batches
    num_batches = int(np.ceil(Xdf_train_copy1.shape[0] / batch_size))

    # Train in batches
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, Xdf_train_copy1.shape[0])
        
        batch = Xdf_train_copy1.iloc[start_idx:end_idx]
        
        # Ensure the batch has no datetime columns
        batch = batch.select_dtypes(exclude=['datetime64'])
        
        # Fit the model on the batch
        model.fit(batch, batch['sales'])

        # Predict on validation and test data
        val_pred_batch = model.predict(val_ml1.drop('sales', axis=1))
        test_pred_batch = model.predict(Xdf_test_copy1.drop('sales', axis=1))

        val_predictions.extend(val_pred_batch)
        test_predictions.extend(test_pred_batch)

    return np.array(val_predictions), np.array(test_predictions)


In [None]:

# Define models with batch processing
models = {}

models['RandomForest'] = lambda df: batch_train_predict(RandomForestRegressor(n_estimators=100), Xdf_train_copy1, val_ml1, Xdf_test_copy1, batch_size=10000)
models['XGBoost'] = lambda df: batch_train_predict(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100), df, val_ml1, Xdf_test_copy1, batch_size=10000)


### Call the model

In [None]:

# Train and evaluate models
model_name = 'RandomForest'


### Evaluate Model on Validation and testing data set

In [None]:
val_predictions, test_predictions = train_and_predict(model_name, df_train_copy1, val_ml1, df_test_copy1)


ValueError: could not convert string to float: 'EGGS'

### View Results

In [None]:
print("Validation Predictions:", val_predictions)
print("Test Predictions:", test_predictions)

## Model **6)** XGBoost

### Call the model

In [None]:
model_name ='XGBoost'

### Evaluate Model on Validation and testing data set

In [None]:
val_predictions, test_predictions = train_and_predict(model_name, df_train_copy1, val_ml1, df_test_copy1)

### View Results

In [None]:
print("Validation Predictions:", val_predictions)
print("Test Predictions:", test_predictions)

## Models comparison


In [None]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return mae, mse, rmse

In [None]:
results = []

for model_name in models.keys():
    try:
        val_predictions, test_predictions = train_and_predict(model_name, df_train, val_ml, df_test)
        mae, mse, rmse = evaluate_model(val_ml['sales'], val_predictions)
        results.append({
            'Model_Name': model_name,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'Details': 'Validation Set'
        })
    except Exception as e:
        print(f"Error with model {model_name}: {e}")

In [None]:
results1 = []

for model_name in models.keys():
    try:
        val_predictions, test_predictions = batch_train_predict(model_name, df_train_copy1, val_ml1, df_test_copy1)
        mae, mse, rmse = evaluate_model(val_ml1['sales'], val_predictions)
        results.append({
            'Model_Name': model_name,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'Details': 'Validation Set'
        })
    except Exception as e:
        print(f"Error with model {model_name}: {e}")

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Convert results to DataFrame
results_df1 = pd.DataFrame(results1)

In [None]:
# Sort the DataFrame by MAE (or any other metric)
results_df = results_df.sort_values(by='MAE')
print(results_df)

Create a pandas dataframe that will allow you to compare your models.


|     | Model_Name     | Metric (metric_name)    | Details  |
|:---:|:--------------:|:--------------:|:-----------------:|
| 0   |  -             |  -             | -                 |
| 1   |  -             |  -             | -                 |


You might use the pandas dataframe method `.sort_values()` to sort the dataframe regarding the metric.

## Hyperparameters tuning 

Fine-tune the Top-k models (3 < k < 5) using a ` GridSearchCV`  (that is in sklearn.model_selection
) to find the best hyperparameters and achieve the maximum performance of each of the Top-k models, then compare them again to select the best one.

# Export key components
Here is the section to **export** the important ML objects that will be use to develop an app: *Encoder, Scaler, ColumnTransformer, Model, Pipeline, etc*.