In [39]:
%pip install scikit-learn mlflow pandas category_encoders

Note: you may need to restart the kernel to use updated packages.


In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from typing import Dict, List, Tuple, Any

In [41]:
# Read "autos.csv" file into "data" DataFrame
data = pd.read_csv("autos.csv")

---
# Cleaning

In [42]:
# Remove duplicate rows from the "data" DataFrame
data = data.drop_duplicates()

# Print the number of missing values
print("NaN values in data: " + str(data.isnull().sum().sum()))

# Print the data type of each column
print(data.dtypes)


NaN values in data: 0
YEAR                  int64
MAKE                 object
MODEL                object
VEHICLE CLASS        object
ENGINE SIZE         float64
CYLINDERS             int64
TRANSMISSION         object
FUEL                 object
FUEL CONSUMPTION    float64
HWY (L/100 km)      float64
COMB (L/100 km)     float64
COMB (mpg)            int64
EMISSIONS             int64
dtype: object


In [43]:
# Target Encode the follwing columns
columns_to_encode = ["MAKE", "VEHICLE CLASS", "TRANSMISSION"]
encoder = ce.TargetEncoder(cols=columns_to_encode)
encoder.fit(data[columns_to_encode], data["COMB (L/100 km)"])
encoded_data = encoder.transform(data[columns_to_encode])
encoded_data.columns = [f"{col}_enc" for col in columns_to_encode]
data = pd.concat([encoded_data, data], axis=1)

In [44]:
# Recode the values in the "FUEL" column
data["FUEL"] = data["FUEL"].replace({"D": 1, "X": 2, "Z": 3, "N": 4, "E": 5})
# Remove the following columns from the "data" DataFrame
columns_to_remove = ["MODEL","MAKE", "VEHICLE CLASS", "FUEL CONSUMPTION", "HWY (L/100 km)", "COMB (mpg)", "EMISSIONS"]
data = data.drop(columns_to_remove, axis=1)

print(data.head())


   MAKE_enc  VEHICLE CLASS_enc  TRANSMISSION_enc  YEAR  ENGINE SIZE  \
0  9.398477           9.180804         12.304974  2000          1.6   
1  9.398477           9.180804          9.381036  2000          1.6   
2  9.398477           9.791304         10.849480  2000          3.2   
3  9.398477           9.791304         12.304974  2000          3.5   
4  9.398477           9.908146         12.304974  2000          1.8   

   CYLINDERS TRANSMISSION  FUEL  COMB (L/100 km)  
0          4           A4     2              8.1  
1          4           M5     2              7.6  
2          6          AS5     3             10.0  
3          6           A4     3             11.5  
4          4           A4     2              8.6  


---
# Feature Engeneering

In [45]:
# Create a new feature named "ENGINE_CYLINDER_RATIO" as the ratio of "ENGINE SIZE" to "CYLINDERS"
data["ENGINE_CYLINDER_RATIO"] = data["ENGINE SIZE"] / data["CYLINDERS"]

# Create a new feature named "TRANSMISSION_TYPE" indicating whether the car has an automatic transmission
data["TRANSMISSION_TYPE"] = data["TRANSMISSION"].apply(lambda x: 0 if "A" in x else 1)

# Remove the "TRANSMISSION" column from the "data" DataFrame
data = data.drop("TRANSMISSION", axis=1)

print(data.head())

   MAKE_enc  VEHICLE CLASS_enc  TRANSMISSION_enc  YEAR  ENGINE SIZE  \
0  9.398477           9.180804         12.304974  2000          1.6   
1  9.398477           9.180804          9.381036  2000          1.6   
2  9.398477           9.791304         10.849480  2000          3.2   
3  9.398477           9.791304         12.304974  2000          3.5   
4  9.398477           9.908146         12.304974  2000          1.8   

   CYLINDERS  FUEL  COMB (L/100 km)  ENGINE_CYLINDER_RATIO  TRANSMISSION_TYPE  
0          4     2              8.1               0.400000                  0  
1          4     2              7.6               0.400000                  1  
2          6     3             10.0               0.533333                  0  
3          6     3             11.5               0.583333                  0  
4          4     2              8.6               0.450000                  0  


---
# Split des Datensatzes

In [46]:

# Separate the target variable from the features
X = data.drop("COMB (L/100 km)", axis=1)
y = data["COMB (L/100 km)"]

# Split the dataset into training and temporary sets (85% training + validation, 15% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=2)

# Split the temporary set again into training and validation sets (70% / 15% of the total amount)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(0.15 / 0.85), random_state=2)

---
# ML Methode

In [47]:
# Set the MLflow experiment name
mlflow.set_experiment("random_forest_regressor_hyperparameter_tuning")

# Define hyperparameter search space
n_estimators = [10, 50, 100, 200]
max_depth = [None, 10, 20, 30]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
max_features = [1.0, 'sqrt']



2023/04/13 16:24:37 INFO mlflow.tracking.fluent: Experiment with name 'random_forest_regressor_hyperparameter_tuning' does not exist. Creating a new experiment.


---
# Training - Tuning

In [48]:

def train_rf_model(params: Dict[str, Any], X_train: pd.DataFrame, y_train: pd.Series,
                   X_val: pd.DataFrame, y_val: pd.Series, random_state: int = 1,
                   log_best_model: bool = False) -> float:
    """
    Train a RandomForestRegressor model with the specified hyperparameters, log the hyperparameters
    and the mean squared error using MLflow, and return the mean squared error. Optionally log
    the model if log_best_model is set to True.
    
    :param params: Dictionary containing the hyperparameters for the RandomForestRegressor model.
    :param X_train: Training dataset features.
    :param y_train: Training dataset target.
    :param X_val: Validation dataset features.
    :param y_val: Validation dataset target.
    :param random_state: Random state for reproducibility (default: 1).
    :param log_best_model: Log the model if set to True (default: False).
    :return: Mean squared error on the validation dataset.
    """
    with mlflow.start_run():
        # Create the model with the specified hyperparameters
        rf = RandomForestRegressor(n_estimators=params['n_estimators'],
                                   max_depth=params['max_depth'],
                                   min_samples_split=params['min_samples_split'],
                                   min_samples_leaf=params['min_samples_leaf'],
                                   max_features=params['max_features'],
                                   random_state=random_state)
        
        # Train the model
        rf.fit(X_train, y_train)
        
        # Predictions on the validation set
        y_pred = rf.predict(X_val)
        
        # Calculate the mean squared error
        mse = mean_squared_error(y_val, y_pred)
        
        # Log the hyperparameters and the mean squared error
        mlflow.log_params(params)
        mlflow.log_metric("mse", mse)
       
        
        # Log the model with the best parameter combination if log_best_model is True
        if log_best_model:
            mlflow.sklearn.log_model(rf, "random_forest_model")
        
        return mse



In [49]:

def find_best_hyperparameters(n_estimators: List[int], max_depth: List[int], min_samples_split: List[int],
                              min_samples_leaf: List[int], max_features: List[str]) -> Tuple[float, Dict[str, int]]:
    """
    Find the best hyperparameters for a RandomForest model using a grid search approach.
    
    :param n_estimators: List of possible values for n_estimators.
    :param max_depth: List of possible values for max_depth.
    :param min_samples_split: List of possible values for min_samples_split.
    :param min_samples_leaf: List of possible values for min_samples_leaf.
    :param max_features: List of possible values for max_features.
    :return: Tuple containing the best valid MSE and the best parameters as a dictionary.
    """
    best_mse = float('inf')
    best_params = None

    # Iterate through all possible combinations of hyperparameters
    for n in n_estimators:
        for d in max_depth:
            for s in min_samples_split:
                for l in min_samples_leaf:
                    for f in max_features:
                        params = {
                            'n_estimators': n,
                            'max_depth': d,
                            'min_samples_split': s,
                            'min_samples_leaf': l,
                            'max_features': f
                        }
                        
                        # Train the model with the current combination of hyperparameters and calculate the MSE
                        mse = train_rf_model(params, X_train, y_train, X_val, y_val, False)
                        
                        # Update the best MSE and parameters if the current MSE is lower
                        if mse < best_mse:
                            best_mse = mse
                            best_params = params

    return best_mse, best_params


In [50]:

# Call the find_best_hyperparameters function with the defined search space
best_mse, best_params = find_best_hyperparameters(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features)

# Print the best MSE and parameters found during the hyperparameter search
print("Best Valid MSE:", best_mse)
print("Best parameters:", best_params)


Best Valid MSE: 0.3454593919560358
Best parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}


---
# Evaluation

In [51]:
# Create the model with the best hyperparameters
test_mse = train_rf_model(best_params, X_train_val, y_train_val, X_test, y_test, log_best_model=True)

print("Test MSE:", test_mse)

Test MSE: 0.3123275990714805
