# Setup
In this notebook section, we will import the libraries needed to run this code.

In [1]:
import os
import re

import joblib
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Constants
In a Jupyter Notebook, creating constant variables can be important for several reasons:

* **Readability and Maintainability**: Using constant variables with meaningful names can improve the readability of your code. It makes it easier for others (or even yourself in the future) to understand the purpose of the values being used throughout the notebook.

* **Code Consistency**: By defining constants, you ensure that specific values are consistently used across the notebook. If you need to change the value later, you only have to modify it in one place, reducing the risk of errors due to inconsistent values.

* **Preventing Magic Numbers**: Magic numbers are hardcoded numeric values scattered throughout the code without any explanation or context. Using constants instead of magic numbers makes the code self-documenting and provides context for the values used.

* **Flexibility**: If you need to change a value that is used in multiple places, having it defined as a constant allows you to change it once, and the change will automatically apply throughout the notebook.

* **Easy Debugging**: When debugging the code, having constants allows you to quickly check the values being used in different parts of the notebook without having to search for where they are defined.

* **Unit Testing**: If you plan to write unit tests for your code, using constants can make it easier to define test cases and assert expected results.

In [18]:
# --- Data locations ---------------------------------------------------------
DATASETS_DIR = './data/'
# NOTE(review): absolute, machine-specific path; it only resolves on the original author's machine.
URL = 'C:/Users/rbernal/Documents/GitHub/Proyecto/FAE/data/data_fire.csv'
RETRIEVED_DATA = 'data_fire.csv'
TRAIN_DATA_FILE = f'{DATASETS_DIR}train.csv'
TEST_DATA_FILE = f'{DATASETS_DIR}test.csv'

# --- Random seeds -----------------------------------------------------------
SEED_SPLIT = 404
SEED_MODEL = 404

# --- Dataset schema ---------------------------------------------------------
TARGET = 'STATUS'
CATEGORICAL_VARS = ['FUEL']
NUMERICAL_VARS = ['SIZE', 'DISTANCE', 'DESIBEL', 'AIRFLOW', 'FREQUENCY']
FEATURES = ['SIZE', 'FUEL', 'DISTANCE', 'DESIBEL', 'AIRFLOW', 'FREQUENCY']

# Columns kept by the feature-selection step. The raw 'FUEL' column is listed
# next to its dummy columns; 'FUEL_gasoline' is intentionally left out.
SELECTED_FEATURES = [
    'SIZE',
    'FUEL',
    # 'FUEL_gasoline',
    'FUEL_lpg',
    'FUEL_kerosene',
    'FUEL_thinner',
    'DISTANCE',
    'DESIBEL',
    'AIRFLOW',
    'FREQUENCY',
]

# Functions
Writing functions helps us in several ways, for example:
* **Modularity**: Functions allow you to break down complex problems into smaller, manageable pieces. Each function can handle a specific task, making the code easier to understand, test, and maintain. This concept is known as "modularity."

* **Reusability**: Once you define a function, you can use it multiple times throughout your code or even in other projects. This promotes code reuse and saves time since you don't have to rewrite the same logic each time you need it.

In [None]:
def retrieve_data(url, datasets_dir=None, filename=None):
    """
    Load a CSV file and store a copy of it in the local datasets directory.

    Parameters:
        url (str): Path or URL of the CSV file to read with pandas.
        datasets_dir (str, optional): Directory where the copy is written.
            Defaults to the notebook constant DATASETS_DIR.
        filename (str, optional): Name of the stored CSV file.
            Defaults to the notebook constant RETRIEVED_DATA.

    Returns:
        str: Message with the path the data was written to.
    """
    # Resolve the defaults at call time so the function uses the current constants
    target_dir = DATASETS_DIR if datasets_dir is None else datasets_dir
    target_name = RETRIEVED_DATA if filename is None else filename

    # Loading data from specific path
    data = pd.read_csv(url)

    # Create directory if it does not exist
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        print(f"Directory '{target_dir}' created successfully.")
    else:
        print(f"Directory '{target_dir}' already exists.")

    # Save data to CSV file
    output_path = os.path.join(target_dir, target_name)
    data.to_csv(output_path, index=False)

    return f'Data stored in {output_path}'


retrieve_data(URL)

# Custom Transformers
Custom transformers are really important if we want high-quality code that can be maintained, changed, and reused by other pieces of code.

The following code is the migration from [3_create_convenient_classes.ipynb](3_create_convenient_classes.ipynb) notebook.

In [15]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer to perform one-hot encoding for categorical variables.

    The dummy columns are learned once in `fit` (dropping the first category of each
    variable) and every later `transform` call produces exactly those columns, whatever
    categories happen to be present in the data being transformed.

    Parameters:
        variables (list or str, optional): List of column names (variables) to perform one-hot encoding for.
            If a single string is provided, it will be treated as a single variable. Default is None.

    Attributes:
        variables (list): List of column names (variables) to perform one-hot encoding for.
        dummies (pd.Index): Column names of the one-hot encoded dummy variables learned in `fit`.

    Methods:
        fit(X, y=None):
            Calculates the one-hot encoded dummy variable columns for the specified categorical variables from the training data.
            It returns the transformer instance itself.

        transform(X):
            Appends the dummy columns learned in `fit` to a copy of X and returns it.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer
    encoder = OneHotEncoder(variables=['category1', 'category2'])

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('encoder', encoder),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variables=None):
        """
        Initialize the OneHotEncoder transformer.

        Parameters:
            variables (list or str, optional): List of column names (variables) to perform one-hot encoding for.
                If a single string is provided, it will be treated as a single variable. Default is None.
        """
        self.variables = [variables] if not isinstance(variables, list) else variables

    def fit(self, X, y=None):
        """
        Calculates the one-hot encoded dummy variable columns for the specified categorical variables from the training data.

        Parameters:
            X (pd.DataFrame): Training data containing the columns listed in `variables`.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self (OneHotEncoder): The transformer instance.
        """
        # drop_first is applied only here, so the dropped (reference) category is fixed by the training data
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        """
        Appends the dummy columns learned in `fit` to a copy of the input DataFrame.

        Parameters:
            X (pd.DataFrame): Input data containing the columns listed in `variables`.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with one column added per dummy learned in `fit`.
                The original categorical columns are kept in the output.
        """
        X = X.copy()

        # Encode every category present in X and then align to the training dummies.
        # Re-applying drop_first here would drop whichever category sorts first in *this*
        # data (for a single row: its only category), producing wrong encodings.
        # reindex adds dummies absent from X as 0 and discards categories unseen in fit.
        encoded = pd.get_dummies(X[self.variables]).reindex(columns=self.dummies, fill_value=0)

        # The source columns are intentionally left in place: the previous code called
        # X.drop(...) without using its result, so they were never removed.
        X = pd.concat([X, encoded], axis=1)

        return X


In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that keeps only a chosen set of DataFrame columns.

    Parameters:
        feature_names (list or array-like): Names of the columns to keep.

    Methods:
        fit(X, y=None):
            Does nothing and returns the transformer itself.

        transform(X):
            Returns the columns named in `feature_names` from the input DataFrame.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Columns that should reach the next pipeline step
    selected_features = ['feature1', 'feature2', 'feature3']

    pipeline = Pipeline([
        ('feature_selector', FeatureSelector(feature_names=selected_features)),
        # Other pipeline steps...
    ])

    X_transformed = pipeline.fit_transform(X)
    ```
    """

    def __init__(self, feature_names):
        """
        Store the names of the columns to keep.

        Parameters:
            feature_names (list or array-like): Names of the columns to keep.
        """
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """
        No-op fit; nothing is learned from the data.

        Parameters:
            X (pd.DataFrame): Input data (unused).
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self (FeatureSelector): The transformer instance.
        """
        return self

    def transform(self, X):
        """
        Return the configured subset of columns.

        Parameters:
            X (pd.DataFrame): Input data containing every column in `feature_names`.

        Returns:
            X_selected (pd.DataFrame): DataFrame holding only the columns in `feature_names`.
        """
        wanted_columns = self.feature_names
        X_selected = X[wanted_columns]
        return X_selected


In [28]:
class OrderingFeatures(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer to order features (columns) in the same order as they appeared in the training data.

    For DataFrame input it also removes the columns listed in `drop_columns`
    (by default the raw 'FUEL' column) from the output.

    Parameters:
        drop_columns (list, tuple or str, optional): Column names removed from DataFrame output.
            Default is ('FUEL',). Pass an empty tuple to keep every column.

    Attributes:
        ordered_features (pd.Index or np.ndarray): Column names (DataFrame input) or column positions
            (numpy input) in the order seen during `fit`.

    Methods:
        fit(X, y=None):
            Records the order of features from the training data and returns the transformer instance itself.

        transform(X):
            Returns the features in the order recorded during `fit`, without the columns in `drop_columns`.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer
    feature_orderer = OrderingFeatures()

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('feature_orderer', feature_orderer),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, drop_columns=('FUEL',)):
        """
        Initialize the OrderingFeatures transformer.

        Parameters:
            drop_columns (list, tuple or str, optional): Column names removed from DataFrame output.
                Default is ('FUEL',).
        """
        self.drop_columns = drop_columns

    def fit(self, X, y=None):
        """
        Records the order of features from the training data.

        Parameters:
            X (pd.DataFrame or np.ndarray): Training data.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self (OrderingFeatures): The transformer instance.

        Raises:
            ValueError: If X is neither a pandas DataFrame nor a numpy array.
        """
        if isinstance(X, pd.DataFrame):
            self.ordered_features = X.columns
        elif isinstance(X, np.ndarray):
            self.ordered_features = np.arange(X.shape[1])
        else:
            raise ValueError("Input X must be a pandas DataFrame or a numpy array.")
        return self

    def transform(self, X):
        """
        Reorders the features in the same order as they appeared in the training data.

        Parameters:
            X (pd.DataFrame or np.ndarray): Input data to be transformed.

        Returns:
            X_transformed (pd.DataFrame or np.ndarray): For a DataFrame, a new DataFrame with the
                training columns in training order, minus `drop_columns`; the input is not modified.
                For a numpy array, the columns indexed by the positions recorded in `fit`.

        Raises:
            ValueError: If X is neither a pandas DataFrame nor a numpy array.
        """
        if isinstance(X, pd.DataFrame):
            # Accept a bare string as a single column name
            if isinstance(self.drop_columns, str):
                excluded = {self.drop_columns}
            else:
                excluded = set(self.drop_columns or ())

            # Selecting by the recorded order both reorders and returns a new frame, so the
            # caller's DataFrame is left untouched (the previous in-place drop mutated it and
            # the reordering expression's result was discarded).
            kept_columns = [col for col in self.ordered_features if col not in excluded]
            return X[kept_columns]
        elif isinstance(X, np.ndarray):
            return X[:, self.ordered_features]
        else:
            raise ValueError("Input X must be a pandas DataFrame or a numpy array.")


# Pipeline
The code below builds a scikit-learn pipeline called `transformations_pipeline`, which is used for data preprocessing in a classification task on the Acoustic Extinguisher Fire dataset; the model is then fit on its output. Each step described below corresponds to a specific data transformation or modeling step.

* **`OneHotEncoder`**: This is a custom transformer that performs one-hot encoding for categorical variables. It takes the CATEGORICAL_VARS as input, which represents a list of categorical column names to be one-hot encoded. It creates binary dummy variables for each category.

* **`OrderingFeatures`**: This is a custom transformer that orders the features (columns) in the same order as they appeared in the training data. It ensures that the order of columns in the transformed dataset is consistent with the order in which the pipeline was trained.

* **`MinMaxScaler`**: This step scales the numerical features to a specified range, typically between 0 and 1, using the Min-Max scaling technique.

* **`LogisticRegression`**: This is the final modeling step. It is not a step of `transformations_pipeline`; the logistic regression model is fit separately on the preprocessed dataset. The model is specified with hyperparameters C=0.0005, class_weight='balanced', and random_state=SEED_MODEL. The C parameter is the inverse of the regularization strength (smaller values mean stronger regularization), 'balanced' sets the class weights to be inversely proportional to the class frequencies to handle class imbalance, and random_state is used for reproducibility.

In [29]:
# Load the locally stored copy of the dataset
df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)

# Hold out 20% of the rows for testing; the seed comes from the constants cell
# so the split is reproducible and configured in a single place.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(TARGET, axis=1),
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT,
)

In [30]:
# Preprocessing steps, applied in order: one-hot encode the categorical columns,
# keep the selected features, align the column order, and scale the result.
# The constants are module-level names in this notebook, so they are referenced
# directly (there is no `self` outside of a class).
transformations_pipeline = Pipeline(
    [
        ('dummy_vars', OneHotEncoder(variables=CATEGORICAL_VARS)),
        ('feature_selector', FeatureSelector(SELECTED_FEATURES)),
        ('aligning_feats', OrderingFeatures()),
        ('scaling', MinMaxScaler()),
    ]
)


In [None]:
# Fit every preprocessing step on the training split and keep the transformed result.
# NOTE(review): X_train is overwritten here, so re-running this cell alone feeds the
# already-transformed data back into the pipeline.
X_train = transformations_pipeline.fit_transform(X_train)

In [None]:
# Train the classifier on the preprocessed training features.
logistic_regression = LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)
logistic_regression.fit(X_train, y_train)

In [None]:
# Apply the preprocessing learned on the training split; only transform here.
# Calling fit_transform would refit the encoder and the scaler on the test data,
# so the test features would be scaled differently from what the model was trained on.
X_test = transformations_pipeline.transform(X_test)

In [34]:
# Sanity check of the transformed test matrix: (rows, columns).
# NOTE(review): the recorded output may predate the current SELECTED_FEATURES; TODO confirm by re-running.
X_test.shape

(262, 13)

In [35]:
# Predicted class labels for the test split.
# NOTE(review): y_pred is not referenced again in the visible cells; the evaluation
# cell computes the same predictions as class_pred.
y_pred = logistic_regression.predict(X_test)

In [None]:
# Class labels, plus the probability column of the second class, for the test split
class_pred = logistic_regression.predict(X_test)
proba_pred = logistic_regression.predict_proba(X_test)[:, 1]

# Report ranking quality (ROC-AUC on probabilities) and accuracy (on class labels)
test_roc_auc = roc_auc_score(y_test, proba_pred)
print(f'test roc-auc : {test_roc_auc}')
test_accuracy = accuracy_score(y_test, class_pred)
print(f'test accuracy: {test_accuracy}')
print()

# Persisting the trained model

In [54]:
TRAINED_MODEL_DIR = 'trained_models/'
PIPELINE_NAME = 'logistic_regression'
PIPELINE_SAVE_FILE = f'{PIPELINE_NAME}_output.pkl'

# joblib.dump does not create missing parent directories, so make sure the target exists
os.makedirs(TRAINED_MODEL_DIR, exist_ok=True)

# Save the model using joblib
# NOTE(review): only the classifier is persisted; the fitted transformations_pipeline
# is still needed to preprocess inputs before calling the loaded model.
save_path = TRAINED_MODEL_DIR + PIPELINE_SAVE_FILE
joblib.dump(logistic_regression, save_path)

['trained_models/logistic_regression_output.pkl']

# Predictions

**Basic input validation**

input_data = X_test.copy()

**Making predictions**

In [57]:
# Sample single input data in dictionary format.
# It uses the raw schema the pipeline was fitted on: 'FUEL' is passed as a label,
# not as pre-computed dummy columns, because the one-hot encoding step reads X['FUEL'].
single_input_data = {
        "SIZE": 3,
        "FUEL": "kerosene",
        "DISTANCE": 100,
        "DESIBEL": 104,
        "AIRFLOW": 8.8,
        "FREQUENCY": 45
}
# Convert the single input data to a DataFrame
single_input_df = pd.DataFrame([single_input_data])

# Declare the full set of fuel labels so that encoding a single row yields the same
# dummy columns as during training (a one-row string column only knows its own label).
# assumes the training data contains exactly these FUEL labels; TODO confirm against the dataset
FUEL_LEVELS = ['gasoline', 'kerosene', 'lpg', 'thinner']
single_input_df['FUEL'] = pd.Categorical(single_input_df['FUEL'], categories=FUEL_LEVELS)

# Preprocess the single input data using the transformations_pipeline
preprocessed_single_input = transformations_pipeline.transform(single_input_df)

# Load the model using joblib
trained_model = joblib.load(save_path)

# Predict the target value using the loaded model
predicted_value = trained_model.predict(preprocessed_single_input)

print(predicted_value)

[1]


# Extra
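Use this code to debug the Custom Transformer pipeline. Note that the commented-out cell below is carried over from a Titanic template: it refers to transformers and constants (e.g. `CategoricalImputer`, `RareLabelCategoricalEncoder`, `MissingIndicator`, `NumericalImputer`, `CATEGORICAL_VARS_WITH_NA`, `NUMERICAL_VARS_WITH_NA`) that are not defined in this notebook and must be adapted before it can run.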

In [None]:
# from sklearn.compose import ColumnTransformer

# # Define the debug_print function to print DataFrame or array
# def debug_print(X):
#     if isinstance(X, pd.DataFrame):
#         print(X.head())  # Print the first few rows of the DataFrame
#     elif isinstance(X, np.ndarray):
#         print(X[:5])  # Print the first 5 rows of the array
        

# # Define the preprocessor for categorical variables
# categorical_preprocessor = Pipeline([
#     ('categorical_imputer', CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)),
#     ('rare_labels', RareLabelCategoricalEncoder(tol=0.05, variables=CATEGORICAL_VARS)),
#     ('dummy_vars', OneHotEncoder(variables=CATEGORICAL_VARS))
# ])

# # Define the preprocessor for numerical variables
# numerical_preprocessor = Pipeline([
#     ('missing_indicator', MissingIndicator(variables=NUMERICAL_VARS)),
#     # ('cabin_only_letter', ExtractLetters()),
#     ('median_imputation', NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
#     ('scaling', MinMaxScaler())
# ])

# # Use ColumnTransformer to apply the different preprocessors to their respective columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('categorical', categorical_preprocessor, CATEGORICAL_VARS),
#         ('numerical', numerical_preprocessor, NUMERICAL_VARS)
#     ]
# )

# # Combine the preprocessor with the logistic regression model in the final pipeline
# titanic_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('aligning_feats', OrderingFeatures()),
#     ('log_reg', LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL))
# ])

# # Debug each output after transformation
# X_train_transformed = titanic_pipeline['preprocessor'].fit_transform(X_train)
# debug_print(X_train_transformed)

# # Fit the model
# titanic_pipeline['log_reg'].fit(X_train_transformed, y_train)