# Notebook per regressione logistica

Di seguito:
- verrà implementato l'algoritmo di regressione logistica

## Scaricamento dei dati

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import requests
import zipfile
import io
import os
import glob
import pandas as pd

class DataDownloaderExtractor(BaseEstimator, TransformerMixin):
    """
    A custom transformer to download and extract data from given URLs.

    Each URL is expected to point to a zip archive. The archive is downloaded
    into memory, extracted into ``output_dir`` and the CSV files found directly
    inside ``output_dir`` are returned.

    Args:
        urls (list): A list of URLs to zip files.
        output_dir (str): The directory to save the extracted files.
        timeout (float or tuple): Timeout in seconds forwarded to
            ``requests.get``. Without it a stalled server would block the
            pipeline forever.
    """
    def __init__(self, urls, output_dir="dataset", timeout=60):
        self.urls = urls
        self.output_dir = output_dir
        self.timeout = timeout
        # Kept for backward compatibility: the directory exists as soon as the
        # transformer is constructed.
        os.makedirs(self.output_dir, exist_ok=True)

    def fit(self, X, y=None):
        """
        Fits the transformer. In this case, it's a no-op as there's nothing to fit.

        Args:
            X: Input data (ignored).
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.
        """
        return self

    def transform(self, X):
        """
        Downloads and extracts data from the provided URLs.

        Download or extraction failures are reported on stdout and the loop
        moves on to the next URL (best effort); they are not re-raised.

        Args:
            X: Input data (ignored).

        Returns:
            list: A list of paths to the CSV files found directly inside
            ``output_dir`` (sub-directories are not searched).
        """
        # The directory may have been removed, or output_dir changed through
        # set_params, after construction.
        os.makedirs(self.output_dir, exist_ok=True)

        for url in self.urls:
            try:
                print(f"Downloading {url}...")
                # No stream=True: the whole body is read through .content
                # anyway, and a non-streamed response releases its connection
                # as soon as the body has been consumed.
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status() # Raise an exception for bad status codes

                # Read the zip file from the response content
                with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
                    # Extract all contents to the specified output directory
                    zip_ref.extractall(self.output_dir)
                    print(f"Extracted files from {url} to {self.output_dir}")

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
            except zipfile.BadZipFile:
                print(f"Error: The downloaded file from {url} is not a valid zip file.")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

        print("Download and extraction complete.")
        csv_files = glob.glob(os.path.join(self.output_dir, "*.csv"))
        print("CSV files found:", csv_files)
        return csv_files

# Example usage with a scikit-learn pipeline: a single-step pipeline that only
# downloads and extracts the training archive.
from sklearn.pipeline import Pipeline

# List of URLs to your zipped files on AWS
urls = [
    "https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip",
]

# Create the custom transformer; this instance is reused by the pipelines
# built in the following cells.
downloader_extractor = DataDownloaderExtractor(urls=urls, output_dir="downloaded_data")

# You can add other steps to the pipeline if needed, e.g., a data loader
# For this example, we just have the download and extract step

pipeline = Pipeline([
    ('download_and_extract', downloader_extractor),
    # Add more steps here if needed, e.g., loading the CSV files into pandas DataFrames
])

# Run the pipeline
# The fit method is called first (though it does nothing in this transformer)
# Then the transform method is called to perform the download and extraction
# The result is the list of CSV paths returned by DataDownloaderExtractor.transform.
extracted_files = pipeline.fit_transform(None) # Pass None as input data, as it's not used

print("\nPipeline execution complete.")
print("Extracted files:", extracted_files)


Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']

Pipeline execution complete.
Extracted files: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']


## Estrazione dei dati

In [2]:
class CSVLoader(BaseEstimator, TransformerMixin):
    """
    Pipeline step that reads a features CSV and a target CSV into DataFrames.

    ``fit`` receives a list of file paths (typically produced by a previous
    step in the pipeline), selects the two whose base names equal
    ``x_filename`` and ``y_filename`` and reads them with ``pd.read_csv``.
    ``transform`` returns the two loaded DataFrames as a tuple.

    Args:
        x_filename (str): The base name of the file containing features (e.g., 'X_train.csv').
        y_filename (str): The base name of the file containing the target (e.g., 'y_train.csv').
    """
    def __init__(self, x_filename='X_train.csv', y_filename='y_train.csv'):
        self.x_filename = x_filename
        self.y_filename = y_filename
        # Populated by fit(); None means "nothing loaded yet".
        self.x_data = None
        self.y_data = None

    def fit(self, X, y=None):
        """
        Locate the two configured files in ``X`` and load them.

        Args:
            X: A list of file paths (expected to contain x_filename and y_filename).
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.

        Raises:
            FileNotFoundError: If either file name is absent from ``X``.
            Exception: Any error raised by ``pd.read_csv`` is reported on
                stdout and then re-raised unchanged.
        """
        x_file_path = None
        y_file_path = None

        # Compare base names only; if a name appears several times the last
        # matching path is kept.
        for candidate in X:
            base_name = os.path.basename(candidate)
            if base_name == self.x_filename:
                x_file_path = candidate
            elif base_name == self.y_filename:
                y_file_path = candidate

        if x_file_path is None:
            raise FileNotFoundError(f"Could not find {self.x_filename} in the provided file list.")
        if y_file_path is None:
            raise FileNotFoundError(f"Could not find {self.y_filename} in the provided file list.")

        try:
            print(f"Loading {x_file_path} into x_data...")
            self.x_data = pd.read_csv(x_file_path)
            print(f"Loading {y_file_path} into y_data...")
            self.y_data = pd.read_csv(y_file_path)
            print("Data loading complete.")
        except Exception as e:
            # Report according to the kind of failure, then propagate it.
            if isinstance(e, FileNotFoundError):
                print(f"Error loading file: {e}")
            elif isinstance(e, pd.errors.EmptyDataError):
                print(f"Error: One of the files ({self.x_filename} or {self.y_filename}) is empty.")
            else:
                print(f"An unexpected error occurred while loading data: {e}")
            raise

        return self

    def transform(self, X):
        """
        Return the DataFrames loaded by ``fit``.

        Args:
            X: Input data (ignored, data is loaded in fit).

        Returns:
            tuple: A tuple containing (x_data, y_data) as pandas DataFrames.

        Raises:
            RuntimeError: If ``fit`` has not loaded both DataFrames yet.
        """
        nothing_loaded = self.x_data is None or self.y_data is None
        if nothing_loaded:
            raise RuntimeError("Data has not been loaded yet. Call fit() first.")

        # A (features, target) tuple is what the next pipeline step receives.
        return (self.x_data, self.y_data)

# Add the CSVLoader to the pipeline
# NOTE: the downloader step is reused, so running this cell downloads and
# extracts the archive again before loading the CSV files.
pipeline_with_loading = Pipeline([
    ('download_and_extract', downloader_extractor),
    ('load_csv', CSVLoader(x_filename='X_train.csv', y_filename='y_train.csv'))
    # Add more steps here, e.g., preprocessing, model training
])

# Run the pipeline
# The output of the 'load_csv' step will be a tuple (x_data, y_data)
loaded_data = pipeline_with_loading.fit_transform(None)

# Access the loaded data
x_data, y_data = loaded_data

# Quick sanity check: shapes and first rows of both DataFrames.
print("\nLoaded x_data shape:", x_data.shape)
print("Loaded y_data shape:", y_data.shape)
print("\nFirst 5 rows of x_data:")
print(x_data.head())
print("\nFirst 5 rows of y_data:")
print(y_data.head())

Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']
Loading downloaded_data/X_train.csv into x_data...
Loading downloaded_data/y_train.csv into y_data...
Data loading complete.

Loaded x_data shape: (742625, 8)
Loaded y_data shape: (742625, 3)

First 5 rows of x_data:
   id  trq_measured       oat       mgt         pa       ias         np  \
0   0        54.100   2.00000  544.5000   212.1408  74.56250   89.18000   
1   1        49.625  24.22231  578.4844  1625.6400  30.35596   99.55273   
2   2        52.000   7.00000  566.1000  1912.9250  65.62500  100.14000   
3   3        62.400   7.25000  560.1000   277.0632  54.81250   90.64000   
4   4        62.900  23.25000  593.7000    53.6448  73.43750   99.91000   

     

## Creazione train-set e test-set

In [3]:
from sklearn.model_selection import train_test_split

class DataPreprocessor(BaseEstimator, TransformerMixin):
    """
    Pipeline step that builds a single table and splits it into train and test.

    Working from a tuple ``(x_data, y_data)`` of pandas DataFrames (typically
    the output of a previous step such as CSVLoader), ``fit``:
    - removes the 'id' column from x_data when present,
    - appends the 'faulty' column of y_data as an extra column (aligned on the index),
    - splits the resulting table with ``train_test_split``.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
    """
    def __init__(self, test_size=0.2, random_state=None):
        self.test_size = test_size
        self.random_state = random_state
        # Filled in by fit(); None means "not split yet".
        self.data_train = None
        self.data_test = None

    def fit(self, X, y=None):
        """
        Preprocess the input tables and store the train/test split.

        Args:
            X: A tuple (x_data, y_data) where x_data is the features DataFrame
               and y_data is the target DataFrame.
            y: Target data (ignored, as the target is expected in y_data).

        Returns:
            self: The fitted transformer instance.

        Raises:
            TypeError: If X is not a 2-tuple of pandas DataFrames.
            ValueError: If y_data has no 'faulty' column.
        """
        if not (isinstance(X, tuple) and len(X) == 2):
            raise TypeError("Input X must be a tuple (x_data, y_data).")

        features, targets = X

        both_frames = isinstance(features, pd.DataFrame) and isinstance(targets, pd.DataFrame)
        if not both_frames:
            raise TypeError("Both elements in the input tuple must be pandas DataFrames.")

        # Work on a new frame in both cases so the caller's x_data is untouched.
        if 'id' in features.columns:
            print("Dropping 'id' column from x_data...")
            features_clean = features.drop(columns='id')
        else:
            print("'id' column not found in x_data. Skipping drop.")
            features_clean = features.copy()

        if 'faulty' not in targets.columns:
            raise ValueError("'faulty' column not found in y_data.")

        print("Merging 'faulty' column from y_data into x_data...")
        # Side-by-side concatenation: rows are matched through the index.
        combined = pd.concat([features_clean, targets['faulty']], axis=1)

        print(f"Splitting data into train ({1-self.test_size:.0%}) and test ({self.test_size:.0%})...")
        train_part, test_part = train_test_split(
            combined,
            test_size=self.test_size,
            random_state=self.random_state
        )
        self.data_train = train_part
        self.data_test = test_part
        print("Data splitting complete.")

        return self

    def transform(self, X):
        """
        Return the split computed by ``fit``.

        Args:
            X: Input data (ignored, splitting is done in fit).

        Returns:
            tuple: A tuple containing (data_train, data_test) as pandas DataFrames.

        Raises:
            RuntimeError: If ``fit`` has not produced a split yet.
        """
        not_ready = self.data_train is None or self.data_test is None
        if not_ready:
            raise RuntimeError("Data has not been preprocessed or split yet. Call fit() first.")

        return (self.data_train, self.data_test)

# Extend the existing pipeline to include the DataPreprocessor
# NOTE: the downloader step is reused here as well, so the archive is
# downloaded and extracted again when this cell runs.
pipeline_with_preprocessing = Pipeline([
    ('download_and_extract', downloader_extractor),
    ('load_csv', CSVLoader(x_filename='X_train.csv', y_filename='y_train.csv')),
    ('preprocess_and_split', DataPreprocessor(test_size=0.2, random_state=42)) # Add the preprocessor
    # Add more steps here, e.g., feature scaling, model training
])

# Run the pipeline
# The output of the 'preprocess_and_split' step will be a tuple (data_train, data_test)
split_data = pipeline_with_preprocessing.fit_transform(None)

# Access the split data
data_train, data_test = split_data

# Report the size of each split.
print("\nProcessed and Split Data:")
print("data_train shape:", data_train.shape)
print("data_test shape:", data_test.shape)


Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']
Loading downloaded_data/X_train.csv into x_data...
Loading downloaded_data/y_train.csv into y_data...
Data loading complete.
Dropping 'id' column from x_data...
Merging 'faulty' column from y_data into x_data...
Splitting data into train (80%) and test (20%)...
Data splitting complete.

Processed and Split Data:
data_train shape: (594100, 8)
data_test shape: (148525, 8)


In [4]:
# Useful package
import requests
import zipfile
import io
import os

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns









def transform_with_custom_root(df, column_name, root_degree):
    """
    Add a column holding the ``root_degree``-th root of ``df[column_name]``.

    The result is written into ``df`` itself (the input DataFrame is modified
    in place and also returned) under the name
    ``f'{column_name}_root_{root_degree:.2f}_transformed'``.

    * Degrees divisible by 2: the plain root is taken; if the column contains
      negative values a warning is printed and the root of the absolute value
      is used instead.
    * Every other degree: a sign-preserving root, ``sign(x) * |x| ** (1/root_degree)``.

    Args:
      df (pd.DataFrame): The input DataFrame.
      column_name (str): The name of the column to transform.
      root_degree (float): The degree of the root (e.g., 2 for square root, 3 for cube root).

    Returns:
      pd.DataFrame: The same DataFrame object, with the transformed column added.

    Raises:
      ValueError: If ``root_degree`` is zero.
    """
    target_column = f'{column_name}_root_{root_degree:.2f}_transformed'

    if root_degree == 0:
        raise ValueError("Root degree cannot be zero.")

    is_even_degree = root_degree % 2 == 0
    exponent = 1/root_degree
    values = df[column_name]

    if not is_even_degree:
        # Sign-preserving root: valid for positive, negative and zero values.
        df[target_column] = np.sign(values) * np.power(np.abs(values), exponent)
        return df

    # Even root: only defined for non-negative inputs.
    if (values < 0).any():
        print(f"Warning: Column '{column_name}' contains negative values. Cannot apply even root directly.")
        # Fall back to the root of the absolute value.
        base = np.abs(values)
    else:
        base = values
    df[target_column] = np.power(base, exponent)

    return df

# Example usage with a custom root (e.g., 2.35)
# custom_root_degree = 2.35
# data_train = transform_with_custom_root(data_train.copy(), 'power_avail', custom_root_degree)


def create_binned_qualitative_variable(df, column_name, num_bins, strategy='quantile'):
    """
    Add an integer bin-index column derived from a numerical column.

    The new column is written into ``df`` itself (the input DataFrame is
    modified in place and also returned).

    Args:
      df (pd.DataFrame): The input DataFrame.
      column_name (str): The name of the numerical column to bin.
      num_bins (int): The desired number of bins.
      strategy (str): 'quantile' bins with ``pd.qcut`` (approximately equal
                      number of observations per bin); 'uniform' bins with
                      ``pd.cut`` (equal-width bins). Default is 'quantile'.

    Returns:
      pd.DataFrame: The same DataFrame object with a new column named
                    f'{column_name}_binned_{num_bins}_{strategy}' holding the
                    bin number of each row.

    Raises:
      ValueError: If the column is missing, ``num_bins`` is not greater than 1,
                  or ``strategy`` is neither 'quantile' nor 'uniform'.
    """
    if column_name not in df.columns:
        raise ValueError(f"La colonna '{column_name}' non è presente nel DataFrame.")
    if num_bins <= 1:
        raise ValueError("Il numero di bins deve essere maggiore di 1.")

    binned_column = f'{column_name}_binned_{num_bins}_{strategy}'
    source = df[column_name]

    if strategy == 'quantile':
        # duplicates='drop' merges bins whose quantile edges coincide, which
        # can happen with skewed or discrete data.
        bin_codes = pd.qcut(source, q=num_bins, labels=False, duplicates='drop')
    elif strategy == 'uniform':
        # Equal-width bins; include_lowest keeps the minimum in the first bin.
        bin_codes = pd.cut(source, bins=num_bins, labels=False, include_lowest=True)
    else:
        raise ValueError(f"Strategia di binning non valida: '{strategy}'. Scegliere tra 'quantile' o 'uniform'.")

    # labels=False yields the bin number rather than an interval label.
    df[binned_column] = bin_codes

    return df

# Example usage for 'indicated_air_speed':
# num_bins_indicated_air_speed = 5 # Define the number of bins
# binning_strategy = 'quantile' # Or 'uniform'

#data_train = create_binned_qualitative_variable(
#    data_train.copy(),
#    'indicated_air_speed',
#    num_bins_indicated_air_speed,
#    strategy=binning_strategy
#)




## PCA per compressor_speed e net_power
# Select the columns for PCA
# features_for_pca = data_train[['compressor_speed', 'net_power']]
# Initialize PCA with 1 component (to combine the two variables)
# pca = PCA(n_components=1)
# Fit PCA on the selected features and transform them
# data_train['compressor_speed_net_power_pca'] = pca.fit_transform(features_for_pca)



## Creazione di torque_times_temp

# data_train['torque_times_temp'] = data_train['torque_meas'] * data_train['outside_air_temp']










# Creazione pipeline
def prepare_data_pipeline(x_path, y_path, new_column_names=None,
                          root_transformations=None,
                          binning_config=None,
                          standardize=True,
                          drop_index_col='idx'):
    """
    Run the complete preprocessing pipeline.

    Steps, in order: load the data, optionally rename the columns, drop the
    index column, apply root transformations and binning, add a PCA feature
    and a product feature when their source columns exist, drop a fixed list
    of columns, move 'y_target' to the end, remove duplicated column names,
    replace NaN and infinite values with 0, and optionally standardize.

    NOTE(review): load_training_data, rename_dataframe_columns and
    standardize_columns are not defined in the visible part of this notebook;
    they must be defined or imported before this function is called. What they
    return is assumed here (a DataFrame) - confirm against their definitions.

    Args:
        x_path (str): path to the X_train.csv file
        y_path (str): path to the y_train.csv file
        new_column_names (list): list of new column names (optional);
            'y_target' is appended to it before renaming
        root_transformations (dict): dict {column: root degree}
        binning_config (dict): dict {column: (num_bins, strategy)}
        standardize (bool): whether to standardize the numerical columns
        drop_index_col (str): name of the column to drop (optional)

    Returns:
        pd.DataFrame: preprocessed DataFrame ready for training
    """
    df = load_training_data(x_path, y_path)

    if new_column_names:
        df = rename_dataframe_columns(df, new_column_names + ['y_target'])

    if drop_index_col in df.columns:
        df = df.drop(drop_index_col, axis=1)

    # Custom root transformations
    if root_transformations:
        for col, deg in root_transformations.items():
            df = transform_with_custom_root(df, col, deg)

    # Binning
    if binning_config:
        for col, (n_bins, strategy) in binning_config.items():
            df = create_binned_qualitative_variable(df, col, n_bins, strategy)

    # PCA: hard-coded example, it can be parameterized if needed
    if {'compressor_speed', 'net_power'}.issubset(df.columns):
        pca = PCA(n_components=1)
        df['compressor_speed_net_power_pca'] = pca.fit_transform(df[['compressor_speed', 'net_power']])

    # Manual feature engineering
    if {'torque_meas', 'outside_air_temp'}.issubset(df.columns):
        df['torque_times_temp'] = df['torque_meas'] * df['outside_air_temp']

    # Remove the columns that are no longer needed (missing ones are ignored)
    columns_to_drop = ['compressor_speed','net_power','indicated_air_speed','power_avail']  # Add other columns to remove here
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Make sure 'y_target' is the last column
    if 'y_target' in df.columns:
        cols = [col for col in df.columns if col != 'y_target'] + ['y_target']
        df = df[cols]
    else:
        print("Warning: 'y_target' column not found in DataFrame. It will not be moved to the end.")

    # Make sure the DataFrame has no duplicated columns (the first occurrence is kept)
    df = df.loc[:, ~df.columns.duplicated()]

    # Make sure the DataFrame has no NaN values
    if df.isnull().values.any():
        print("Warning: DataFrame contains NaN values. They will be filled with 0.")
        df = df.fillna(0)

    # Make sure the DataFrame has no infinite values
    # NOTE(review): np.isinf on df.values presumably requires every column to
    # be numeric - confirm the loaded data has no object columns.
    if np.isinf(df.values).any():
        print("Warning: DataFrame contains infinite values. They will be replaced with 0.")
        df.replace([np.inf, -np.inf], 0, inplace=True)

    # Standardization: every float64/int64 column except the target
    if standardize:
        target = 'y_target'
        numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
        columns_to_standardize = [col for col in numerical_cols if col != target]
        df = standardize_columns(df, columns_to_standardize)

    return df



### Example of how to use the pipeline
# NOTE(review): these paths differ from the 'downloaded_data/X_train.csv' and
# 'downloaded_data/y_train.csv' files produced by the download step earlier in
# this notebook - confirm which files are meant.
x_path = 'dataset/X_data.csv'
y_path = 'dataset/y_data.csv'

# Optional configuration
new_column_names = ['idx', 'torque_meas', 'outside_air_temp', 'mean_gas_temp',
                    'power_avail', 'indicated_air_speed', 'net_power', 'compressor_speed']

root_transform = {'power_avail': 2.35}
binning = {'indicated_air_speed': (5, 'quantile')}

data_ready = prepare_data_pipeline(
    x_path, y_path,
    new_column_names=new_column_names,
    root_transformations=root_transform,
    binning_config=binning
)

# Print a preview and summary statistics of the preprocessed DataFrame
print(data_ready.head())
print(data_ready.describe())

# One histogram per column, laid out on a grid with n_cols columns;
# n_rows is the ceiling of (number of columns / n_cols).
n_cols = 3
n_rows = (len(data_ready.columns) + n_cols - 1) // n_cols
plt.figure(figsize=(15, n_rows * 4))
for i, col in enumerate(data_ready):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(data_ready[col], bins=50, kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig("grafico.png")

# data_ready is now ready to be used in a model

NameError: name 'load_training_data' is not defined

## Radice custom

### Trasformazione in work_with_data

In [14]:
def transform_with_custom_root(df, column_name, root_degree):
    """
    Add a column holding the ``root_degree``-th root of ``df[column_name]``.

    The result is written into ``df`` itself (the input DataFrame is modified
    in place and also returned) under the name
    ``f'{column_name}_root_{root_degree:.2f}_transformed'``.

    * Degrees divisible by 2: the plain root is taken; if the column contains
      negative values a warning is printed and the root of the absolute value
      is used instead.
    * Every other degree: a sign-preserving root, ``sign(x) * |x| ** (1/root_degree)``.

    Args:
      df (pd.DataFrame): The input DataFrame.
      column_name (str): The name of the column to transform.
      root_degree (float): The degree of the root (e.g., 2 for square root, 3 for cube root).

    Returns:
      pd.DataFrame: The same DataFrame object, with the transformed column added.

    Raises:
      ValueError: If ``root_degree`` is zero.
    """
    target_column = f'{column_name}_root_{root_degree:.2f}_transformed'

    if root_degree == 0:
        raise ValueError("Root degree cannot be zero.")

    is_even_degree = root_degree % 2 == 0
    exponent = 1/root_degree
    values = df[column_name]

    if not is_even_degree:
        # Sign-preserving root: valid for positive, negative and zero values.
        df[target_column] = np.sign(values) * np.power(np.abs(values), exponent)
        return df

    # Even root: only defined for non-negative inputs.
    if (values < 0).any():
        print(f"Warning: Column '{column_name}' contains negative values. Cannot apply even root directly.")
        # Fall back to the root of the absolute value.
        base = np.abs(values)
    else:
        base = values
    df[target_column] = np.power(base, exponent)

    return df

### Trasformazione pipeline

In [8]:
# prompt: crea una classe che sia compatibile con l'oggetto pipeline di scikit che contenga la funzione transform_with_custom_root

class CustomRootTransformer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn pipeline wrapper around ``transform_with_custom_root``.

    Args:
        root_transformations (dict): A dictionary where keys are column names
                                     and values are the root degrees to apply.
                                     ``None`` or an empty dict means no
                                     transformation is applied.
    """
    def __init__(self, root_transformations=None):
        self.root_transformations = root_transformations

    def fit(self, X, y=None):
        """
        No-op: this transformer has nothing to learn from the data.

        Args:
            X: Input data (ignored).
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.
        """
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with one root-transformed column added per
        configured column. Configured columns that are missing from ``X`` are
        skipped with a printed warning.

        Args:
            X (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with the transformed columns.

        Raises:
            TypeError: If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # The helper writes into the frame it receives, so work on a copy to
        # keep the caller's DataFrame untouched.
        result = X.copy()

        if not self.root_transformations:
            return result

        for col, deg in self.root_transformations.items():
            if col in result.columns:
                print(f"Applying custom root {deg} transformation to column '{col}'...")
                result = transform_with_custom_root(result, col, deg)
            else:
                print(f"Warning: Column '{col}' not found in DataFrame. Skipping root transformation.")

        return result



## Binning transformation

### Trasformazione work_with_data

In [9]:
def create_binned_qualitative_variable(df, column_name, num_bins, strategy='quantile'):
    """
    Add an integer bin-index column derived from a numerical column.

    The new column is written into ``df`` itself (the input DataFrame is
    modified in place and also returned).

    Args:
      df (pd.DataFrame): The input DataFrame.
      column_name (str): The name of the numerical column to bin.
      num_bins (int): The desired number of bins.
      strategy (str): 'quantile' bins with ``pd.qcut`` (approximately equal
                      number of observations per bin); 'uniform' bins with
                      ``pd.cut`` (equal-width bins). Default is 'quantile'.

    Returns:
      pd.DataFrame: The same DataFrame object with a new column named
                    f'{column_name}_binned_{num_bins}_{strategy}' holding the
                    bin number of each row.

    Raises:
      ValueError: If the column is missing, ``num_bins`` is not greater than 1,
                  or ``strategy`` is neither 'quantile' nor 'uniform'.
    """
    if column_name not in df.columns:
        raise ValueError(f"La colonna '{column_name}' non è presente nel DataFrame.")
    if num_bins <= 1:
        raise ValueError("Il numero di bins deve essere maggiore di 1.")

    binned_column = f'{column_name}_binned_{num_bins}_{strategy}'
    source = df[column_name]

    if strategy == 'quantile':
        # duplicates='drop' merges bins whose quantile edges coincide, which
        # can happen with skewed or discrete data.
        bin_codes = pd.qcut(source, q=num_bins, labels=False, duplicates='drop')
    elif strategy == 'uniform':
        # Equal-width bins; include_lowest keeps the minimum in the first bin.
        bin_codes = pd.cut(source, bins=num_bins, labels=False, include_lowest=True)
    else:
        raise ValueError(f"Strategia di binning non valida: '{strategy}'. Scegliere tra 'quantile' o 'uniform'.")

    # labels=False yields the bin number rather than an interval label.
    df[binned_column] = bin_codes

    return df

### Trasformazione pipeline

In [10]:
# prompt: crea una classe che sia compatibile con l'oggetto pipeline di scikit che contenga la funzione  create_binned_qualitative_variable

class BinnedQualitativeTransformer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn pipeline wrapper around ``create_binned_qualitative_variable``.

    ``fit`` learns nothing: the binning is computed inside ``transform`` from
    whatever DataFrame is passed to it.

    Args:
        binning_config (dict): A dictionary where keys are column names
                               and values are tuples (num_bins, strategy).
                               Strategy can be 'quantile' or 'uniform'.
                               ``None`` or an empty dict means no binning.
    """
    def __init__(self, binning_config=None):
        self.binning_config = binning_config

    def fit(self, X, y=None):
        """
        No-op: this transformer has nothing to learn from the data.

        Args:
            X: Input data (ignored).
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.
        """
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with one binned column added per configured
        column. Configured columns that are missing from ``X`` are skipped
        with a printed warning.

        Args:
            X (pd.DataFrame): The input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with the new binned qualitative columns.

        Raises:
            TypeError: If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # The helper writes into the frame it receives, so work on a copy to
        # keep the caller's DataFrame untouched.
        result = X.copy()

        if not self.binning_config:
            return result

        for col, (num_bins, strategy) in self.binning_config.items():
            if col in result.columns:
                print(f"Applying binning transformation to column '{col}' with {num_bins} bins and strategy '{strategy}'...")
                result = create_binned_qualitative_variable(result, col, num_bins, strategy)
            else:
                print(f"Warning: Column '{col}' not found in DataFrame. Skipping binning transformation.")

        return result


## Compressor speed net power

### Trasformazione con work_with_data

In [None]:
# Select the columns for PCA
# NOTE(review): assumes data_train already has columns named 'compressor_speed'
# and 'net_power' (i.e. the columns were renamed beforehand) - TODO confirm.
features_for_pca = data_train[['compressor_speed', 'net_power']]

# Initialize PCA with 1 component (to combine the two variables)
pca = PCA(n_components=1)

# Fit PCA on the selected features and transform them; the single resulting
# component is stored as a new column of data_train.
data_train['compressor_speed_net_power_pca'] = pca.fit_transform(features_for_pca)

### Trasformazione pipeline

In [13]:
# prompt: crea una classe che sia compatibile con l'oggetto pipeline di scikit che  come nella cella sopra a partire dalle feature 'compressor_speed' e net_power' tramite il metodo PCA crei una nuova variabile 'compressor_speed_net_power_pca'

class PCATransformer(BaseEstimator, TransformerMixin):
    """
    A custom transformer to apply PCA on specified features.
    Compatible with scikit-learn pipelines.

    The PCA model is created inside ``fit`` from the current value of
    ``n_components``, so changes made through ``set_params`` (for example by a
    grid search, which clones the estimator and then sets parameters) are
    honoured by the model that is actually fitted.

    Args:
        features (list): A list of column names to apply PCA on.
        n_components (int or float or 'mle'): The number of components to keep.
                                              Refer to sklearn.decomposition.PCA documentation.
        new_column_name (str): The name for the new PCA component column.

    Attributes:
        pca_: The fitted ``PCA`` model; available only after ``fit``.
    """
    def __init__(self, features, n_components=1, new_column_name='pca_component'):
        if not isinstance(features, list) or len(features) < 2:
            raise ValueError("Features must be a list of at least two column names.")
        self.features = features
        self.n_components = n_components
        self.new_column_name = new_column_name

    def _validate_input(self, X):
        """Check that X is a DataFrame containing every configured feature."""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # Report the missing columns in the order they were configured.
        missing_features = [name for name in self.features if name not in X.columns]
        if missing_features:
            raise ValueError(f"Missing features in DataFrame: {missing_features}")

    def fit(self, X, y=None):
        """
        Fits the PCA model on the specified features of the input data.

        Args:
            X (pd.DataFrame): The input DataFrame containing the features.
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.

        Raises:
            TypeError: If X is not a pandas DataFrame.
            ValueError: If any configured feature is missing from X.
        """
        self._validate_input(X)

        print(f"Fitting PCA on features: {self.features} with {self.n_components} components...")
        # Build the model here rather than in __init__ so that it always
        # reflects the current n_components.
        self.pca_ = PCA(n_components=self.n_components)
        self.pca_.fit(X[self.features])
        print("PCA fitting complete.")

        return self

    def transform(self, X):
        """
        Applies the fitted PCA transformation to the specified features and adds
        the new component(s) to the DataFrame. The original feature columns
        are removed from the result.

        Args:
            X (pd.DataFrame): The input DataFrame containing the features.

        Returns:
            pd.DataFrame: The DataFrame with the new PCA component column(s).

        Raises:
            TypeError: If X is not a pandas DataFrame.
            ValueError: If any configured feature is missing from X.
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        # Imported locally so the class does not depend on an extra
        # module-level import.
        from sklearn.utils.validation import check_is_fitted

        self._validate_input(X)
        check_is_fitted(self, 'pca_')

        print(f"Transforming data using fitted PCA on features: {self.features}...")
        pca_components = self.pca_.transform(X[self.features])

        if self.n_components == 1:
            component_names = [self.new_column_name]
        else:
            # Several components: suffix the base name with a 1-based index.
            component_names = [f'{self.new_column_name}_{i+1}' for i in range(pca_components.shape[1])]
        # Reuse X's index so the concatenation below aligns row by row.
        pca_df = pd.DataFrame(pca_components, index=X.index, columns=component_names)

        # The source features are dropped because the components replace them.
        # To keep the original features, remove the .drop(...) call.
        df_transformed = pd.concat([X.drop(self.features, axis=1), pca_df], axis=1)
        print("PCA transformation complete.")

        return df_transformed
