<a href="https://colab.research.google.com/github/TizianoCosta/AML_2425_FinalProject/blob/main/notebook/regressione_logistica_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook per regressione logistica

Di seguito:
- verrà implementato l'algoritmo di regressione logistica

## Scaricamento dei dati

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import requests
import zipfile
import io
import os
import glob
import pandas as pd

class DataDownloaderExtractor(BaseEstimator, TransformerMixin):
    """
    A custom transformer to download and extract data from given URLs.

    The constructor only stores its parameters (as scikit-learn expects from
    estimators, so that cloning and ``set_params`` behave correctly); the
    output directory is created when ``transform`` runs.

    Args:
        urls (list): A list of URLs to zip files.
        output_dir (str): The directory to save the extracted files.
        timeout (float or tuple): Timeout in seconds forwarded to
            ``requests.get`` so that a stalled server cannot block the
            pipeline forever.
    """
    def __init__(self, urls, output_dir="dataset", timeout=60):
        self.urls = urls
        self.output_dir = output_dir
        self.timeout = timeout

    def fit(self, X, y=None):
        """
        Fits the transformer. In this case, it's a no-op as there's nothing to fit.

        Args:
            X: Input data (ignored).
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.
        """
        return self

    def transform(self, X):
        """
        Downloads and extracts data from the provided URLs.

        Download and extraction are best-effort: a failure on one URL is
        reported on stdout and the remaining URLs are still processed.

        Args:
            X: Input data (ignored).

        Returns:
            list: A sorted list of paths to the CSV files found directly
            inside ``output_dir`` after extraction.
        """
        # Created here rather than in __init__ so that the directory always
        # matches the current value of ``output_dir``.
        os.makedirs(self.output_dir, exist_ok=True)

        for url in self.urls:
            try:
                print(f"Downloading {url}...")
                # The whole archive is buffered in memory below, so streaming
                # would bring nothing; the timeout guards against a hung server.
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status() # Raise an exception for bad status codes

                # Read the zip file from the response content
                with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
                    # Extract all contents to the specified output directory
                    zip_ref.extractall(self.output_dir)
                    print(f"Extracted files from {url} to {self.output_dir}")

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
            except zipfile.BadZipFile:
                print(f"Error: The downloaded file from {url} is not a valid zip file.")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

        print("Download and extraction complete.")
        # glob returns entries in arbitrary (filesystem) order; sort them so
        # the result is reproducible across runs and platforms.
        csv_files = sorted(glob.glob(os.path.join(self.output_dir, "*.csv")))
        print("CSV files found:", csv_files)
        return csv_files

# Example usage with a scikit-learn pipeline
from sklearn.pipeline import Pipeline

# Zip archives that make up the dataset (hosted on S3).
urls = ["https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip"]

# Transformer instance that fetches the archives into "downloaded_data".
downloader_extractor = DataDownloaderExtractor(
    urls=urls,
    output_dir="downloaded_data",
)

# Single-step pipeline for now; further steps (e.g. loading the CSV files
# into pandas DataFrames) can be appended to the list.
pipeline = Pipeline(steps=[("download_and_extract", downloader_extractor)])

# fit() does nothing for this transformer; transform() performs the download
# and extraction. The input is unused, hence None.
extracted_files = pipeline.fit_transform(None)

print("\nPipeline execution complete.")
print("Extracted files:", extracted_files)


Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']

Pipeline execution complete.
Extracted files: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']


## Estrazione dei dati

In [None]:
class CSVLoader(BaseEstimator, TransformerMixin):
    """
    Transformer that reads a feature CSV and a target CSV into pandas DataFrames.

    ``fit`` receives a list of file paths (typically the output of a previous
    pipeline step), selects the entries whose base names equal ``x_filename``
    and ``y_filename`` and loads them. ``transform`` returns the loaded frames
    as a tuple.

    Args:
        x_filename (str): Base name of the features file (e.g. 'X_train.csv').
        y_filename (str): Base name of the target file (e.g. 'y_train.csv').
    """
    def __init__(self, x_filename='X_train.csv', y_filename='y_train.csv'):
        self.x_filename, self.y_filename = x_filename, y_filename
        # Populated by fit().
        self.x_data = self.y_data = None

    def fit(self, X, y=None):
        """
        Locate the two CSV files in ``X`` and load them.

        Args:
            X: A list of file paths expected to contain x_filename and y_filename.
            y: Target data (ignored).

        Returns:
            self: The fitted transformer instance.

        Raises:
            FileNotFoundError: If either file name is absent from ``X``.
            Exception: Any error raised while reading the files is reported
                on stdout and re-raised unchanged.
        """
        named_paths = [(os.path.basename(path), path) for path in X]
        feature_hits = [path for name, path in named_paths if name == self.x_filename]
        # A path already claimed as the features file never counts as the target.
        target_hits = [
            path
            for name, path in named_paths
            if name != self.x_filename and name == self.y_filename
        ]

        if not feature_hits:
            raise FileNotFoundError(f"Could not find {self.x_filename} in the provided file list.")
        if not target_hits:
            raise FileNotFoundError(f"Could not find {self.y_filename} in the provided file list.")

        # When a name occurs several times, the last occurrence wins.
        features_path, target_path = feature_hits[-1], target_hits[-1]

        try:
            print(f"Loading {features_path} into x_data...")
            self.x_data = pd.read_csv(features_path)
            print(f"Loading {target_path} into y_data...")
            self.y_data = pd.read_csv(target_path)
            print("Data loading complete.")
        except Exception as err:
            # Report the failure, then let the caller see the original exception.
            if isinstance(err, FileNotFoundError):
                print(f"Error loading file: {err}")
            elif isinstance(err, pd.errors.EmptyDataError):
                print(f"Error: One of the files ({self.x_filename} or {self.y_filename}) is empty.")
            else:
                print(f"An unexpected error occurred while loading data: {err}")
            raise

        return self

    def transform(self, X):
        """
        Return the frames loaded by ``fit``.

        Args:
            X: Input data (ignored, data is loaded in fit).

        Returns:
            tuple: ``(x_data, y_data)`` as pandas DataFrames, so that a
            following step can receive features and target together.

        Raises:
            RuntimeError: If ``fit`` has not loaded the data yet.
        """
        if any(frame is None for frame in (self.x_data, self.y_data)):
            raise RuntimeError("Data has not been loaded yet. Call fit() first.")

        return self.x_data, self.y_data

# Two-step pipeline: download/extract the archive, then read the two CSV files.
# Further steps (preprocessing, model training, ...) can be appended.
pipeline_with_loading = Pipeline(steps=[
    ('download_and_extract', downloader_extractor),
    ('load_csv', CSVLoader(x_filename='X_train.csv', y_filename='y_train.csv')),
])

# The last step ('load_csv') yields a tuple (x_data, y_data).
loaded_data = pipeline_with_loading.fit_transform(None)
x_data, y_data = loaded_data

# Quick look at what was loaded.
print("\nLoaded x_data shape:", x_data.shape)
print("Loaded y_data shape:", y_data.shape)
print("\nFirst 5 rows of x_data:", x_data.head(), sep="\n")
print("\nFirst 5 rows of y_data:", y_data.head(), sep="\n")

Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']
Loading downloaded_data/X_train.csv into x_data...
Loading downloaded_data/y_train.csv into y_data...
Data loading complete.

Loaded x_data shape: (742625, 8)
Loaded y_data shape: (742625, 3)

First 5 rows of x_data:
   id  trq_measured       oat       mgt         pa       ias         np  \
0   0        54.100   2.00000  544.5000   212.1408  74.56250   89.18000   
1   1        49.625  24.22231  578.4844  1625.6400  30.35596   99.55273   
2   2        52.000   7.00000  566.1000  1912.9250  65.62500  100.14000   
3   3        62.400   7.25000  560.1000   277.0632  54.81250   90.64000   
4   4        62.900  23.25000  593.7000    53.6448  73.43750   99.91000   

     

## Creazione train-set e test-set

In [None]:
from sklearn.model_selection import train_test_split

class DataPreprocessor(BaseEstimator, TransformerMixin):
    """
    A custom transformer to preprocess the data:
    - Drop the 'id' column from x_data.
    - Merge the 'faulty' column from y_data into x_data.
    - Split the merged data into training and testing sets.

    Assumes the input is a tuple (x_data, y_data) as pandas DataFrames,
    typically from a previous pipeline step like CSVLoader.

    Args:
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.
        stratify (bool): When True, the split is stratified on the 'faulty'
            column so both sets keep the same class proportions. Defaults to
            False (plain random split).
    """
    def __init__(self, test_size=0.2, random_state=None, stratify=False):
        self.test_size = test_size
        self.random_state = random_state
        self.stratify = stratify
        # Populated by fit().
        self.data_train = None
        self.data_test = None

    def fit(self, X, y=None):
        """
        Fits the transformer by preprocessing and splitting the data.

        Args:
            X: A tuple (x_data, y_data) where x_data is the features DataFrame
               and y_data is the target DataFrame.
            y: Target data (ignored, as the target is expected in y_data).

        Returns:
            self: The fitted transformer instance.

        Raises:
            TypeError: If X is not a 2-tuple of pandas DataFrames.
            ValueError: If y_data has no 'faulty' column, or if x_data and
                y_data do not describe the same rows.
        """
        if not isinstance(X, tuple) or len(X) != 2:
            raise TypeError("Input X must be a tuple (x_data, y_data).")

        x_data, y_data = X

        if not isinstance(x_data, pd.DataFrame) or not isinstance(y_data, pd.DataFrame):
            raise TypeError("Both elements in the input tuple must be pandas DataFrames.")

        # Drop the 'id' column from x_data if it exists
        if 'id' in x_data.columns:
            print("Dropping 'id' column from x_data...")
            x_data_processed = x_data.drop('id', axis=1)
        else:
            print("'id' column not found in x_data. Skipping drop.")
            x_data_processed = x_data.copy()

        if 'faulty' not in y_data.columns:
            raise ValueError("'faulty' column not found in y_data.")

        print("Merging 'faulty' column from y_data into x_data...")
        # concat(axis=1) aligns the two frames on their index (outer join).
        merged_data = pd.concat([x_data_processed, y_data['faulty']], axis=1)

        # If the indexes do not cover the same rows, the outer join pads the
        # result with NaN instead of failing; detect that explicitly.
        if not (len(merged_data) == len(x_data) == len(y_data)):
            raise ValueError(
                "x_data and y_data do not describe the same rows: "
                f"{len(x_data)} rows in x_data, {len(y_data)} rows in y_data, "
                f"{len(merged_data)} rows after aligning on the index."
            )

        # When both frames carry an 'id', make sure each label is attached to
        # the sample it belongs to rather than to whatever shares its index.
        if 'id' in x_data.columns and 'id' in y_data.columns:
            if not x_data['id'].eq(y_data['id']).all():
                raise ValueError(
                    "The 'id' columns of x_data and y_data do not match row by row; "
                    "align the two frames on 'id' before preprocessing."
                )

        print(f"Splitting data into train ({1-self.test_size:.0%}) and test ({self.test_size:.0%})...")
        # Split the merged data into training and testing sets
        stratify_labels = merged_data['faulty'] if self.stratify else None
        self.data_train, self.data_test = train_test_split(
            merged_data,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=stratify_labels,
        )
        print("Data splitting complete.")

        return self

    def transform(self, X):
        """
        Returns the split training and testing data.

        Args:
            X: Input data (ignored, splitting is done in fit).

        Returns:
            tuple: A tuple containing (data_train, data_test) as pandas DataFrames.

        Raises:
            RuntimeError: If fit() has not been called yet.
        """
        if self.data_train is None or self.data_test is None:
            raise RuntimeError("Data has not been preprocessed or split yet. Call fit() first.")

        # Return the split data
        return (self.data_train, self.data_test)

# Three-step pipeline: download/extract, load the CSV files, then preprocess
# and split. Further steps (feature scaling, model training, ...) can follow.
pipeline_with_preprocessing = Pipeline(steps=[
    ('download_and_extract', downloader_extractor),
    ('load_csv', CSVLoader(x_filename='X_train.csv', y_filename='y_train.csv')),
    ('preprocess_and_split', DataPreprocessor(test_size=0.2, random_state=42)),
])

# The last step ('preprocess_and_split') yields a tuple (data_train, data_test).
split_data = pipeline_with_preprocessing.fit_transform(None)
data_train, data_test = split_data

# Report the size of each partition.
print("\nProcessed and Split Data:")
print("data_train shape:", data_train.shape)
print("data_test shape:", data_test.shape)


Downloading https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip...
Extracted files from https://phm-datasets.s3.amazonaws.com/Data_Challenge_PHM2024_training_data.zip to downloaded_data
Download and extraction complete.
CSV files found: ['downloaded_data/X_train.csv', 'downloaded_data/y_train.csv']
Loading downloaded_data/X_train.csv into x_data...
Loading downloaded_data/y_train.csv into y_data...
Data loading complete.
Dropping 'id' column from x_data...
Merging 'faulty' column from y_data into x_data...
Splitting data into train (80%) and test (20%)...
Data splitting complete.

Processed and Split Data:
data_train shape: (594100, 8)
data_test shape: (148525, 8)
