# flow = True 


In [7]:
# Importing necessary modules
# - pandas: used for DataFrame operations.
# - dill: used for saving and loading Python objects, including pipelines, to and from files.
from ScoringPy import Processing
import pandas as pd
import dill

# Sample dataset describing four individuals.
# Two of the 'Age' entries are deliberately missing (None) so that a later
# step has something to fill in.
data = {
    # Names of the individuals
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    # Ages; None marks a missing value
    'Age': [10, None, 20, None],
    # City of residence
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
}

# Tabular view of the dictionary: one column per key, one row per individual.
df = pd.DataFrame(data=data)

# Create the processing pipeline from the imported Processing class.
# With flow=True the output of each step is passed as the input of the next step,
# so the transformations are applied sequentially without manual wiring.
# Constructing the pipeline does not execute anything: steps are registered first
# and only run when pipeline.run() is called.
pipeline = Processing(flow=True)

# Step 1 of the pipeline: fill the missing 'Age' values with the mean of the
# non-missing 'Age' values.
# Args:
# - data (DataFrame): The input DataFrame whose 'Age' column may contain missing values.
# Returns:
# - DataFrame: A new DataFrame with the missing 'Age' values replaced by the column mean.
#   The DataFrame passed in is left unmodified.
def step_1(data):
    # assign() returns a new frame instead of writing into the caller's object,
    # so whatever still references the input keeps seeing the original values.
    return data.assign(Age=data['Age'].fillna(data['Age'].mean()))

# Step 2 of the pipeline: multiply the 'Age' column by 2.
# Args:
# - data (DataFrame): The input DataFrame (the output of step 1 when run in the pipeline).
# Returns:
# - DataFrame: A new DataFrame whose 'Age' values are doubled.
#   The DataFrame passed in is left unmodified.
def step_2(data):
    # Build a new frame rather than overwriting the column on the input, so the
    # previous step's result is not changed behind the caller's back.
    return data.assign(Age=data['Age'] * 2)

# Step 3 of the pipeline: divide the 'Age' column by 5.
# Args:
# - data (DataFrame): The input DataFrame (the output of step 2 when run in the pipeline).
# Returns:
# - DataFrame: A new DataFrame whose 'Age' values are divided by 5.
#   The DataFrame passed in is left unmodified.
def step_3(data):
    # Build a new frame rather than overwriting the column on the input, so the
    # previous step's result is not changed behind the caller's back.
    return data.assign(Age=data['Age'] / 5)

# Register step_1 (fill missing ages with the mean).
# It is added first, so it is the first transformation when the pipeline is run.
pipeline.add_step(step_1)

# Register step_2 (double 'Age').
# It runs after step_1 when the pipeline is run.
pipeline.add_step(step_2)

# Register step_3 (divide 'Age' by 5).
# It runs after step_2 when the pipeline is run.
pipeline.add_step(step_3)

# Save the configured pipeline (with its three steps) to 'Pipeline.pkl' so it can be
# reused without being reconfigured.
# dill is used because it can serialize complex Python objects such as classes and functions.
# The dump happens before run(), so the file captures the pipeline as configured,
# before it has processed any data.
with open('Pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)

# Run the pipeline on the initial data (the 'df' DataFrame created earlier).
# Because the pipeline was created with flow=True, the steps execute in sequence:
# 1. Fill missing 'Age' values with the mean.
# 2. Multiply 'Age' by 2.
# 3. Divide 'Age' by 5.
# The result replaces 'df'.
df = pipeline.run(initial_data=df)

# The final DataFrame 'df' reflects these transformations:
# - Missing values in 'Age' are replaced by the mean of the 'Age' column.
# - The 'Age' values are multiplied by 2, and then divided by 5.
# For the sample data (ages 10, missing, 20, missing) the expected 'Age' column is
# 4.0, 6.0, 8.0, 6.0, assuming the steps run in the order they were added.


In [8]:
# Step 2: Load the saved pipeline.
# 'Pipeline.pkl' holds the serialized pipeline that was configured and saved earlier;
# dill.load() turns it back into a pipeline object.
# NOTE: like pickle, dill can execute arbitrary code while loading. Only load files
# that you created yourself or otherwise trust.

with open('Pipeline.pkl', 'rb') as file:
    pipeline = dill.load(file)  # Restore the previously saved pipeline object.

# The 'pipeline' name now refers to the restored object, which is expected to carry the
# steps that were registered before it was saved (step_1, step_2, and step_3).

# Run the restored pipeline on 'df'.
# If 'df' is the output of the previous cell, it has already been through these steps once,
# so this applies them a second time: the mean-fill in step_1 finds no missing ages and
# leaves the values as they are, and 'Age' is again multiplied by 2 and divided by 5.

df = pipeline.run(initial_data=df)  # Run the pipeline on the current DataFrame.

# Clear the pipeline.
# The call prints the message shown in the cell output below
# ("Pipeline data cleared to free up memory.").
# NOTE(review): the original comments describe clear() as removing all added steps;
# confirm against the ScoringPy documentation exactly what is reset.
pipeline.clear()

# After clear(), treat the 'pipeline' object as reset: add steps again before running it.


Pipeline data cleared to free up memory.


# flow = False

In [9]:
# Importing necessary modules
# - pandas: for working with DataFrame operations and reading Excel files.
# - dill: for saving and loading Python objects, including the pipeline, to and from files.
from ScoringPy import Processing
import pandas as pd
import dill

# Path of the first Excel file; it is handed to step_1 when that step is registered.
row_path = 'Data/step1.xlsx'

# Create the processing pipeline; nothing is executed at construction time.
# With flow=False the steps do not automatically pass data to one another.
# Data hand-off is controlled manually, e.g. per step when the step is added.
pipeline = Processing(flow=False)

# Step 1: Read the first dataset from an Excel file.
# Args:
# - path (str): Location of the Excel file to read. Defaults to None; a real path
#   is supplied when the step is registered.
# Returns:
# - DataFrame: The contents of the Excel file as read by pandas.
def step_1(path=None):
    return pd.read_excel(path)

# Step 2: Read the second dataset from 'Data/step2.xlsx'.
# Takes no arguments because the file path is fixed inside the function.
# Returns:
# - DataFrame: The contents of 'step2.xlsx' as read by pandas.
def step_2():
    return pd.read_excel('Data/step2.xlsx')

# Step 3: Combine the results of step 1 and step 2, then double the ages.
# Takes no arguments: both inputs are looked up in the pipeline context, where the
# results of earlier steps are available under the keys 'step_1' and 'step_2'.
# Returns:
# - DataFrame: The two DataFrames stacked into one (with a fresh index) and the
#   'Age' values multiplied by 2.
def step_3():
    # Fetch the stored results of the two loading steps, in order.
    frames = [pipeline.context.get(key) for key in ('step_1', 'step_2')]

    # Stack them into a single table; ignore_index=True renumbers the rows.
    combined = pd.concat(frames, ignore_index=True)

    # Double the 'Age' values.
    combined['Age'] = combined['Age'].mul(2)
    return combined

# Step 4: Divide the 'Age' column by 5.
# Args:
# - data (DataFrame): The input DataFrame (the output of step 3 when run in the pipeline).
# Returns:
# - DataFrame: A new DataFrame whose 'Age' values are divided by 5.
#   The DataFrame passed in is left unmodified.
def step_4(data):
    # Build a new frame rather than overwriting the column on the input, so the
    # result of the previous step is not altered by this step.
    return data.assign(Age=data['Age'] / 5)


# Add the steps to the pipeline.
# Steps are added in the order they should run; the function is the first argument.
# Arguments that a step function needs (e.g. the path for step 1) are supplied
# after the function when the step is added.

# Step 1: read the Excel file located at 'row_path'.
pipeline.add_step(step_1, row_path)

# Step 2: read the 'Data/step2.xlsx' file.
pipeline.add_step(step_2)

# Step 3: concatenate the data from steps 1 and 2 and double 'Age'.
# The pipeline was created with flow=False, so flow=True is requested per step here;
# the intent is that the output of step 3 is passed on to step 4 automatically.
pipeline.add_step(step_3, flow=True)

# Step 4: divide 'Age' by 5, also added with flow=True.
# NOTE(review): step_4 takes a 'data' argument that is meant to be step 3's output;
# confirm in the ScoringPy documentation which of the two flow=True flags enables that hand-off.
pipeline.add_step(step_4, flow=True)

# Save the configured pipeline to a file for later reuse.
# dill is used to serialize the pipeline object.
# The same file name is used as in the flow=True example and the file is opened in 'wb'
# mode, so a 'Pipeline.pkl' written by that example is overwritten here.
with open('Pipeline.pkl', 'wb') as file:
    dill.dump(pipeline, file)

# Run the pipeline.
# No initial data is passed: steps 1 and 2 load their own input from Excel files.
# The steps run in the order they were added: step 1 (data loading), step 2 (additional
# data loading), step 3 (concatenation and doubling), step 4 (division by 5).
df = pipeline.run()

# The resulting DataFrame 'df' is expected to contain:
# - The rows loaded from the two Excel files, concatenated.
# - 'Age' values that were first multiplied by 2 in step 3 and then divided by 5 in step 4.


In [10]:
# Step 2: Load the saved pipeline for processing data.
# 'Pipeline.pkl' contains the pipeline object that was serialized with dill after its
# steps were added, so it can be reused without being redefined or reconfigured.
# NOTE: like pickle, dill can execute arbitrary code while loading. Only load files
# that you created yourself or otherwise trust.

with open('Pipeline.pkl', 'rb') as file:
    pipeline = dill.load(file)  # Restore the previously saved pipeline object using dill.

# The 'pipeline' name now refers to the restored object, which is expected to carry the
# steps that were added before it was saved.
# NOTE(review): step_3 looks up the name 'pipeline' when it is called in order to read
# the context; verify that, after loading, this lookup resolves to the restored object.

# Run the pipeline.
# run() executes the steps in the order they were configured: the two Excel loads, the
# concatenation with doubled 'Age', and the division by 5. Steps added with flow=True
# have data passed along automatically.
df = pipeline.run()

# At this point 'df' holds the final result after all steps have been applied.

# Clear the pipeline.
# Per the original description, clear() removes all the steps and clears the context.
# The call prints the message shown in the cell output below
# ("Pipeline data cleared to free up memory.").
# NOTE(review): confirm the exact reset semantics in the ScoringPy documentation.
pipeline.clear()

# After clear(), treat the pipeline as reset: add steps again before running it.


Pipeline data cleared to free up memory.
