# **Milestone 2: Data Ingestion Pipeline**

**Objective:**
Build a daily data ingestion workflow that loads the raw pricing CSV, cleans it, and saves the cleaned result.

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import os

# CSV read by the pipeline; the relative path is resolved against the current working directory.
INPUT_FILE = "dynamic_pricing.csv"
# CSV written by the pipeline after cleaning.
OUTPUT_FILE = "cleaned_csv_data.csv"

In [2]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame, report its shape, and return it.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    loaded = pd.read_csv(file_path)
    loaded_shape = loaded.shape
    print("Data loaded. Shape:", loaded_shape)
    return loaded

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left unmodified.

    Steps, in order:
      1. Drop every row that contains a missing value.
      2. For each numeric column, drop rows whose value lies outside
         ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Quartiles are computed on the
         rows that survived the filters of the preceding columns, so the
         outcome depends on column order.
      3. Drop exact duplicate rows.
      4. Replace each object-dtype column with the output of a fresh
         ``LabelEncoder().fit_transform``.

    Args:
        df: Raw ``pandas.DataFrame``.

    Returns:
        A new ``pandas.DataFrame`` with the same columns. Row labels of the
        surviving rows are preserved (the index is not reset).
    """
    # Column groups come from the raw frame's dtypes, before any rows are removed.
    numeric_cols = df.select_dtypes(include="number").columns
    cat_cols = df.select_dtypes(include="object").columns

    cleaned = df.dropna()

    for col in numeric_cols:
        q1, q3 = cleaned[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (cleaned[col] < lower_bound) | (cleaned[col] > upper_bound)
        cleaned = cleaned[~is_outlier]

    # Explicit copy: the frame above is the product of boolean filtering, and
    # assigning columns into such a result raises SettingWithCopyWarning on
    # pandas versions without Copy-on-Write.
    cleaned = cleaned.drop_duplicates().copy()

    for col in cat_cols:
        # The fitted encoder is discarded, so the label-to-code mapping cannot
        # be recovered from the returned frame.
        cleaned[col] = LabelEncoder().fit_transform(cleaned[col])

    print("Data cleaned successfully")
    print("Final shape after cleaning:", cleaned.shape)
    return cleaned

def save_data(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the index, then report the destination.

    Args:
        df: Frame to persist.
        output_file: Target passed straight to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=output_file, index=False)
    print(f"Cleaned data saved to: {output_file}")

def run_pipeline(input_file=None, output_file=None):
    """Run the load -> clean -> save sequence and return the cleaned frame.

    Args:
        input_file: CSV to read. When ``None`` (the default) the module-level
            ``INPUT_FILE`` is used.
        output_file: Destination for the cleaned CSV. When ``None`` (the
            default) the module-level ``OUTPUT_FILE`` is used.

    Returns:
        The cleaned ``pandas.DataFrame`` that was written to ``output_file``.
    """
    # Resolve defaults at call time so that reassigning the constants in a
    # later cell is honoured, exactly as with the original zero-argument form.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_FILE

    df_raw = load_data(input_file)
    df_cleaned = clean_data(df_raw)
    save_data(df_cleaned, output_file)
    print("Pipeline completed successfully.")
    return df_cleaned

# Run the full pipeline only when this code is executed directly (not imported),
# and bind the returned cleaned frame to `df_cleaned`.
if __name__ == "__main__":
    df_cleaned = run_pipeline()

Data loaded. Shape: (1000, 10)
Data cleaned successfully
Final shape after cleaning: (990, 10)
Cleaned data saved to: cleaned_csv_data.csv
Pipeline completed successfully.


In [3]:
# KPIs for revenue lift, profit margin, and conversion rate, reflecting the outcome of the dynamic pricing model.
# Read back the file the ingestion pipeline wrote, via OUTPUT_FILE, instead of repeating its name.
df = pd.read_csv(OUTPUT_FILE)

# Share of revenue counted as profit.
# NOTE(review): looks like a placeholder assumption rather than a value derived from cost data - confirm.
PROFIT_RATE = 0.3

df['revenue'] = df['Historical_Cost_of_Ride']

df['profit'] = df['revenue'] * PROFIT_RATE
# NOTE(review): profit is a fixed share of revenue, so this margin equals
# PROFIT_RATE * 100 on every row with non-zero revenue.
df['profit_margin'] = (df['profit'] / df['revenue']) * 100

# NOTE(review): if Customer_Loyalty_Status was an object column it was label-encoded by
# clean_data, so this ratio scales the encoder's integer codes - confirm that this is the
# intended proxy for conversion rate.
df['conversion_rate'] = (df['Customer_Loyalty_Status'] / df['Customer_Loyalty_Status'].max()) * 100

# The baseline is the revenue of the first row in file order.
# NOTE(review): confirm that a single row is the intended reference for revenue lift.
baseline_revenue = df['revenue'].iloc[0]
df['revenue_lift_pct'] = ((df['revenue'] - baseline_revenue) / baseline_revenue) * 100

kpi_summary = df[['revenue', 'profit_margin', 'conversion_rate', 'revenue_lift_pct']]
print(kpi_summary.head())

      revenue  profit_margin  conversion_rate  revenue_lift_pct
0  284.257273           30.0            100.0          0.000000
1  173.874753           30.0            100.0        -38.831907
2  329.795469           30.0            100.0         16.020064
3  470.201232           30.0             50.0         65.413967
4  579.681422           30.0             50.0        103.928440
