In [1]:
!pip install pandas scikit-learn joblib



In [3]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from datetime import datetime

In [4]:
def make_demo_dataframe(n=20, seed=42):
    """Build a small synthetic dataset with deliberately injected missing values.

    Columns produced: ``name`` (cycled from a fixed list of five names),
    ``age`` (random integers in [18, 60) stored as float), ``city`` (random
    pick from four cities), ``income`` (normal draw, mean 50000, std 10000,
    rounded to 2 decimals) and ``target`` (0/1 with probabilities 0.6/0.4).
    Afterwards up to 3 ``age`` values and up to 2 ``city`` values are set to NaN.

    Parameters
    ----------
    n : int, default 20
        Number of rows to generate.
    seed : int, default 42
        Seed for the NumPy random generator; the same (n, seed) pair always
        yields the same frame.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n`` rows and the five columns described above.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    rng = np.random.default_rng(seed)
    names = ["Alice", "Bob", "Charlie", "David", "Eva"]

    # Dict entries are evaluated top to bottom, so the order of the rng calls
    # below determines the generated values; keep it stable.
    df = pd.DataFrame({
        "name": [names[i % len(names)] for i in range(n)],
        # Stored as float from the start: assigning NaN into an int64 column
        # relies on an implicit upcast that newer pandas versions deprecate.
        "age": rng.integers(18, 60, size=n).astype("float64"),
        "city": rng.choice(["Vijayawada", "Chennai", "Hyderabad", "Delhi"], size=n),
        "income": rng.normal(50000, 10000, size=n).round(2),
        "target": rng.choice([0, 1], size=n, p=[0.6, 0.4]),
    })

    # Add missing values (clamped so tiny frames do not over-sample rows).
    for column, n_missing in (("age", 3), ("city", 2)):
        k = min(n_missing, n)
        if k:
            rows = rng.choice(df.index, size=k, replace=False)
            df.loc[rows, column] = np.nan
    return df

In [5]:
# ETL demo: generate the sample data, fit a preprocessing pipeline on it,
# and persist the transformed features, the fitted transformer and a short report.

# Single source of truth for where artifacts are written.
OUTPUT_DIR = "outputs/task1"
PROCESSED_PATH = f"{OUTPUT_DIR}/processed.csv"
PREPROCESSOR_PATH = f"{OUTPUT_DIR}/preprocess.joblib"
REPORT_PATH = f"{OUTPUT_DIR}/report.txt"

# 1) Extract — get data
df = make_demo_dataframe()

# 2) Separate features and target
# Note: y is kept in memory only; it is not written to any output file below.
target_col = "target"
y = df[target_col]
X = df.drop(columns=[target_col])

# 3) Identify column types
# Every non-numeric column (including "name") is treated as categorical.
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

# 4) Define preprocessing for each type
# Numeric: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
# Categorical: fill gaps with the most frequent value, then one-hot encode;
# categories unseen at fit time are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# 5) Combine into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# 6) Transform data
X_processed = preprocessor.fit_transform(X)

# 7) Save processed data
os.makedirs(OUTPUT_DIR, exist_ok=True)
# The transformer may return a sparse matrix; densify before building a frame.
X_dense = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
# Label the columns with the transformer's feature names so the CSV is
# self-describing instead of carrying anonymous 0..N-1 headers.
processed_df = pd.DataFrame(
    X_dense,
    columns=preprocessor.get_feature_names_out(),
    index=X.index,
)
processed_df.to_csv(PROCESSED_PATH, index=False)
joblib.dump(preprocessor, PREPROCESSOR_PATH)

# 8) Create report
report = [
    f"Run: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Rows: {len(df)}",
    f"Numeric cols: {numeric_cols}",
    f"Categorical cols: {categorical_cols}",
    f"Output: {PROCESSED_PATH}"
]
# Explicit encoding keeps the report identical across platforms.
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(report))

print("✅ ETL Pipeline Completed!")
print("Processed data saved in 'outputs/task1' folder")


✅ ETL Pipeline Completed!
Processed data saved in 'outputs/task1' folder


In [6]:
# Offer the generated artifacts as browser downloads when running on Google Colab.
try:
    from google.colab import files
except ImportError:
    # The google.colab module could not be imported (e.g. not running on Colab).
    # The artifacts were already written to disk, so just point at them.
    print("google.colab not available; outputs are in the 'outputs/task1' folder")
else:
    for artifact_path in ("outputs/task1/processed.csv", "outputs/task1/report.txt"):
        files.download(artifact_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>