<a href="https://colab.research.google.com/github/SMJ2003-pocketware/DATA-PIPELINE-DEVELOPMENT/blob/main/Task_1_Data_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing the required libraries and dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

In [11]:
# Step 1: Build the raw sample dataset.
# Every keyword below becomes one column; None marks a missing value
# (one gap in "Age", one in "Salary").
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", "Eve"],
    Age=[25, 30, None, 35, 40],
    Salary=[50000, 60000, 55000, None, 70000],
    City=["New York", "Los Angeles", "New York", "Chicago", "Los Angeles"],
    Gender=["F", "M", "M", "M", "F"],
)
df = pd.DataFrame(data)

# Show the untouched frame, preceded by a header line.
print("Raw Data:", df, sep="\n")

Raw Data:
      Name   Age   Salary         City Gender
0    Alice  25.0  50000.0     New York      F
1      Bob  30.0  60000.0  Los Angeles      M
2  Charlie   NaN  55000.0     New York      M
3    David  35.0      NaN      Chicago      M
4      Eve  40.0  70000.0  Los Angeles      F


In [10]:
# Step 2: Preprocessing -- declare which columns belong to which group.
# Both names stay plain lists because later cells concatenate them with `+`.

# Columns treated as numbers.
numeric_features = [
    "Age",
    "Salary",
]

# Columns treated as categories.
categorical_features = [
    "City",
    "Gender",
]

In [12]:
# Define transformers: one small sub-pipeline per column group.

# Numeric path: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical path: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes the encoder tolerate
# categories at transform time that were not present during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])


In [5]:
# Combine transformers into a ColumnTransformer.
# Each entry is (name, transformer, columns); the "num" and "cat" names are
# how the fitted sub-pipelines can be looked up afterwards.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [6]:
# Step 3: Define Pipeline
# A single-step pipeline for now; the step is registered under the name
# "preprocessor" so it can be retrieved via pipeline.named_steps.
pipeline = Pipeline([("preprocessor", preprocessor)])


In [7]:
# Step 4: Apply Pipeline
# Fit every transformer on df and transform it in the same call.
processed_data = pipeline.fit_transform(df)

# ColumnTransformer may hand back a SciPy sparse matrix when the stacked
# output is sparse enough (see its sparse_threshold parameter). Densify so
# the DataFrame built below always holds one plain numeric column per feature.
if hasattr(processed_data, "toarray"):
    processed_data = processed_data.toarray()

# Convert transformed data back to a DataFrame.
# The fitted encoder is looked up by name ("cat" -> "onehot") instead of by
# its position in transformers_, so the lookup does not depend on the order
# in which the ColumnTransformer entries were declared.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
onehot_encoder = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
transformed_columns = numeric_features + list(
    onehot_encoder.get_feature_names_out(categorical_features)
)
# Reuse df's index so processed rows stay aligned with the raw rows.
processed_df = pd.DataFrame(
    processed_data, columns=transformed_columns, index=df.index
)

# Sanity check: both sub-pipelines impute, so no missing value should remain.
assert not processed_df.isna().any().any(), "processed data still contains NaN"

print("\nProcessed Data:")
print(processed_df)


Processed Data:
   Age    Salary  City_Chicago  City_Los Angeles  City_New York  Gender_F  \
0 -1.5 -1.322876           0.0               0.0            1.0       1.0   
1 -0.5  0.188982           0.0               1.0            0.0       0.0   
2  0.0 -0.566947           0.0               0.0            1.0       0.0   
3  0.5  0.000000           1.0               0.0            0.0       0.0   
4  1.5  1.700840           0.0               1.0            0.0       1.0   

   Gender_M  
0       0.0  
1       1.0  
2       1.0  
3       1.0  
4       0.0  


In [8]:
# Step 5: Save the processed data
# The relative path resolves against the kernel's current working directory.
output_file = "processed_data.csv"

# index=False keeps the DataFrame's row index out of the CSV.
processed_df.to_csv(path_or_buf=output_file, index=False)

# Report the absolute location so the file is easy to find afterwards.
print(f"\nProcessed data saved to {os.path.abspath(output_file)}")


Processed data saved to /content/processed_data.csv


In [9]:
# Step 6: Offer the CSV as a browser download when running in Google Colab.
# google.colab only exists inside Colab, so the import is guarded: on any
# other kernel (local Jupyter, CI) the notebook still runs to completion and
# the file simply stays where Step 5 wrote it.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; {output_file} was left on disk.")
else:
    # Reuse output_file from Step 5 so the filename is defined in one place.
    files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>