In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [10]:


# Branding footer required by the brand rule. run_pipeline() repeats this text
# in every column of one extra row appended to the end of each exported CSV.
FOOTER = "Model by sadia"

In [16]:

# Step 1: Load data (customized for your dataset)
def load_data(path="E:\\dataset\\Air_Quality.csv"):
    """Read the air-quality CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the original hard-coded
        location so existing ``load_data()`` calls keep working; pass another
        path to run the notebook on a different machine or dataset copy.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when ``path`` does not exist.
    """
    data = pd.read_csv(path)
    print("Data loaded successfully.")
    return data

In [17]:

# Step 2: Define preprocessing pipeline
# Missing value strategy (documented):
# - Numerical features: Impute with mean
# - Categorical features: Impute with most frequent
# Scaling/Encoding:
# - Numerical: StandardScaler
# - Categorical: OneHotEncoder (drop='first')
def create_preprocessing_pipeline(numerical_cols, categorical_cols):
    """Assemble the unfitted preprocessing transformer.

    Parameters
    ----------
    numerical_cols : list of str
        Columns routed through mean imputation followed by standard scaling.
    categorical_cols : list of str
        Columns routed through most-frequent imputation followed by one-hot
        encoding with ``drop='first'`` and ``handle_unknown='ignore'``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer with a ``'num'`` and a ``'cat'`` branch; the caller is
        responsible for fitting it.
    """
    # Numeric branch: fill gaps with the column mean, then standardize.
    num_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]

    branches = [
        ('num', Pipeline(steps=num_steps), numerical_cols),
        ('cat', Pipeline(steps=cat_steps), categorical_cols),
    ]
    return ColumnTransformer(transformers=branches)

In [18]:

# Step 3: Main function to run the pipeline
def run_pipeline():
    """Load, split, preprocess and export the air-quality dataset.

    Steps:
      1. Load the raw data and drop the non-feature ``Date`` column.
      2. Split features/target 80/20 with ``random_state=42``.
      3. Fit the preprocessing transformer on the training rows ONLY, then
         apply the fitted transformer to the test rows. Fitting before the
         split would let test rows influence the imputation means, scaler
         statistics and encoder categories (data leakage).
      4. Re-attach the target, append the branding footer row and write
         ``cleaned_train.csv`` / ``cleaned_test.csv`` to the working directory.

    Returns
    -------
    None
        Results are written to disk and a summary is printed.
    """
    # Load data
    data = load_data()

    # Debug: Print columns to confirm
    print("Columns in dataset:", data.columns.tolist())

    # Define columns (exact match to the dataset)
    numerical_cols = ['CO', 'NO2', 'SO2', 'O3', 'PM2.5', 'PM10']  # Pollutant features
    categorical_cols = ['City']  # Only City is categorical
    target_col = 'AQI'  # Target column
    drop_cols = ['Date']  # Drop Date (not a feature)

    # Drop unnecessary columns, then separate features and target
    data = data.drop(columns=drop_cols)
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Split BEFORE fitting any transformer (80/20, reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Learn imputation/scaling/encoding parameters from the training rows only,
    # then reuse those fitted parameters on the held-out rows.
    preprocessor = create_preprocessing_pipeline(numerical_cols, categorical_cols)
    train_values = preprocessor.fit_transform(X_train)
    test_values = preprocessor.transform(X_test)

    # Column names after encoding: numeric columns first, then one-hot columns,
    # matching the transformer order used by create_preprocessing_pipeline.
    cat_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    preprocessed_cols = numerical_cols + list(cat_names)

    def build_frame(values, target):
        # ColumnTransformer may hand back a scipy sparse matrix when the
        # encoded output is mostly zeros; densify so DataFrame() accepts it.
        if hasattr(values, "toarray"):
            values = values.toarray()
        frame = pd.DataFrame(values, columns=preprocessed_cols)
        frame[target_col] = target.values  # Add target back (positional)
        return frame

    train = build_frame(train_values, y_train)
    test = build_frame(test_values, y_test)

    # Add footer row. NOTE: this puts a text value in every column of the last
    # row, so consumers of the CSVs need to skip that row before numeric use.
    footer_row = pd.DataFrame([[FOOTER] * len(train.columns)], columns=train.columns)
    train_with_footer = pd.concat([train, footer_row], ignore_index=True)
    test_with_footer = pd.concat([test, footer_row], ignore_index=True)

    # Export
    train_with_footer.to_csv('cleaned_train.csv', index=False)
    test_with_footer.to_csv('cleaned_test.csv', index=False)

    print(f"Pipeline completed successfully! Footer: {FOOTER}")
    print("Train shape:", train.shape)
    print("Test shape:", test.shape)
    print("Cleaned files exported: cleaned_train.csv and cleaned_test.csv")

In [19]:

# Entry point: run the whole pipeline when this code executes as the main module.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# executing this cell always triggers a full run — confirm that is intended.
if __name__ == "__main__":
    run_pipeline()

Data loaded successfully.
Columns in dataset: ['Date', 'City', 'CO', 'NO2', 'SO2', 'O3', 'PM2.5', 'PM10', 'AQI']
Pipeline completed successfully! Footer: Model by sadia
Train shape: (42048, 12)
Test shape: (10512, 12)
Cleaned files exported: cleaned_train.csv and cleaned_test.csv
