In [1]:
import os

In [2]:
%pwd

'd:\\end to end mental_health_prediction\\research'

In [3]:
# Move to the project root only while the kernel is still inside the notebook
# folder ("research", per the %pwd output above). An unconditional chdir climbs
# one more level every time this cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\end to end mental_health_prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the config file, the params file
    and the schema file.
    """

    # Taken from config.data_transformation.root_dir.
    root_dir: Path
    # CSV file that the transformation stage reads with pandas.
    data_path: Path
    # Schema key "drop_columns"; an empty list when the key is absent.
    drop_columns: List[str]
    # Schema key "columns_to_fillna"; an empty dict when the key is absent.
    columns_to_fillna: Dict[str, str]
    # Schema key "columns_to_drop_due_to_missing"; an empty list when absent.
    columns_to_drop_due_to_missing: List[str]
    # params.data_transformation.top_features; passed as k to SelectKBest.
    top_features: int
    # File the fitted StandardScaler is dumped to with joblib.
    scaler_path: Path
    # Destination CSV for the training split.
    processed_train_data_path: Path
    # Destination CSV for the test split.
    processed_test_data_path: Path


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml,create_directories


In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Read the config, params and schema files and create the artifacts root.

        Each argument may be a ``str`` or a ``Path``; all three are normalised
        to ``Path`` before any file is read.
        """
        config_filepath, params_filepath, schema_filepath = (
            Path(config_filepath),
            Path(params_filepath),
            Path(schema_filepath),
        )

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self):
        """Assemble the ``DataTransformationConfig`` from the loaded YAML data.

        Paths come from the ``data_transformation`` section of the config file,
        the column lists from the schema (empty defaults when a key is absent)
        and ``top_features`` from the params file.
        """
        stage_cfg = self.config.data_transformation
        schema_cfg = self.schema

        field_values = {
            "root_dir": Path(stage_cfg.root_dir),
            "data_path": Path(stage_cfg.data_path),
            "drop_columns": schema_cfg.get("drop_columns", []),
            "columns_to_fillna": schema_cfg.get("columns_to_fillna", {}),
            "columns_to_drop_due_to_missing": schema_cfg.get(
                "columns_to_drop_due_to_missing", []
            ),
            "top_features": self.params.data_transformation.top_features,
            "scaler_path": Path(stage_cfg.scaler_path),
            "processed_train_data_path": Path(stage_cfg.processed_train_data_path),
            "processed_test_data_path": Path(stage_cfg.processed_test_data_path),
        }
        return DataTransformationConfig(**field_values)


In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import joblib
import logging


class DataTransformation:
    """Clean, encode, select, scale and split the survey data set.

    The stage is driven by a ``DataTransformationConfig``: it reads the CSV at
    ``data_path``, writes the fitted scaler to ``scaler_path`` and writes the
    train/test CSV files to the two ``processed_*_data_path`` locations.
    """

    # Columns removed when the configuration does not list any.
    _DEFAULT_DROP_COLUMNS = ("Timestamp", "comments", "state")

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration; no I/O happens here."""
        self.config = config

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        # dirname() is "" for a bare file name and os.makedirs("") raises,
        # so fall back to the current directory in that case.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    def load_data(self):
        """Read the raw CSV at ``config.data_path`` into a DataFrame."""
        logging.info(f"📄 Loading data from: {self.config.data_path}")
        return pd.read_csv(self.config.data_path)

    def drop_columns(self, df):
        """Return ``df`` without the columns that are not used as features.

        The list comes from ``config.drop_columns``; when that is empty or
        missing the built-in default (Timestamp, comments, state) is used.
        Columns that are not present in ``df`` are ignored.
        """
        logging.info("🧹 Dropping unnecessary columns")
        configured = getattr(self.config, "drop_columns", None)
        cols_to_drop = list(configured) if configured else list(self._DEFAULT_DROP_COLUMNS)
        return df.drop(columns=cols_to_drop, errors='ignore')

    def fill_missing(self, df):
        """Fill missing values in every column with that column's mode."""
        logging.info("🩹 Filling missing values")
        modes = df.mode()
        if len(modes.index) == 0:
            # Empty or all-NaN frame: there is no mode to fill with.
            return df.copy()
        return df.fillna(modes.iloc[0])

    def clean_age(self, df):
        """Coerce ``Age`` to numeric and keep only rows with 10 <= Age <= 100.

        Values that cannot be parsed become NaN and are removed by the range
        filter. The original index labels of the surviving rows are kept.
        """
        logging.info("🔢 Cleaning Age column")
        df = df.copy()
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[df['Age'].between(10, 100)]
        return df

    def normalize_gender(self, df):
        """Collapse the free-text ``Gender`` column to male / female / other."""
        logging.info("🚻 Normalizing Gender column")
        df = df.copy()
        df['Gender'] = df['Gender'].astype(str).str.lower().str.strip()

        def clean_gender(val):
            # "female" contains the substring "male", so the female test must
            # run first; otherwise every female answer is labelled "male".
            if 'female' in val or val in ('f', 'woman'):
                return 'female'
            if 'male' in val or val in ('m', 'man'):
                return 'male'
            return 'other'

        df['Gender'] = df['Gender'].apply(clean_gender)
        return df

    def reduce_countries(self, df):
        """Keep the four most frequent ``Country`` values; map the rest to 'Other'."""
        logging.info("🌍 Reducing countries to top 4")
        df = df.copy()
        top_4 = df['Country'].value_counts().nlargest(4).index.tolist()
        df['Country'] = df['Country'].where(df['Country'].isin(top_4), 'Other')
        return df

    def encode_features(self, df):
        """Label-encode every object-dtype column (the target included).

        NOTE(review): the fitted encoders are not persisted, so the same
        category-to-integer mapping cannot be reapplied to new data later.
        """
        logging.info("🏷️ Encoding categorical columns")
        df = df.copy()
        le = LabelEncoder()

        cat_cols = df.select_dtypes(include='object').columns.tolist()

        for col in cat_cols:
            # Convert all to str to avoid 'str' & 'bool' mixed error
            df[col] = df[col].astype(str)
            # fit_transform refits the encoder from scratch for each column.
            df[col] = le.fit_transform(df[col])
        return df

    def select_features(self, X, y):
        """Select the ``config.top_features`` best columns by ANOVA F-score.

        Returns:
            A tuple ``(X_new, indices)``: the reduced feature matrix and the
            positions of the selected columns in ``X``.
        """
        logging.info("✨ Selecting top features")
        k = self.config.top_features
        n_features = X.shape[1]
        if isinstance(k, int) and k > n_features:
            # SelectKBest rejects k larger than the number of columns.
            logging.warning(
                f"top_features={k} exceeds the {n_features} available features; using all of them"
            )
            k = n_features
        selector = SelectKBest(score_func=f_classif, k=k)
        X_new = selector.fit_transform(X, y)
        return X_new, selector.get_support(indices=True)

    def scale_features(self, X):
        """Standardise ``X`` and dump the fitted scaler to ``config.scaler_path``."""
        logging.info("📏 Scaling features")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        self._ensure_parent_dir(self.config.scaler_path)
        joblib.dump(scaler, self.config.scaler_path)
        return X_scaled

    def split_and_save(self, X, y):
        """Split 80/20 (random_state=42) and write the train and test CSV files.

        The target is appended to each split as a ``treatment`` column.
        """
        logging.info("✂️ Splitting data and saving")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Attach the labels by position: train_test_split keeps X and y rows in
        # the same order, whereas index-based alignment breaks when X is a
        # DataFrame carrying its own (shuffled) index.
        train = pd.DataFrame(X_train)
        train['treatment'] = np.asarray(y_train)
        test = pd.DataFrame(X_test)
        test['treatment'] = np.asarray(y_test)

        self._ensure_parent_dir(self.config.processed_train_data_path)
        self._ensure_parent_dir(self.config.processed_test_data_path)

        train.to_csv(self.config.processed_train_data_path, index=False)
        test.to_csv(self.config.processed_test_data_path, index=False)

    def run(self):
        """Execute the whole stage: load, clean, encode, select, scale, split, save.

        NOTE(review): the encoders, the feature selector and the scaler are all
        fitted on the full data set before the train/test split, so information
        from the test rows reaches the fitted transformers. Confirm whether
        this is acceptable for the downstream evaluation.
        """
        df = self.load_data()
        df = self.drop_columns(df)
        df = self.fill_missing(df)
        df = self.clean_age(df)
        df = self.normalize_gender(df)
        df = self.reduce_countries(df)
        df = self.encode_features(df)

        X = df.drop(columns='treatment')
        y = df['treatment']

        X_selected, selected_indices = self.select_features(X, y)
        # The saved CSV files carry positional column names only, so record
        # which source columns were kept.
        logging.info(f"Selected features: {X.columns[selected_indices].tolist()}")
        X_scaled = self.scale_features(X_selected)
        self.split_and_save(X_scaled, y)


In [9]:
# Run the data-transformation stage end to end: load the YAML configuration,
# build the stage config and execute the pipeline.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.run()
except Exception:
    # Record the failure with its traceback, then re-raise. A bare `raise`
    # keeps the original traceback, unlike `raise e`, which adds a frame.
    logging.exception("Data transformation stage failed")
    raise


[2025-07-20 21:14:36,082: INFO: common: yaml file: D:\end to end mental_health_prediction\config.yaml loaded successfully]
[2025-07-20 21:14:36,082: INFO: common: yaml file: D:\end to end mental_health_prediction\params.yaml loaded successfully]
[2025-07-20 21:14:36,089: INFO: common: yaml file: D:\end to end mental_health_prediction\schema.yaml loaded successfully]
[2025-07-20 21:14:36,092: INFO: common: created directory at: artifacts]
[2025-07-20 21:14:36,092: INFO: 511715814: 📄 Loading data from: artifacts\data_ingestion\unzipped_data\survey.csv]
[2025-07-20 21:14:36,103: INFO: 511715814: 🧹 Dropping unnecessary columns]
[2025-07-20 21:14:36,104: INFO: 511715814: 🩹 Filling missing values]
[2025-07-20 21:14:36,118: INFO: 511715814: 🔢 Cleaning Age column]
[2025-07-20 21:14:36,118: INFO: 511715814: 🚻 Normalizing Gender column]
[2025-07-20 21:14:36,123: INFO: 511715814: 🌍 Reducing countries to top 4]
[2025-07-20 21:14:36,126: INFO: 511715814: 🏷️ Encoding categorical columns]
[2025-07-20