In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
import os
import joblib
import json
from datetime import datetime

# Plot defaults: seaborn grid theme and a 12x8 inch default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Suppress every warning category for the rest of the session
warnings.simplefilter('ignore')

class StudentDepressionPreprocessor:
    """End-to-end preprocessing pipeline for the student-depression dataset.

    The pipeline loads a CSV/Excel file, cleans it (binary target, duplicate
    removal, imputation, text normalisation, column pruning), one-hot encodes
    and scales the features, and writes every artefact into ``output_dir``.

    Attributes:
        input_file: Path of the raw dataset given to the constructor.
        output_dir: Directory that receives all generated files.
        raw_data: DataFrame set by ``load_data()`` (``None`` until then).
        cleaned_data: DataFrame set by ``clean_data()`` (``None`` until then
            or after a failed cleaning run).
        preprocessed_data: DataFrame set by ``preprocess_for_ml()``.
    """

    # dtypes treated as categorical text. 'string' covers pandas' dedicated
    # string dtype, which .astype(str) can produce on newer pandas versions.
    _TEXT_DTYPES = ['object', 'string']

    def __init__(self, input_file_path, output_dir='data/processed'):
        """Store the paths and create the output directory structure.

        Args:
            input_file_path: Location of the raw CSV or Excel file.
            output_dir: Directory for cleaned data, encoders and reports.
        """
        self.input_file = input_file_path
        self.output_dir = output_dir
        self.raw_data = None
        self.cleaned_data = None
        self.preprocessed_data = None
        self.setup_directories()

    def setup_directories(self):
        """Create the output directory plus 'visualizations' and 'models'."""
        dirs = [self.output_dir, 'visualizations', 'models']
        for dir_path in dirs:
            os.makedirs(dir_path, exist_ok=True)
        print("‚úì Directory structure created")

    def load_data(self):
        """Load the dataset from CSV or Excel into ``self.raw_data``.

        Files ending in ``.xlsx`` or ``.xls`` (any letter case) are read with
        ``pd.read_excel``; everything else is read with ``pd.read_csv``.
        ``input_file`` may be a ``str`` or any ``os.PathLike`` object.

        Returns:
            The loaded DataFrame, or ``None`` if reading failed (the error is
            printed rather than raised so ``run_pipeline()`` can stop cleanly).
        """
        print("\n" + "="*60)
        print("LOADING DATA")
        print("="*60)
        try:
            # os.fspath accepts both str and pathlib.Path; a plain
            # str.endswith check would fail on Path objects and on "X.XLSX".
            path = os.fspath(self.input_file)
            extension = os.path.splitext(path)[1].lower()
            if extension in ('.xlsx', '.xls'):
                self.raw_data = pd.read_excel(path)
            else:
                self.raw_data = pd.read_csv(path)

            print(f"‚úì Data loaded successfully")
            print(f"  Shape: {self.raw_data.shape}")
            print(f"  Columns: {list(self.raw_data.columns)}")
            return self.raw_data
        except Exception as e:
            # Deliberately broad: this is the pipeline boundary and every
            # failure (missing file, parse error, bad path type) maps to None.
            print(f"‚úó Error loading data: {e}")
            return None

    def clean_data(self):
        """Clean the raw dataset and save it as ``cleaned_data.csv``.

        Steps: binarise the ``Depression`` target (rows without a target are
        dropped first), remove duplicate rows, impute missing values (median
        for numeric columns, mode for text columns), strip/title-case text
        columns, and drop the ``City``/``id``/``ID``/``Unnamed: 0`` columns.

        Returns:
            The cleaned DataFrame, or ``None`` when no raw data is loaded or
            the target contains fewer than two classes.
        """
        print("\n" + "="*60)
        print("CLEANING DATA")
        print("="*60)

        if self.raw_data is None:
            print("‚úó No raw data available. Run load_data() first.")
            return None

        self.cleaned_data = self.raw_data.copy()
        initial_shape = self.cleaned_data.shape

        # 1. Handle Target Variable (Depression)
        if 'Depression' in self.cleaned_data.columns:
            print("\n1. Processing Target Variable (Depression):")

            # Unlabelled rows are useless for supervised learning, and a NaN
            # here would otherwise be coerced to class 1 by the fallback below.
            missing_target = int(self.cleaned_data['Depression'].isnull().sum())
            if missing_target:
                self.cleaned_data = self.cleaned_data.dropna(subset=['Depression'])
                print(f"   Dropped {missing_target} rows with missing target values")

            unique_vals = sorted(self.cleaned_data['Depression'].unique())
            print(f"   Original unique values: {unique_vals}")
            print(f"   Value counts:\n{self.cleaned_data['Depression'].value_counts().sort_index()}")

            # Map the known two-value encodings to 0 (healthy) / 1 (depressed):
            # {-1, 1} -> -1 is healthy; {1, 2} -> 1 is healthy; {0, 1} is kept.
            value_set = set(unique_vals)
            if value_set == {-1, 1}:
                print("   ‚Üí Mapping: -1‚Üí0 (Healthy), 1‚Üí1 (Depressed)")
                self.cleaned_data['Depression'] = self.cleaned_data['Depression'].map({-1: 0, 1: 1})
            elif value_set == {1, 2}:
                print("   ‚Üí Mapping: 1‚Üí0 (Healthy), 2‚Üí1 (Depressed)")
                self.cleaned_data['Depression'] = self.cleaned_data['Depression'].map({1: 0, 2: 1})
            elif value_set == {0, 1}:
                print("   ‚Üí Already binary (0, 1). No mapping needed.")
            elif unique_vals:
                print(f"   ‚ö†Ô∏è  WARNING: Unexpected values {unique_vals}")
                print("   ‚Üí Attempting to convert to binary (0=healthy, 1=depressed)")
                # Assume the smallest value is healthy (0); all others become 1
                min_val = min(unique_vals)
                self.cleaned_data['Depression'] = (self.cleaned_data['Depression'] != min_val).astype(int)
            # An empty target falls through to the single-class check below.

            # Normalise 0.0/1.0 floats or booleans to plain integers.
            self.cleaned_data['Depression'] = self.cleaned_data['Depression'].astype(int)

            final_vals = sorted(self.cleaned_data['Depression'].unique())
            final_counts = self.cleaned_data['Depression'].value_counts().sort_index()
            print(f"   Final unique values: {final_vals}")
            print(f"   Final value counts:\n{final_counts}")

            if len(final_vals) < 2:
                print("   ‚úó ERROR: Only one class in target! Check your raw data file.")
                # Do not leave unusable data behind for preprocess_for_ml().
                self.cleaned_data = None
                return None

        # 2. Remove duplicates
        # NOTE(review): identifier columns ('id', ...) are still present here,
        # so rows that differ only by id are not treated as duplicates.
        # Confirm whether de-duplication should ignore identifier columns.
        print("\n2. Removing Duplicates:")
        duplicates = self.cleaned_data.duplicated().sum()
        self.cleaned_data = self.cleaned_data.drop_duplicates()
        print(f"   Removed {duplicates} duplicate rows")

        # 3. Handle missing values
        print("\n3. Handling Missing Values:")
        missing_before = self.cleaned_data.isnull().sum().sum()

        numeric_cols = self.cleaned_data.select_dtypes(include=[np.number]).columns
        categorical_cols = self.cleaned_data.select_dtypes(include=self._TEXT_DTYPES).columns

        # Fill numeric with median
        for col in numeric_cols:
            if col == 'Depression':  # Don't fill target variable
                continue
            missing = self.cleaned_data[col].isnull().sum()
            if missing > 0:
                self.cleaned_data[col] = self.cleaned_data[col].fillna(
                    self.cleaned_data[col].median()
                )
                print(f"   {col}: Filled {missing} missing values with median")

        # Fill categorical with mode (skipped when the column has no mode,
        # i.e. every value is missing)
        for col in categorical_cols:
            missing = self.cleaned_data[col].isnull().sum()
            if missing > 0:
                mode_value = self.cleaned_data[col].mode()
                if len(mode_value) > 0:
                    self.cleaned_data[col] = self.cleaned_data[col].fillna(mode_value[0])
                    print(f"   {col}: Filled {missing} missing values with mode")

        missing_after = self.cleaned_data.isnull().sum().sum()
        print(f"   Total missing values: {missing_before} ‚Üí {missing_after}")

        # 4. Standardize text in categorical columns
        print("\n4. Standardizing Text:")
        for col in categorical_cols:
            self.cleaned_data[col] = (
                self.cleaned_data[col]
                .astype(str)
                .str.strip()
                .str.title()
            )
        print(f"   Standardized {len(categorical_cols)} categorical columns")

        # 5. Remove unnecessary columns
        print("\n5. Removing Unnecessary Columns:")
        cols_to_drop = ['City', 'id', 'ID', 'Unnamed: 0']
        dropped = [col for col in cols_to_drop if col in self.cleaned_data.columns]
        if dropped:
            self.cleaned_data = self.cleaned_data.drop(columns=dropped)
            print(f"   Dropped: {dropped}")
        else:
            print("   No unnecessary columns found")

        # 6. Save cleaned data
        output_path = f'{self.output_dir}/cleaned_data.csv'
        self.cleaned_data.to_csv(output_path, index=False)
        print(f"\n‚úì Cleaned data saved to: {output_path}")
        print(f"  Final shape: {initial_shape} ‚Üí {self.cleaned_data.shape}")

        return self.cleaned_data

    def preprocess_for_ml(self):
        """One-hot encode text columns and standard-scale numeric columns.

        Writes ``encoding_map.json`` (original column -> dummy columns),
        ``scaler.pkl`` (fitted ``StandardScaler``), ``feature_names.json`` and
        ``preprocessed_data.csv`` into ``output_dir``. The ``Depression``
        target is neither encoded nor scaled.

        Returns:
            The preprocessed DataFrame, or ``None`` when no cleaned data is
            available or the target column is missing.
        """
        print("\n" + "="*60)
        print("PREPROCESSING FOR MACHINE LEARNING")
        print("="*60)

        if self.cleaned_data is None:
            print("‚úó No cleaned data available. Run clean_data() first.")
            return None

        self.preprocessed_data = self.cleaned_data.copy()
        target_col = 'Depression'

        # Separate features and target
        if target_col not in self.preprocessed_data.columns:
            print(f"‚úó Target column '{target_col}' not found!")
            return None

        # Identify column types; the target is excluded from both lists so it
        # is never encoded or scaled, whatever its dtype.
        categorical_cols = [
            col
            for col in self.preprocessed_data.select_dtypes(
                include=self._TEXT_DTYPES
            ).columns
            if col != target_col
        ]
        numeric_cols = [
            col
            for col in self.preprocessed_data.select_dtypes(
                include=[np.number]
            ).columns
            if col != target_col
        ]

        print(f"\n1. Column Types:")
        print(f"   Numeric features: {len(numeric_cols)}")
        print(f"   Categorical features: {len(categorical_cols)}")
        print(f"   Target: {target_col}")

        # 2. One-Hot Encoding for Categorical Variables
        if categorical_cols:
            print(f"\n2. One-Hot Encoding:")
            print(f"   Encoding {len(categorical_cols)} columns...")

            # Encode column by column so the map records exactly the dummies
            # each column produced. Matching on a "<col>_" name prefix would
            # misattribute e.g. "Sleep_Duration_*" to a column named "Sleep".
            encoding_map = {}
            encoded_frames = []
            for col in categorical_cols:
                dummies = pd.get_dummies(
                    self.preprocessed_data[[col]], drop_first=True, dtype=int
                )
                new_cols = list(dummies.columns)
                encoding_map[col] = new_cols
                encoded_frames.append(dummies)
                print(f"   {col} ‚Üí {len(new_cols)} columns")

            # Save encoding map
            encoding_path = f'{self.output_dir}/encoding_map.json'
            with open(encoding_path, 'w') as f:
                json.dump(encoding_map, f, indent=2)
            print(f"   ‚úì Encoding map saved")

            # Add encoded columns and remove original
            self.preprocessed_data = pd.concat(
                [self.preprocessed_data.drop(columns=categorical_cols)] + encoded_frames,
                axis=1
            )

        # 3. Scale Numeric Features
        if numeric_cols:
            print(f"\n3. Scaling Numeric Features:")
            print(f"   Scaling {len(numeric_cols)} columns...")

            scaler = StandardScaler()
            self.preprocessed_data[numeric_cols] = scaler.fit_transform(
                self.preprocessed_data[numeric_cols]
            )

            # Save scaler
            scaler_path = f'{self.output_dir}/scaler.pkl'
            joblib.dump(scaler, scaler_path)
            print(f"   ‚úì Scaler saved")

        # 4. Save feature names
        feature_cols = [c for c in self.preprocessed_data.columns if c != target_col]
        feature_path = f'{self.output_dir}/feature_names.json'
        with open(feature_path, 'w') as f:
            json.dump(feature_cols, f, indent=2)
        print(f"\n4. Feature Names:")
        print(f"   Total features: {len(feature_cols)}")
        print(f"   ‚úì Feature names saved")

        # 5. Save preprocessed data
        output_path = f'{self.output_dir}/preprocessed_data.csv'
        self.preprocessed_data.to_csv(output_path, index=False)
        print(f"\n‚úì Preprocessed data saved to: {output_path}")
        print(f"  Final shape: {self.preprocessed_data.shape}")

        return self.preprocessed_data

    def generate_summary_report(self):
        """Write ``preprocessing_summary.json`` and print a short summary.

        The report holds a timestamp, the shapes of the raw, cleaned and
        preprocessed frames, and the class counts of the ``Depression``
        target. Does nothing (beyond a message) when no preprocessed data
        exists.
        """
        print("\n" + "="*60)
        print("PREPROCESSING SUMMARY")
        print("="*60)

        if self.preprocessed_data is None:
            print("No preprocessed data available")
            return

        target_distribution = None
        if 'Depression' in self.preprocessed_data.columns:
            counts = self.preprocessed_data['Depression'].value_counts()
            # json cannot serialise NumPy scalars, so force native Python
            # types for both the class labels and their counts.
            target_distribution = {
                (label.item() if hasattr(label, 'item') else label): int(count)
                for label, count in counts.items()
            }

        summary = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "raw_data_shape": self.raw_data.shape if self.raw_data is not None else None,
            "cleaned_data_shape": self.cleaned_data.shape if self.cleaned_data is not None else None,
            "preprocessed_data_shape": self.preprocessed_data.shape,
            "target_distribution": target_distribution,
        }

        summary_path = f'{self.output_dir}/preprocessing_summary.json'
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        print("Final Dataset Information:")
        print(f"  Samples: {summary['preprocessed_data_shape'][0]}")
        print(f"  Features: {summary['preprocessed_data_shape'][1] - 1}")  # -1 for target
        if summary['target_distribution']:
            print(f"  Target Distribution:")
            total = sum(summary['target_distribution'].values())
            for k, v in summary['target_distribution'].items():
                label = "Healthy" if k == 0 else "Depressed"
                print(f"    {label} ({k}): {v} ({v/total*100:.1f}%)")

        print(f"\n‚úì Summary saved to: {summary_path}")

    def run_pipeline(self):
        """Run load -> clean -> preprocess -> summary in order.

        Returns:
            ``True`` when every stage succeeded, ``False`` as soon as one
            stage returns ``None``.
        """
        print("\n" + "üîÑ STARTING PREPROCESSING PIPELINE üîÑ".center(60))

        # Load data
        if self.load_data() is None:
            print("\n‚úó Pipeline failed: Could not load data")
            return False

        # Clean data
        if self.clean_data() is None:
            print("\n‚úó Pipeline failed: Could not clean data")
            return False

        # Preprocess for ML
        if self.preprocess_for_ml() is None:
            print("\n‚úó Pipeline failed: Could not preprocess data")
            return False

        # Generate summary
        self.generate_summary_report()

        print("\n" + "‚úÖ PREPROCESSING COMPLETED SUCCESSFULLY! ‚úÖ".center(60))
        print("\nOutput files created:")
        for file_name in (
            'cleaned_data.csv',
            'preprocessed_data.csv',
            'scaler.pkl',
            'encoding_map.json',
            'feature_names.json',
            'preprocessing_summary.json',
        ):
            print(f"  üìÅ {self.output_dir}/{file_name}")

        return True


if __name__ == "__main__":
    # IMPORTANT: point INPUT_FILE at your local copy of the dataset
    INPUT_FILE = "data/raw/student_depression_dataset (1).csv"
    OUTPUT_DIR = "data/processed"

    # Build the preprocessor, run every stage, then report the outcome
    preprocessor = StudentDepressionPreprocessor(INPUT_FILE, OUTPUT_DIR)
    success = preprocessor.run_pipeline()

    print(
        "\nüéâ Ready for model training!"
        if success
        else "\n‚ö†Ô∏è  Please check the errors above and fix your data."
    )

‚úì Directory structure created

            üîÑ STARTING PREPROCESSING PIPELINE üîÑ             

LOADING DATA
‚úì Data loaded successfully
  Shape: (27901, 18)
  Columns: ['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Depression']

CLEANING DATA

1. Processing Target Variable (Depression):
   Original unique values: [np.int64(0), np.int64(1)]
   Value counts:
Depression
0    11565
1    16336
Name: count, dtype: int64
   ‚Üí Already binary (0, 1). No mapping needed.
   Final unique values: [np.int64(0), np.int64(1)]
   Final value counts:
Depression
0    11565
1    16336
Name: count, dtype: int64

2. Removing Duplicates:
   Removed 0 duplicate rows

3. Handling Missing Values:
   Total missing values: 0 ‚Üí 0

4. Standardizing Text:
   