# DATA LOAD

-- Create the conda environment: conda create -n lr_bankloan_env python=3.11 -y

--Activate the Environment:  conda activate lr_bankloan_env

-- Install Required Libraries: 

pip install pandas numpy seaborn matplotlib scikit-learn xgboost catboost flask ipykernel


--Register Kernel for VS Code & Jupyter:

python -m ipykernel install --user --name=lr_bankloan_env --display-name "Python (lr_bankloan_env)"



# ✅ Goal of data_ingestion.py:



Read raw data (CSV)

Save it as-is (optional)

Split into train & test sets

Save the train/test data in the artifacts/ folder

Return the file paths for further steps

# Connected Modules Used in This File:

| **Module**                           | **Purpose**                      |
| ------------------------------------ | -------------------------------- |
| `pandas`                             | Load the CSV file as a DataFrame |
| `train_test_split` from `sklearn`    | Split data into training/testing |
| `os`, `sys`                          | Handle file paths and exceptions |
| `CustomException`, `logging`         | Log and handle errors cleanly    |
| `DataTransformation`, `ModelTrainer` | Trigger next pipeline steps      |


# What to expect after running the data_ingestion.py file

## ✅ When You Run `data_ingestion.py`, It will:

- Read your raw data from this file:

```bash
notebook/data/raw/loan_data.csv
```

Create the artifacts/ folder (if it doesn’t already exist)

Save 3 output files inside artifacts/:

| 📁 Location | 📄 File Name | 📄 Content Description                      |
| ----------- | ------------ | ------------------------------------------- |
| artifacts/  | data.csv     | The full original raw dataset (as-is)       |
| artifacts/  | train.csv    | 80% of the data used for training           |
| artifacts/  | test.csv     | 20% of the data used for testing/validation |


Final Output Folder After Running data_ingestion.py:

artifacts/
├── data.csv      ← full raw data
├── train.csv     ← training set (80%)
└── test.csv      ← test set (20%)

Summary:
You do not need to manually create the artifacts/ folder.
✅ It will be created automatically, and all 3 files above will be saved inside.


# Shortcut for commenting and uncommenting:

Ctrl + /   → Comments or uncomments the selected lines


# First time just run the following code:


import os
import sys
# Add the project root (LR_BankLoan, two levels above this file) to sys.path so that `from src.exception import ...` works.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass

from src.exception import CustomException
from src.logger import logging

# 🔴 Future steps (commented for now)
# from src.components.data_transformation import DataTransformation
# from src.components.data_transformation import DataTransformationConfig
# from src.components.model_trainer import ModelTrainer
# from src.components.model_trainer import ModelTrainerConfig

# Step 1: Define where to save files
@dataclass
class DataIngestionConfig:
    """Default output locations for the data ingestion step.

    Every path points inside the ``artifacts`` folder and can be overridden
    per instance through the generated dataclass constructor.
    """

    # Training portion of the split.
    train_data_path: str = os.path.join("artifacts", "train.csv")
    # Held-out test portion of the split.
    test_data_path: str = os.path.join("artifacts", "test.csv")
    # Unmodified copy of the full input dataset.
    raw_data_path: str = os.path.join("artifacts", "data.csv")

# Step 2: Create class for data ingestion
class DataIngestion:
    """Load the raw loan CSV and write raw, train, and test copies to disk.

    Output locations are taken from the ``DataIngestionConfig`` instance
    stored on ``self.ingestion_config``.
    """

    def __init__(self):
        self.ingestion_config = DataIngestionConfig()

    # Step 3: Define ingestion process
    def initiate_data_ingestion(
        self, source_path: str = 'notebook/data/raw/loan_data.csv'
    ) -> tuple[str, str, str]:
        """Read the source CSV, save it as-is, then save an 80/20 train/test split.

        Args:
            source_path: CSV file to ingest. A relative path is resolved
                against the current working directory. Defaults to the
                project's loan dataset.

        Returns:
            A ``(train_data_path, test_data_path, raw_data_path)`` tuple of
            the files that were written.

        Raises:
            CustomException: Wraps any exception raised while reading,
                splitting, or writing the data.
        """
        logging.info("Data Ingestion started")
        try:
            df = pd.read_csv(source_path)
            logging.info("Read the dataset as pandas DataFrame")

            config = self.ingestion_config

            # Create the parent folder of every output file, not only the
            # train file, so paths overridden to different folders still work.
            # A bare filename has no parent folder, and os.makedirs('') would
            # raise, so that case is skipped.
            for output_path in (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            ):
                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            # Save raw data
            df.to_csv(config.raw_data_path, index=False, header=True)
            logging.info("Raw data saved")

            # A fixed random_state keeps the split reproducible between runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            # Save train and test sets
            train_set.to_csv(config.train_data_path, index=False, header=True)
            test_set.to_csv(config.test_data_path, index=False, header=True)

            logging.info("Train and test data saved")

            return (
                config.train_data_path,
                config.test_data_path,
                config.raw_data_path,
            )

        except Exception as e:
            raise CustomException(e, sys) from e

# Step 4: Run and test only data ingestion for now
if __name__ == "__main__":
    # Only the ingestion stage is wired up at this point; it returns the
    # paths of the three CSV files it wrote.
    ingestion = DataIngestion()
    written_paths = ingestion.initiate_data_ingestion()
    train_data_path, test_data_path, raw_data_path = written_paths

    # 🔒 Future steps (enable after building next modules)
    # data_transformation = DataTransformation()
    # data_transformation_config = DataTransformationConfig()
    # model_trainer = ModelTrainer()
    # model_trainer_config = ModelTrainerConfig()
    # train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data_path, test_data_path)
    # print(model_trainer.initiate_model_trainer(train_arr, test_arr))


# Second time: run the following code once you have created the data_transformation.py and model_trainer.py modules:


import os
import sys
# Add the project root (LR_BankLoan, two levels above this file) to sys.path so that `from src.exception import ...` works.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass

from src.exception import CustomException
from src.logger import logging

from src.components.data_transformation import DataTransformation
from src.components.data_transformation import DataTransformationConfig

from src.components.model_trainer import ModelTrainer
from src.components.model_trainer import ModelTrainerConfig

# Step 1: Define where to save files (using @dataclass)
@dataclass
class DataIngestionConfig:
    """File paths used by the data ingestion step.

    The defaults place all three CSV files in the ``artifacts`` folder; pass
    keyword arguments to the constructor to use other locations.
    """

    # Where the training split is written.
    train_data_path: str = os.path.join("artifacts", "train.csv")
    # Where the test split is written.
    test_data_path: str = os.path.join("artifacts", "test.csv")
    # Where the untouched copy of the input data is written.
    raw_data_path: str = os.path.join("artifacts", "data.csv")

# Step 2: Create a class for data ingestion
class DataIngestion:
    """Load the raw loan CSV and write raw, train, and test copies to disk.

    Output locations are taken from the ``DataIngestionConfig`` instance
    stored on ``self.ingestion_config``.
    """

    def __init__(self):
        self.ingestion_config = DataIngestionConfig()

    # Step 3: Define the ingestion process
    def initiate_data_ingestion(
        self, source_path: str = 'notebook/data/raw/loan_data.csv'
    ) -> tuple[str, str, str]:
        """Read the source CSV, save it as-is, then save an 80/20 train/test split.

        Args:
            source_path: CSV file to ingest. A relative path is resolved
                against the current working directory. Defaults to the
                project's loan dataset.

        Returns:
            A ``(train_data_path, test_data_path, raw_data_path)`` tuple of
            the files that were written.

        Raises:
            CustomException: Wraps any exception raised while reading,
                splitting, or writing the data.
        """
        logging.info("Data Ingestion started")
        try:
            df = pd.read_csv(source_path)

            logging.info("Read the Dataset as pandas dataframe")

            config = self.ingestion_config

            # Create the parent folder of every output file, not only the
            # train file, so paths overridden to different folders still work.
            # A bare filename has no parent folder, and os.makedirs('') would
            # raise, so that case is skipped.
            for output_path in (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            ):
                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            # Save the raw data
            df.to_csv(config.raw_data_path, index=False, header=True)
            logging.info("Raw data saved")

            # A fixed random_state keeps the split reproducible between runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            # Save the train and test sets
            train_set.to_csv(config.train_data_path, index=False, header=True)
            test_set.to_csv(config.test_data_path, index=False, header=True)

            logging.info("Train and test data saved")

            return (
                config.train_data_path,
                config.test_data_path,
                config.raw_data_path,
            )

        except Exception as e:
            raise CustomException(e, sys) from e

# Step 4: Test the DataIngestion class / Run this step and connect next steps
if __name__ == "__main__":
    # Stage 1: ingest the raw CSV and collect the paths of the written files.
    ingestion = DataIngestion()
    train_path, test_path, raw_path = ingestion.initiate_data_ingestion()

    # Step 5: Build the transformation stage and its config
    # (the config object is created here but not used further in this block).
    transformer = DataTransformation()
    transformer_config = DataTransformationConfig()

    # Step 6: Build the training stage and its config
    # (the config object is likewise not used further in this block).
    trainer = ModelTrainer()
    trainer_config = ModelTrainerConfig()

    # Transform first so the trainer receives the transformed arrays rather
    # than the raw CSV files; the third returned value is ignored.
    train_arr, test_arr, _ = transformer.initiate_data_transformation(train_path, test_path)

    # Print whatever the trainer returns.
    training_result = trainer.initiate_model_trainer(train_arr, test_arr)
    print(training_result)