In [1]:
import os
import sys
# Append the project's `src` folder to the module search path.
# NOTE(review): this is a hard-coded absolute path that only exists on the
# original author's machine. The imports further down use the
# `src.mathematicsScore...` prefix, so confirm this entry is actually needed.
sys.path.append('C:/ALL_FROM_DESKTOP/Data_Science_ENDtoEND proj/proj_1/src')


In [2]:
%pwd

'c:\\ALL_FROM_DESKTOP\\Data_Science_ENDtoEND proj\\proj_1\\research'

In [3]:
# Move the working directory from the notebook's `research` folder up to the
# project root (`proj_1`), as the surrounding `%pwd` outputs show.
# NOTE(review): the absolute path is specific to one machine and must be
# edited before this notebook can run anywhere else.
os.chdir('c:\\ALL_FROM_DESKTOP\\Data_Science_ENDtoEND proj\\proj_1'
)

In [4]:
%pwd

'c:\\ALL_FROM_DESKTOP\\Data_Science_ENDtoEND proj\\proj_1'

# **entity/\_\_init\_\_.py**

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The rest of this notebook builds the config with
    ``DataIngestionConfig(root_dir=..., source_URL=...)`` and reads
    ``config.source_URL``, so that field must exist. ``rawdata_dir`` and
    ``artifacts_root`` keep their original names and positions but are now
    optional, so both the keyword form used in this notebook and the older
    three-positional-argument form construct successfully.

    Attributes:
        root_dir: Folder the ingestion stage writes its output files to.
        rawdata_dir: Folder holding the raw input files. Mirrors
            ``source_URL`` when only one of the two is supplied.
        artifacts_root: Top-level artifacts folder (optional).
        source_URL: Location the raw data is read from. Mirrors
            ``rawdata_dir`` when only one of the two is supplied.
    """

    root_dir: Path
    rawdata_dir: Optional[Path] = None
    artifacts_root: Optional[Path] = None
    source_URL: Optional[Path] = None

    def __post_init__(self):
        """Keep ``rawdata_dir`` and ``source_URL`` in sync when one is missing."""
        # The dataclass is frozen, so ordinary attribute assignment is blocked;
        # object.__setattr__ is the documented way to initialise fields here.
        if self.source_URL is None and self.rawdata_dir is not None:
            object.__setattr__(self, "source_URL", self.rawdata_dir)
        elif self.rawdata_dir is None and self.source_URL is not None:
            object.__setattr__(self, "rawdata_dir", self.source_URL)


# **config/configuration.py**

In [6]:
from src.mathematicsScore.constants import *
from src.mathematicsScore.utils.common import read_yaml, create_directories
from src.mathematicsScore.entity import DataIngestionConfig


In [7]:
class ConfigurationManager:
    """Holds the loaded config/params files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both settings files and register the artifacts root folder.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root from the config is passed to create_directories
        # (presumably so the folder exists before any stage runs).
        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Returns:
            A ``DataIngestionConfig`` whose ``root_dir`` is the section's
            ``root_dir`` and whose ``source_URL`` is the section's ``RAWDATA``.
        """
        ingestion_section = self.config.data_ingestion

        # Hand the stage's output folder to create_directories before use.
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.RAWDATA,
        )

# **components/data_ingestion.py**

In [8]:
import os
from src.mathematicsScore.logging import logger
from src.mathematicsScore.config.configuration import ConfigurationManager
from src.mathematicsScore.utils.common import get_size
from src.mathematicsScore.entity import DataIngestionConfig

Config file path: C:\ALL_FROM_DESKTOP\Data_Science_ENDtoEND proj\proj_1\config\config.yaml
Params file path: C:\ALL_FROM_DESKTOP\Data_Science_ENDtoEND proj\proj_1\params.yaml


In [None]:
import os
import urllib.request as request
from src.mathematicsScore.logging import logger
from src.mathematicsScore.utils.common import get_size
from pathlib import Path
from src.mathematicsScore.entity import DataIngestionConfig
import os
import pandas as pd


class DataIngestion:
    """Re-saves every CSV file from the configured raw-data folder into the stage output folder.

    The raw-data folder is ``config.source_URL`` and the output folder is
    ``config.root_dir``. Each CSV is loaded with pandas and written back out
    without the index, so an existing file of the same name in the output
    folder is overwritten.
    """

    def __init__(self, config: DataIngestionConfig):
        """Store the configuration and remember the raw-data folder path.

        Args:
            config: Stage configuration; ``source_URL`` and ``root_dir`` are read from it.
        """
        self.config = config
        self.rawdata_folder_path = self.config.source_URL

    def check_rawdata_folder_exists(self):
        """Verify that the raw-data path is an existing directory.

        Raises:
            FileNotFoundError: If the path is missing or is not a directory.
        """
        # isdir (not exists) so that a regular file at this path is rejected here
        # rather than failing later inside os.listdir.
        if not os.path.isdir(self.rawdata_folder_path):
            raise FileNotFoundError(f"The folder 'RAWDATA' does not exist at path: {self.rawdata_folder_path}")

    def list_files_in_rawdata(self):
        """Return the entry names of the raw-data folder in sorted order.

        os.listdir yields entries in arbitrary order; sorting keeps the
        processing order (and the returned summary) stable between runs.
        """
        return sorted(os.listdir(self.rawdata_folder_path))

    def read_csv_file(self, file_path):
        """Load one CSV file into a pandas DataFrame.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The DataFrame, or ``None`` when reading fails (the error is logged).
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            # Best effort by design: one unreadable file must not abort the batch.
            logger.error(f"Error reading {file_path}: {e}")
            return None

    def ingest_data_from_rawdata_folder(self):
        """Copy all CSV files from the raw-data folder into the output folder.

        Returns:
            A list with one dict per successfully processed file, holding the
            keys ``filename``, ``path``, ``rows``, ``columns`` and ``dataframe``.

        Raises:
            FileNotFoundError: If the raw-data folder does not exist.
        """
        self.check_rawdata_folder_exists()

        # Make sure the destination exists so to_csv cannot fail on a missing folder.
        os.makedirs(self.config.root_dir, exist_ok=True)

        processed_files = []

        for file in self.list_files_in_rawdata():
            file_path = os.path.join(self.rawdata_folder_path, file)

            # Only regular files with a .csv extension (any letter case) are ingested;
            # sub-directories and other file types are ignored.
            if not (file.lower().endswith('.csv') and os.path.isfile(file_path)):
                continue

            logger.info(f"Reading data from {file_path}")
            df = self.read_csv_file(file_path)

            if df is None:
                logger.error(f"Skipping {file} due to read error.")
                continue

            output_file_path = os.path.join(self.config.root_dir, file)

            # An existing output file is replaced; warn so the overwrite is visible in the log.
            if os.path.exists(output_file_path):
                logger.warning(f"{file} already exists in the output folder. It will be overwritten.")

            df.to_csv(output_file_path, index=False)
            logger.info(f"Saved {file} to {output_file_path} with {len(df)} rows")

            processed_files.append({
                'filename': file,
                'path': output_file_path,
                'rows': len(df),
                'columns': len(df.columns),
                'dataframe': df
            })

        logger.info(f"Data ingestion completed. Processed {len(processed_files)} files.")
        return processed_files


Reading data from C:/ALL_FROM_DESKTOP/Data_Science_ENDtoEND proj/proj_1\RAWDATA\raw_data.csv
Reading data from C:/ALL_FROM_DESKTOP/Data_Science_ENDtoEND proj/proj_1\RAWDATA\test_data.csv
Reading data from C:/ALL_FROM_DESKTOP/Data_Science_ENDtoEND proj/proj_1\RAWDATA\train_data.csv
   gender race_ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  math_score  reading_score  writing_score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                

# **pipeline/stage_01_data_ingestion.py**

In [10]:
from src.mathematicsScore.config.configuration import ConfigurationManager
from src.mathematicsScore.entity import DataIngestionConfig
from src.mathematicsScore.logging import logger

In [11]:
# Build the data-ingestion configuration and log it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    logger.info(f"Data ingestion config: {data_ingestion_config}")
except Exception as e:
    # Log with the traceback and re-raise: swallowing the error would let the
    # stage look successful even though `data_ingestion_config` was never created.
    logger.exception(f"Error occurred: {e}")
    raise