In [1]:
import os

In [2]:
%pwd

'c:\\Users\\PASCAL\\Student_Performance_Prediction\\research'

In [3]:
# Step up from research/ to the project root so that the relative
# "artifacts/..." and config paths used below resolve.
# Guarded on the directory name so that re-running this cell does not climb
# one more level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PASCAL\\Student_Performance_Prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings for the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises.
    The annotations are not enforced at runtime, so whatever values the
    caller passes in are stored as given.
    """

    # Working directory of the stage.
    root_dir: Path
    # Presumably the location of the input data -- confirm against config.yaml.
    data_path: Path
    # Presumably where the fitted preprocessor is stored -- confirm against config.yaml.
    preprocessor_path: Path

In [6]:
from studentPerformance.constants import *
from studentPerformance.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and set up the top-level artifacts directory.

        Args:
            config_filepath: YAML file with the stage configuration.
            params_filepath: YAML file with the parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is passed to create_directories before any
        # stage-specific directory is requested.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the settings of the data-transformation stage.

        The stage's ``root_dir`` is passed to ``create_directories`` first, and
        the values of the ``data_transformation`` section are then copied into
        a ``DataTransformationConfig`` unchanged.
        """
        stage_settings = self.config.data_transformation

        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
            preprocessor_path=stage_settings.preprocessor_path,
        )

In [8]:
import os
from src.studentPerformance.logger import logging
import sys
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
import pickle

In [9]:


class DataTransformation:
    """Builds, fits and persists the preprocessing pipeline for the student data.

    The stage configuration passed to the constructor is kept on
    ``self.config``.

    NOTE(review): the methods below work with hard-coded ``artifacts/...``
    paths and never read ``self.config`` -- confirm whether ``data_path`` and
    ``preprocessor_path`` were meant to drive them.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def get_data_transformer_obj(self):
        '''
        Build the (unfitted) preprocessing object.

        Numerical columns are median-imputed and standard-scaled. Categorical
        columns are imputed with their most frequent value, ordinal-encoded
        using the explicit category lists below, and then standard-scaled.

        Returns:
            ColumnTransformer: the two pipelines bound to their columns.

        Raises:
            Exception: any error raised while assembling the pipelines is
                logged and then re-raised, so callers never receive ``None``.
        '''
        try:
            # Columns handled by the numerical and the categorical pipeline
            numerical_columns = ['writing_score', 'reading_score']
            categorical_columns = [
                'gender',
                'race_ethnicity',
                'parental_level_of_education',
                'lunch',
                'test_preparation_course'
            ]

            # Category order handed to the ordinal encoder for each column.
            # NOTE(review): these lists do not look like a monotonic ranking
            # (e.g. "some college" sits between the bachelor's and master's
            # degrees) -- confirm the order is intentional.
            gender = ['female', 'male']
            race_ethnicity = ['group B', 'group C', 'group A', 'group D', 'group E']
            parental_level_of_education = [
                "bachelor's degree",
                "some college",
                "master's degree",
                "associate's degree",
                "high school",
                "some high school",
            ]
            lunch = ['standard', 'free/reduced']
            test_preparation_course = ['none', 'completed']

            # Numerical Pipeline
            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]
            )

            # Categorical Pipeline.
            # The category lists must be given in the same order as
            # categorical_columns above.
            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder(categories=[
                        gender,
                        race_ethnicity,
                        parental_level_of_education,
                        lunch,
                        test_preparation_course,
                    ])),
                    ('scaler', StandardScaler()),
                ]
            )

            logging.info(f'Categorical Columns : {categorical_columns}')
            logging.info(f'Numerical Columns   : {numerical_columns}')

            preprocessor = ColumnTransformer(
                [
                    ('num_pipeline', num_pipeline, numerical_columns),
                    ('cat_pipeline', cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor

        except Exception as e:
            logging.error(f"Error in get_data_transformer_object: {str(e)}")
            # Re-raise: returning None here would only resurface later as an
            # unrelated AttributeError in the caller.
            raise

    def initiate_data_transformation(self):
        '''
        Fit the preprocessor on the training split and transform both splits.

        Reads the train and test CSV files, separates the ``math_score``
        target, fits the preprocessing object on the training features,
        applies it to the test features, appends the target as the last
        column of each array and pickles the fitted preprocessing object.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessing_obj_file)`` -- the two
            transformed arrays (features followed by the target column) and
            the path the preprocessing object was written to.

        Raises:
            Exception: any error (missing file, missing column, category not
                listed for the encoder, ...) is logged and then re-raised.
        '''
        try:
            # NOTE(review): hard-coded; self.config.data_path is not consulted.
            train_data_path = 'artifacts/data_ingestion/unzipped_data/train_data.csv'
            test_data_path = 'artifacts/data_ingestion/unzipped_data/test_data.csv'

            # Read training and test data
            train_df = pd.read_csv(train_data_path)
            test_df = pd.read_csv(test_data_path)

            logging.info('Read train and test data completed')
            logging.info(f'Train Dataframe Head : \n{train_df.head().to_string()}')
            logging.info(f'Test Dataframe Head  : \n{test_df.head().to_string()}')

            logging.info('Obtaining preprocessing object')

            preprocessing_obj = self.get_data_transformer_obj()

            target_column_name = 'math_score'

            # Separate input features and target features
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            # Fit on the training features only; the test features are
            # transformed with the statistics learned from the training split.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Combine input features and target features (target becomes the last column)
            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]

            test_arr = np.c_[
                input_feature_test_arr, np.array(target_feature_test_df)
            ]

            # Save preprocessing object.
            # NOTE(review): hard-coded; self.config.preprocessor_path is not consulted.
            preprocessing_obj_file = os.path.join("artifacts", 'data_transformation', 'preprocessing_obj.pkl')
            # Do not rely on another component having created the directory.
            os.makedirs(os.path.dirname(preprocessing_obj_file), exist_ok=True)
            with open(preprocessing_obj_file, 'wb') as file:
                pickle.dump(preprocessing_obj, file)

            logging.info("Saved preprocessing object.")
            logging.info("Transformation of the data is completed")

            return (
                train_arr,
                test_arr,
                preprocessing_obj_file
            )
        except Exception as e:
            logging.error(f"Error in initiate_data_transformation: {str(e)}")
            # Re-raise so the failure is visible to the caller instead of an
            # implicit None return.
            raise
        


In [10]:
# Run the data-transformation stage end to end.
# No try/except here: a handler that only re-raises adds nothing, so errors
# are simply left to propagate with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(data_transformation_config)
# Bind the result so it can be inspected in later cells (and so the cell does
# not echo it as output).
transformation_result = data_transformation.initiate_data_transformation()

[2023-08-24 02:52:40,253: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-24 02:52:40,254: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-24 02:52:40,256: INFO: common: created directory at: artifacts]
[2023-08-24 02:52:40,269: INFO: common: created directory at: artifacts/data_transformation]
[2023-08-24 02:52:40,269: INFO: 3862503345: Read train and test data completed]
[2023-08-24 02:52:40,270: INFO: 3862503345: Obtaining preprocessing object]
[2023-08-24 02:52:40,299: INFO: 3862503345: Read train and test data completed]
[2023-08-24 02:52:40,417: INFO: 3862503345: Train Dataframe Head : 
   gender race_ethnicity parental_level_of_education         lunch test_preparation_course  math_score  reading_score  writing_score
0  female        group D             master's degree      standard                    none          62             70             75
1  female        group C           bachelor's degree  free/reduced               complete