In [1]:
import os

In [2]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\MLOPS\\ML-Approach-for-Predict-Cancellation-Prevent-Loss-with-MLflow\\research'

In [3]:
# Move one level up so that relative paths and the `src` package imports used below resolve.
# NOTE(review): this is relative to the current directory, so re-running the cell without
# restarting the kernel climbs one more level each time — run it exactly once per session.
os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd 

'd:\\MLOPS\\ML-Approach-for-Predict-Cancellation-Prevent-Loss-with-MLflow'

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of paths used by the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that receives the stage's output files.
    root_dir: Path
    # Raw CSV that the preprocessing step reads.
    data_path: str
    # Location of the cleaned (preprocessed) CSV.
    preprocessing_data: str
    # Path of the training split handed back to callers.
    train: Path
    # Path of the test split handed back to callers.
    test: Path

In [7]:
from src.constants import *
from src.utils.common import *

In [8]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Parse the config and schema files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root named in the config is handed to create_directories.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        The stage's ``root_dir`` is passed to ``create_directories`` before
        the settings object is assembled.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the loaded configuration.
        """
        section = self.config.data_transformation
        stage_root = section.root_dir
        create_directories([stage_root])

        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=section.data_path,
            preprocessing_data=section.preprocessing_data,
            train=section.train,
            test=section.test,
        )

In [6]:
import os
import sys
from src.utils.logger import logging
from src.utils.exception import CustomException
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import  train_test_split
import pandas as pd
import numpy as np

In [10]:
class DataTransformation:
    """Cleans the raw booking data, splits it and encodes it into model-ready arrays.

    All file locations come from the configuration object given at construction
    (``data_path``, ``preprocessing_data``, ``root_dir``, ``train``, ``test``).
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing the path attributes listed on the class.
                (The original signature used the config *class* as a default
                value, which could never work; it is a type annotation now.)
        """
        self.config = config

    def save_preprocessing_data(self):
        """Clean the raw CSV and write the result to ``config.preprocessing_data``.

        Steps: encode ``booking status`` as 1 (``Not_Canceled``) / 0 (anything
        else), drop rows whose ``date of reservation`` contains ``-`` or is
        missing, derive a ``month`` column from the date, and drop the
        ``Booking_ID`` and ``date of reservation`` columns.

        Returns:
            str: A fixed success message.

        Raises:
            CustomException: Wrapping any error raised while reading,
                transforming or writing the data.
        """
        try:
            raw_df = pd.read_csv(self.config.data_path)

            # Dates containing "-" are filtered out before parsing. A missing
            # date cannot yield a month either, so na=True drops those rows
            # too instead of failing on the boolean negation.
            bad_date = raw_df["date of reservation"].str.contains("-", regex=False, na=True)
            # .copy() so the column assignments below act on an independent frame.
            clean_df = raw_df.loc[~bad_date].copy()

            clean_df["booking status"] = (
                clean_df["booking status"] == "Not_Canceled"
            ).astype("int64")

            reservation_date = pd.to_datetime(clean_df["date of reservation"])
            clean_df["month"] = reservation_date.dt.month

            clean_df = clean_df.drop(columns=["Booking_ID", "date of reservation"])

            clean_df.to_csv(self.config.preprocessing_data, index=False, header=True)
            logging.info("Saving pre processing data")

            return "Preprocessing data saved successfully"

        except Exception as e:
            logging.error("Error occured in saving preprocess data")
            raise CustomException(e,sys) from e

    def get_data_transformer_object(self):
        """Regenerate the preprocessed CSV and build the (unfitted) preprocessor.

        Column roles are taken from the preprocessed file: ``object`` columns
        are one-hot encoded, all other feature columns are standardized.

        Returns:
            sklearn.pipeline.Pipeline: Single-step pipeline wrapping the
            ``ColumnTransformer``.

        Raises:
            CustomException: Wrapping any error raised along the way.
        """
        try:
            self.save_preprocessing_data()

            df = pd.read_csv(self.config.preprocessing_data)
            features = df.drop(columns=["booking status"])

            num_feature = features.select_dtypes(exclude="object").columns
            cat_feature = features.select_dtypes(include="object").columns

            numeric_transformer = StandardScaler()
            # handle_unknown="ignore": a category that only occurs in the test
            # split must not make transform() raise.
            oh_transformer = OneHotEncoder(handle_unknown="ignore")

            data_transformer = ColumnTransformer(
                [
                    ("OneHotEncoder", oh_transformer, cat_feature),
                    ("StandardScaler", numeric_transformer, num_feature),
                ],
                # Always return a dense array; the caller concatenates the
                # result with the target column via np.c_.
                sparse_threshold=0,
            )

            return Pipeline(steps=[("data_transformer", data_transformer)])

        except Exception as e:
            logging.error(f"Error in creating data transformation object: {e}")
            raise CustomException(e,sys) from e

    def train_test_split(self, test_size=0.25, random_state=42):
        """Split the preprocessed data and write both parts as CSV files.

        The files are written to ``config.train`` and ``config.test``, i.e. to
        the very paths that are returned. The preprocessed CSV is generated
        first if it does not exist yet.

        Args:
            test_size: Fraction (or absolute number) of rows for the test set;
                0.25 matches scikit-learn's default.
            random_state: Seed for the shuffle so the split is reproducible.

        Returns:
            tuple: ``(config.train, config.test)``.

        Raises:
            CustomException: Wrapping any error raised along the way.
        """
        try:
            if not os.path.exists(self.config.preprocessing_data):
                self.save_preprocessing_data()

            df = pd.read_csv(self.config.preprocessing_data)
            logging.info("Reading dataset as dataframe")

            logging.info("Initiate splitting dataset as train & test set")
            # Inside a method body this name resolves to the module-level
            # scikit-learn function, not to this method.
            train_set, test_set = train_test_split(
                df, test_size=test_size, random_state=random_state
            )

            for target_path, frame in (
                (self.config.train, train_set),
                (self.config.test, test_set),
            ):
                os.makedirs(os.path.dirname(target_path) or ".", exist_ok=True)
                frame.to_csv(target_path, index=False)

            logging.info("Dataset splitted into train and test set")
            logging.info(f"{train_set.shape}")
            logging.info(f"{test_set.shape}")

            return (
                self.config.train,
                self.config.test
            )

        except Exception as e:
            logging.error("Error in splitting train and test set")
            raise CustomException(e,sys) from e

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the train split and encode both splits.

        The target column (``booking status``) is appended as the last column
        of each resulting array. Both arrays are also saved as
        ``train_arr.npy`` / ``test_arr.npy`` under ``config.root_dir``.

        Args:
            train_path: CSV file holding the training split.
            test_path: CSV file holding the test split.

        Returns:
            tuple: ``(train_arr, test_arr)`` as NumPy arrays.

        Raises:
            CustomException: Wrapping any error raised along the way.
        """
        try:
            train_set = pd.read_csv(train_path)
            test_set = pd.read_csv(test_path)
            logging.info("Reading train and test dataset completed")

            logging.info("Obtaining preprocessor object")
            preprocessor_obj = self.get_data_transformer_object()

            target_feature = "booking status"

            logging.info("Dropping target feature from train and test dataframe")
            input_train_df = train_set.drop(columns=[target_feature])
            target_train_df = train_set[target_feature]

            input_test_df = test_set.drop(columns=[target_feature])
            target_test_df = test_set[target_feature]

            logging.info("Applying preprocessor object on training and test dataframe")

            # Fit on the training split only; the test split is just transformed.
            train_df = preprocessor_obj.fit_transform(input_train_df)
            test_df = preprocessor_obj.transform(input_test_df)

            logging.info("Successfully applied preprocessor object on train and test data")

            train_arr = np.c_[train_df, np.array(target_train_df)]
            test_arr = np.c_[test_df, np.array(target_test_df)]
            logging.info("Returning train and test array")

            logging.info("Saving train and test set in numpy file")
            np.save(os.path.join(self.config.root_dir,"train_arr.npy"), train_arr)
            np.save(os.path.join(self.config.root_dir,"test_arr.npy"), test_arr)

            return (
                train_arr,
                test_arr
            )

        except Exception as e:
            logging.error("Error occured in data transformation")
            raise CustomException(e,sys) from e


In [11]:
# Run the data-transformation stage end to end: load config, preprocess, split, encode.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # Run first: besides building the preprocessor, this call writes the
    # preprocessed CSV that train_test_split() reads.
    preprocessor = data_transformation.get_data_transformer_object()
    train_path,test_path = data_transformation.train_test_split()
    train_arr, test_arr = data_transformation.initiate_data_transformation(train_path,test_path)
    
except Exception as e:
    # Surface any failure through the project's exception type
    # (presumably it adds traceback details taken from sys — confirm in src.utils.exception).
    raise CustomException(e,sys)

In [7]:
# Load the encoded test array saved by initiate_data_transformation
# (assumes the stage's root_dir is artifacts/data_transformation — confirm in the config file).
p = "artifacts/data_transformation/test_arr.npy"
n = np.load(p)

In [12]:
# (rows, columns) of the loaded test array.
n.shape

(9062, 29)

In [11]:
# The target was appended as the last column when the array was built
# (it would be n[:, -1]); keep every column except that one as features.
x = n[:, :-1]
x.shape

(9062, 28)

In [15]:
# Peek at the first three rows of the loaded array.
n[0:3]

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.30033044, -0.26057114, -0.92627015, -0.14534151,
        -0.1806325 ,  1.0989402 , -0.16149425, -0.06000587, -0.08872295,
        -0.50567843, -0.78614507,  0.18560467,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.30033044, -0.26057114, -0.92627015, -0.85792117,
        -0.1806325 ,  1.55187113, -0.16149425, -0.06000587, -0.08872295,
         0.1367181 , -0.78614507,  0.51101733,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  