In [1]:
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import os

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
@dataclass
class DataTransformationConfig:
    """Configuration values for the data-transformation step.

    Attributes:
        preprocessor_obj_file_path: Path of the preprocessor object file.
            Defaults to ``preprocessor.pkl`` inside the ``artifacts``
            directory, joined with the platform's path separator.
    """

    # The type annotation is what makes this a real dataclass field. Without
    # it, @dataclass ignores the attribute: it is missing from __init__ and
    # __repr__, and cannot be overridden per instance via the constructor.
    preprocessor_obj_file_path: str = os.path.join("artifacts", "preprocessor.pkl")

In [4]:
# Name of the column used as the prediction target; later cells use it to
# separate the target from the independent features.
TARGET_COLUMN_NAME = "price"

In [5]:
# Instantiate the transformation config with its default values.
# NOTE(review): this object is not referenced again in the visible cells.
data_transformation_config = DataTransformationConfig()

In [6]:
# Load the dataset. The path is assembled with os.path.join so the notebook
# runs on any OS; a hard-coded backslash separator only resolves on Windows.
df = pd.read_csv(os.path.join("Dataset", "Clean_Dataset.csv"))

In [7]:
# Remove the 'Unnamed: 0' column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [37]:
# Remove the 'flight' column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=['flight'], errors='ignore')

In [8]:
# Show the frame's (rows, columns) dimensions before duplicate removal.
# NOTE(review): the recorded output appears to predate the 'flight' drop cell
# (which has a later execution count), so a fresh Run All may report one
# column fewer -- confirm by re-running.
df.shape

(300153, 11)

In [9]:
# De-duplicate rows in place: keep the first occurrence of each duplicated row
# and renumber the index from 0.
df.drop_duplicates(
    keep='first',
    ignore_index=True,
    inplace=True,
)

In [10]:
# Show the (rows, columns) dimensions again after duplicate removal.
df.shape

(300153, 11)

In [38]:
# Collect, in frame order, the names of the columns whose dtype compares equal
# to 'object'; these are treated as the categorical features.
categorical_columns = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'object'
]

In [39]:
# Preview the first two rows of the categorical columns.
df[categorical_columns].head(2)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy


In [14]:
# Collect, in frame order, the names of the columns whose dtype does not
# compare equal to 'object'. At this point the list still includes the target.
numerical_columns = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype != 'object'
]

In [15]:
# Exclude the target from the numerical feature list. Rebuilding the list
# (instead of list.remove) keeps the cell re-runnable -- remove() raises
# ValueError on a second run -- and TARGET_COLUMN_NAME replaces the duplicated
# 'price' literal so the target is named in one place only.
numerical_columns = [
    column_name
    for column_name in numerical_columns
    if column_name != TARGET_COLUMN_NAME
]

In [16]:
# Display the numerical feature names that remain after removing the target.
numerical_columns

['duration', 'days_left']

In [40]:
# Numerical feature pipeline:
#   1. "imputer" - fill missing values with the column median
#   2. "scaler"  - standardize each column
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ],
)

In [19]:
# Check the type of the numerical pipeline object.
type(num_pipeline)

sklearn.pipeline.Pipeline

In [20]:
# Display the numerical pipeline.
num_pipeline

In [41]:
# Categorical feature pipeline:
#   1. "imputer"         - fill missing values with the most frequent category
#   2. "one_hot_encoder" - expand each category into indicator columns
#   3. "scaler"          - scale the indicator columns without centering
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that was absent when the encoder
        # was fitted is encoded as all zeros at transform time instead of
        # raising, so the fitted preprocessor survives unseen values in the
        # test split or in later data.
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
        # with_mean=False skips centering, which StandardScaler cannot apply
        # to the sparse matrix the encoder produces by default.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

In [22]:
# Display the categorical pipeline.
cat_pipeline

In [42]:
# Combine both pipelines: each one is applied only to its own list of columns.
# sparse_threshold=0 is passed so that the combined output is a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ],
    sparse_threshold=0,
)

In [25]:
# Check the type of the combined preprocessor object.
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer

In [26]:
# Display the combined preprocessor.
preprocessor

In [None]:
# The argument-less `preprocessor.fit_transform()` call that used to live in
# this cell was removed: fit_transform requires the input data, so the call
# raised TypeError and stopped a Restart & Run All. The preprocessor is fitted
# further down, on the training features only, after the train/test split.

In [43]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the
# split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.25,
    random_state=12,
)

In [44]:
# Print the train and test dimensions, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(225114, 10)
(75039, 10)


In [45]:
# Separate the training frame into the target series and the remaining
# (independent) feature columns.
train_df_target_feature = train_df[TARGET_COLUMN_NAME]
train_df_independent_features = train_df.drop(columns=[TARGET_COLUMN_NAME])

In [46]:
# Separate the test frame into the target series and the remaining
# (independent) feature columns.
test_df_target_feature = test_df[TARGET_COLUMN_NAME]
test_df_independent_features = test_df.drop(columns=[TARGET_COLUMN_NAME])

In [47]:
# Fit the preprocessor on the training features and transform them in one
# step. Only the training split is used for fitting; the test split is
# transformed later with the already-fitted preprocessor.
input_feature_train_arr = preprocessor.fit_transform(train_df_independent_features)

In [34]:
# Number of rows in the transformed training features.
len(input_feature_train_arr)

225114

In [32]:
# Check the container type returned by the preprocessor for the training data.
type(input_feature_train_arr)

numpy.ndarray

In [48]:
# Transform the test features with the preprocessor fitted on the training
# data (transform only -- no refit on the test split).
input_feature_test_arr = preprocessor.transform(test_df_independent_features)

In [49]:
# Number of rows in the transformed test features.
len(input_feature_test_arr)

75039

In [50]:
# Check the container type returned by the preprocessor for the test data.
type(input_feature_test_arr)

numpy.ndarray

In [56]:
# Append the target as the last column of each transformed feature matrix so
# that features and target travel together in a single 2-D array.
train_arr = np.column_stack((input_feature_train_arr, train_df_target_feature))
test_arr = np.column_stack((input_feature_test_arr, test_df_target_feature))

In [55]:
# Split each combined array back apart: every column except the last holds the
# features, and the last column holds the target.
X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

In [58]:
# Display the training target values (last column of train_arr).
y_train

array([62188., 66928.,  7877., ...,  7490., 60508., 49725.])