In [2]:
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import os

In [32]:
from sklearn.model_selection import train_test_split

In [3]:
@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path = os.path.join("artifacts", "preprocessor.pkl")

In [35]:
TARGET_COLUMN_NAME = "price"

In [4]:
data_transformation_config = DataTransformationConfig()

In [14]:
df = pd.read_csv(r"Dataset\Clean_Dataset.csv")

In [22]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [15]:
df.drop_duplicates(keep = 'first', inplace = True, ignore_index = True)

In [16]:
categorical_columns = list(df.loc[:,df.dtypes == 'object'].columns)

In [19]:
df[categorical_columns].head(2)

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy


In [24]:
num_pipeline = Pipeline(
                steps = [
                ("imputer" , SimpleImputer(strategy = "median")),
                ("scaler" , StandardScaler())
                ]
            )

In [25]:
type(num_pipeline)

sklearn.pipeline.Pipeline

In [26]:
num_pipeline

In [27]:
cat_pipeline = Pipeline(
                steps= [
                ("imputer" , SimpleImputer(strategy= "most_frequent")),
                ("one_hot_encoder" , OneHotEncoder()),
                ("scaler", StandardScaler(with_mean=False))
                ] 
            )

In [28]:
cat_pipeline

In [41]:
preprocessor = ColumnTransformer(
                [
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

In [30]:
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer

In [31]:
preprocessor

In [33]:
train_df, test_df = train_test_split(df, test_size = 0.25, random_state=12)

In [34]:
print(train_df.shape)
print(test_df.shape)

(225114, 11)
(75039, 11)


In [36]:
train_df_independent_features = train_df.drop([TARGET_COLUMN_NAME], axis = 1)
train_df_target_feature = train_df[TARGET_COLUMN_NAME]

In [37]:
test_df_independent_features = test_df.drop([TARGET_COLUMN_NAME], axis = 1)
test_df_target_feature = test_df[TARGET_COLUMN_NAME]

In [39]:
numerical_columns = list(test_df_independent_features.columns)
for i in categorical_columns:
    if i in numerical_columns:
        numerical_columns.remove(i)
                    
df[numerical_columns].head(2)

Unnamed: 0,duration,days_left
0,2.17,1
1,2.33,1


In [42]:
input_feature_train_arr = preprocessor.fit_transform(train_df_independent_features)

In [45]:
np.c_[input_feature_train_arr, np.array()]

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 225114

In [47]:
np.array(train_df_target_feature)

array(<225114x1592 sparse matrix of type '<class 'numpy.float64'>'
	with 2251140 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [None]:
input_feature_test_arr = preprocessor.transform(test_df_independent_features)

In [48]:
np.c_[np.array([1,2,3]), np.array([4,5,6])]

array([[1, 4],
       [2, 5],
       [3, 6]])

In [49]:
np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]

array([[1, 2, 3, 0, 0, 4, 5, 6]])