In [1]:
import os

In [2]:
pwd

'c:\\Users\\Robin Aluma\\Desktop\\Car_Price_Pred\\Resources'

In [3]:
os.chdir('../')

In [4]:
from dataclasses import dataclass
from pathlib import Path
from src.Car_Price_Pred.constants import *
import yaml
from src.Car_Price_Pred.utils.common import read_yaml,create_directories
import pandas as pd
from Exceptions import CustomException
import sys

@dataclass
class DataTransformationConfig():
    # Data transformation configuration done here
    root_url: Path
    data_dir: Path
    
    

In [5]:
# Defining the configuration manager
class TransformationConfigurationManager:
    def __init__(self, config_file_path = CONFIG_FILE_PATH,):
        self.config = read_yaml(config_file_path)
        
        # creating folder
        create_directories([self.config.Artifacts_root])
    # Accesing the configuration manager
    
    def get_data_transformation_config(self)->DataTransformationConfig:
        config = self.config.data_transformation
        
        create_directories([config.root_url])
        
        get_data_transformation_config = DataTransformationConfig(
            root_url = config.root_url,
            data_dir = config.data_dir
        )
        
        return get_data_transformation_config

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
import numpy as np

In [7]:
class DataTransformation():
    def __init__(self,config:DataTransformationConfig):
        self.config = config
        
        
    def get_data_transformer(self):
        taxi_data_path = self.config.data_dir
        folder_path = self.config.root_url
        
        taxi_data = pd.read_csv(taxi_data_path)
        x = taxi_data.iloc[:,0:-1]
        y = taxi_data.iloc[:,-1]
        
        x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)
        numerical_columns = x.select_dtypes(exclude=['object']).columns
        categorical_columns = x.select_dtypes(include=['object']).columns
        

        imputer = KNNImputer(n_neighbors=2)
        y_train = imputer.fit_transform(y_train)
            
        transformer_categorical = ColumnTransformer(
          transformers = [
             ('categorical_pipeline', Pipeline([
                 ('imputer', SimpleImputer(strategy='most_frequent')),
                 ('encoder', OrdinalEncoder()),
                ('scaler', StandardScaler())
                 ]), categorical_columns)])

        transformer_numerical = ColumnTransformer(
          transformers = [
             ('numerical_pipeline', Pipeline([
                 ('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())
                      ]), numerical_columns)])
        
        preprocessor = ColumnTransformer([
             ('categorical', transformer_categorical, categorical_columns),
             ('numerical', transformer_numerical, numerical_columns)])
        
        
        
        
        scaled_x_train = preprocessor.fit_transform(x_train)
        scaled_x_test = preprocessor.transform(x_test)
        
        scaled_train_df = pd.DataFrame(scaled_x_train,columns = preprocessor.get_feature_names_out())
        scaled_test_df = pd.DataFrame(scaled_x_test,columns = preprocessor.get_feature_names_out())
        
        scaled_train_array = np.c_[scaled_train_df,np.array(y_train)]
        scaled_test_array = np.c_[scaled_test_df,np.array(y_test)]
        
        scaled_train = pd.DataFrame(scaled_train_array)
        scaled_test = pd.DataFrame(scaled_test_array)
        
        scaled_train.to_csv(os.path.join(folder_path,'train_data.csv'),index=False)
        scaled_test.to_csv(os.path.join(folder_path,'test_data.csv'),index=False)
        
    
    

In [8]:
# Trnasformation pipeline
try:
    config = TransformationConfigurationManager()
    get_data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=get_data_transformation_config)
    data_transformation.get_data_transformer()
except Exception as e:
    raise CustomException(e,sys)

[ 2025-01-23 02:10:59,661 : common : INFO : Creates directory for path;artifacts in file paths :['artifacts'] ]
[ 2025-01-23 02:10:59,662 : common : INFO : Creates directory for path;artifacts/data_transformation in file paths :['artifacts/data_transformation'] ]
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Robin Aluma\AppData\Local\Temp\ipykernel_18940\2003774241.py", line 6, in <module>
    data_transformation.get_data_transformer()
  File "C:\Users\Robin Aluma\AppData\Local\Temp\ipykernel_18940\1093487727.py", line 20, in get_data_transformer
    y_train = imputer.fit_transform(y_train)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Robin Aluma\Desktop\Car_Price_Pred\Car_price\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Robin Aluma\Desktop\Car_Price_Pred\Car_price\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Robin Aluma\Desktop\Car_Price_Pred\Car_price\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
       