In [1]:
import os

In [2]:
# Move from the notebook's folder up to the project root so that the
# `src.*` imports and the relative config/data paths used below resolve.
# The guard makes this cell safe to re-run: an unconditional relative chdir
# would climb two more levels on every execution.
if not os.path.isdir("src"):
    os.chdir("../../")

In [3]:
# Display the current working directory to confirm the chdir above landed where expected.
%pwd

'/Users/bhikipallai/Desktop/Projects/95Mobiles'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class CarDatSPlitConfig:
    """Immutable settings for the train/test split stage.

    Instances are frozen: assigning to a field after construction raises.
    """

    # --- locations -------------------------------------------------------
    # Working directory of the split stage.
    root_dir: Path
    # CSV holding the cleaned dataset that gets split.
    clean_data: Path
    # Output CSV for the training features.
    X_train: Path
    # Output CSV for the test features.
    X_test: Path
    # Output CSV for the training target.
    y_train: Path
    # Output CSV for the test target.
    y_test: Path

    # --- split parameters ------------------------------------------------
    # Share of rows held out for the test set.
    test_size: float
    # Seed that makes the shuffle reproducible.
    random_state: int

In [5]:
from src.car.utils.common import read_yaml,create_directory
from src.car.constants import *

[2024-10-01 11:16:33,087: INFO]: Logging file start


In [6]:
class ConfigurationManger:
    """Loads the YAML config and params files and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, param_filepath=PARAM_FILE_PATH):
        """Read both YAML files and create the top-level data directory.

        Args:
            config_filepath: Path of the YAML file with paths/locations.
            param_filepath: Path of the YAML file with tunable parameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(param_filepath)

        # Make sure the data root named in the config is present before any stage runs.
        create_directory([self.config.data_root])

    def get_data_split(self) -> CarDatSPlitConfig:
        """Assemble the settings for the data-split stage.

        Paths come from the ``car_data_split`` section of the config file and
        the split parameters from the ``split`` section of the params file.
        The stage's root directory is created as a side effect.

        Returns:
            A populated ``CarDatSPlitConfig``.
        """
        split_paths = self.config.car_data_split
        split_params = self.params.split

        create_directory([split_paths.root_dir])

        return CarDatSPlitConfig(
            root_dir=split_paths.root_dir,
            clean_data=split_paths.clean_data,
            X_train=split_paths.X_train,
            X_test=split_paths.X_test,
            y_train=split_paths.y_train,
            y_test=split_paths.y_test,
            test_size=split_params.test_size,
            random_state=split_params.random_state,
        )

In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [8]:
class CarDataSplit:
    """Split the cleaned car dataset into train/test CSV files.

    Every file location and both split parameters are read from the config
    object supplied at construction.
    """

    def __init__(self, config: "CarDatSPlitConfig"):
        # The annotation is quoted so that defining this class does not
        # require the config class to be bound at definition time.
        self.config = config

    def split_the_data(self, _legacy_self=None):
        """Read the cleaned CSV and return a train/test split.

        ``selling_price`` is the target; every other column is a feature.

        Args:
            _legacy_self: Ignored. This used to be a static method that was
                handed the instance explicitly (``obj.split_the_data(obj)``);
                the extra positional is still accepted so that call form
                keeps working.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        df = pd.read_csv(self.config.clean_data)

        X = df.drop(columns='selling_price')
        y = df['selling_price']

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.config.test_size,
            random_state=self.config.random_state,
        )

        return X_train, X_test, y_train, y_test

    def save_the_split(self):
        """Run the split and write the four parts to their configured CSV paths.

        The files are written without the pandas index. The feature column
        names of the training set are printed afterwards.
        """
        X_train, X_test, y_train, y_test = self.split_the_data()
        X_train.to_csv(self.config.X_train, index=False)
        X_test.to_csv(self.config.X_test, index=False)
        y_train.to_csv(self.config.y_train, index=False)
        y_test.to_csv(self.config.y_test, index=False)

        print(X_train.columns)

    def remove_unwanted_cols(self):
        """Strip the stray ``Unnamed: 0`` column from the saved feature CSVs.

        Both feature files are re-read, cleaned and written back in place,
        then the first row of each is printed. The drop tolerates a missing
        column, so calling this again (or on files that never had the
        column) is a no-op instead of a ``KeyError``.
        """
        X_train = pd.read_csv(self.config.X_train)
        X_test = pd.read_csv(self.config.X_test)

        # errors='ignore' keeps the step idempotent across re-runs.
        X_train = X_train.drop(columns='Unnamed: 0', errors='ignore')
        X_test = X_test.drop(columns='Unnamed: 0', errors='ignore')
        X_train.to_csv(self.config.X_train, index=False)
        X_test.to_csv(self.config.X_test, index=False)

        print("after clean:\n", X_train.head(1))
        print("after clean:\n", X_test.head(1))


In [9]:
# Run the split stage end to end: load the configuration, write the four
# split CSVs, then strip the stray index column from the feature files.
# There is deliberately no try/except here: a handler that only re-raises
# adds nothing but an extra traceback frame, so errors propagate as-is.
config = ConfigurationManger()
split_config = config.get_data_split()
car_split = CarDataSplit(config=split_config)
car_split.save_the_split()
car_split.remove_unwanted_cols()

[2024-10-01 11:16:33,864: INFO]: yaml: config/car_config.yaml loaded successfully
[2024-10-01 11:16:33,865: INFO]: yaml: params/car_params.yaml loaded successfully
[2024-10-01 11:16:33,865: INFO]: created directory at: data
[2024-10-01 11:16:33,866: INFO]: created directory at: data/car/data_split
Index(['Unnamed: 0', 'name', 'year', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')
after clean:
    name      year  km_driven  fuel  seller_type  transmission  owner
0  1151  0.690229  -0.990931     4            1             1      0
after clean:
    name      year  km_driven  fuel  seller_type  transmission  owner
0  1389  0.927485   -0.40854     1            2             0      0
