In [1]:
import os

In [2]:
%pwd

'c:\\Users\\smith\\PycharmProjects\\End-to-End-Data-Science-Project-Using-MLOPS\\research'

In [3]:
# Move up to the project root only while the kernel is still inside the
# ``research`` folder, so re-running this cell does not climb one directory
# higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\smith\\PycharmProjects\\End-to-End-Data-Science-Project-Using-MLOPS'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage."""

    # Directory that receives the stage's output files (train.csv / test.csv).
    root_dir: Path
    # Location of the CSV file the stage reads as its input.
    data_path: Path

In [6]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file loaded into ``self.config``.
            params_filepath: YAML file loaded into ``self.params``.
            schema_filepath: YAML file loaded into ``self.schema``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Every stage writes below this directory, so make sure it exists.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from ``self.config``.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # The dataclass annotates both fields as Path; the values read from
        # YAML are presumably plain strings, so convert them explicitly.
        return DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )

In [8]:
import os
from mlproject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [9]:
class DataTransformation:
    """Encode the categorical ``Type`` column and write a train/test split.

    NOTE(review): a later cell in this notebook defines ``DataTransformation``
    again and shadows this definition; keep only one of them.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def label_encode_column(self, df):
        """Label-encode ``Type`` into ``PType`` and drop unused columns.

        Args:
            df: DataFrame containing the ``UDI``, ``Type`` and ``Product ID``
                columns.

        Returns:
            A DataFrame without ``UDI``, ``Type`` and ``Product ID`` and with
            the integer-coded ``PType`` column added.
        """
        # NOTE: ``LabelEncoder`` is imported in a later cell; that cell must
        # have run before this method is called.
        label_encoder = LabelEncoder()
        # The source column is "Type" (there is no "P_Type" column), and the
        # encoded values go into a separate column so that dropping the raw
        # "Type" column below does not discard the encoded feature.
        df["PType"] = label_encoder.fit_transform(df["Type"])
        logger.info("Label encoded for column: Type")
        df = df.drop(['UDI', 'Type', 'Product ID'], axis=1)
        logger.info("Dropped columns: UDI, Type, Product ID")
        return df

    def train_test_spliting(self):
        """Read the input CSV, encode it, split it, and write both parts.

        Writes ``train.csv`` and ``test.csv`` into ``self.config.root_dir``.
        """
        data = pd.read_csv(self.config.data_path)

        # Label encode the column
        data = self.label_encode_column(data)

        # Split the data into training and test sets. (0.75, 0.25) split.
        train, test = train_test_split(data)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, auc

In [31]:
class DataTransformation:
    """Encode the categorical ``Type`` column and write a train/test split."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def label_encode_column(self, df):
        """Label-encode ``Type`` into ``PType`` and drop unused columns.

        The input frame is left untouched; a new DataFrame is returned.

        Args:
            df: DataFrame containing the ``UDI``, ``Type`` and ``Product ID``
                columns.

        Returns:
            A new DataFrame without ``UDI``, ``Type`` and ``Product ID`` and
            with the integer-coded ``PType`` column added.
        """
        label_encoder = LabelEncoder()
        # ``assign`` returns a copy, so the caller's frame is not mutated.
        encoded = df.assign(PType=label_encoder.fit_transform(df['Type']))
        logger.info("Label encoded for column: Type")
        encoded = encoded.drop(['UDI', 'Type', 'Product ID'], axis=1)
        logger.info("Dropped columns: UDI, Type, Product ID")

        return encoded

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Read the input CSV, encode it, split it, and write both parts.

        Writes ``train.csv`` and ``test.csv`` into ``self.config.root_dir``.

        Args:
            test_size: Fraction of rows placed in the test set. The default of
                0.25 matches the split this method produced before the
                parameter existed.
            random_state: Seed forwarded to ``train_test_split`` so repeated
                runs write the same files; pass ``None`` for an unseeded
                shuffle.
        """
        data = pd.read_csv(self.config.data_path)

        # Label encode the column
        data = self.label_encode_column(data)

        # Seeded split so the written artifacts are reproducible across runs.
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)


In [32]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; a try/except that only re-raises adds nothing, so none is used.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()

[2024-05-27 21:13:35,383: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-27 21:13:35,384: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-27 21:13:35,386: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-27 21:13:35,387: INFO: common: created directory at: artifacts]
[2024-05-27 21:13:35,387: INFO: common: created directory at: artifacts/data_transformation]
[2024-05-27 21:13:35,401: INFO: 2949411736: Label encoded for column: Type]
[2024-05-27 21:13:35,402: INFO: 2949411736: Dropped columns: UDI, Type, Product ID]
[2024-05-27 21:13:35,431: INFO: 2949411736: Splited data into training and test sets]
[2024-05-27 21:13:35,432: INFO: 2949411736: (7500, 12)]
[2024-05-27 21:13:35,432: INFO: 2949411736: (2500, 12)]
(7500, 12)
(2500, 12)


In [33]:
import pandas as pd

class CSVReader:
    """Small helper for previewing the first rows of a CSV file."""

    def __init__(self, filepath):
        """Remember the CSV location; the file is read on each preview call."""
        self.filepath = filepath

    def get_head(self, n=5):
        """Load the CSV at ``self.filepath`` and return its first ``n`` rows.

        Args:
            n: Number of leading rows to return (forwarded to ``DataFrame.head``).

        Returns:
            A pandas DataFrame with the requested leading rows.
        """
        return pd.read_csv(self.filepath).head(n)


In [34]:
# Build the path with os.path.join so it resolves on non-Windows systems too
# (the backslash-separated literal only works on Windows).
df=CSVReader(os.path.join('artifacts', 'data_transformation', 'train.csv'))

In [35]:
# Display the first six rows of the file wrapped by ``df`` (a CSVReader).
df.get_head(n=6)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,PType
0,300.4,311.6,1200,76.6,3,1,0,0,1,0,0,1
1,298.0,307.6,1420,45.8,150,0,0,0,0,0,0,0
2,299.1,308.5,1587,35.6,2,0,0,0,0,0,0,2
3,301.6,310.7,1429,43.6,176,0,0,0,0,0,0,1
4,302.1,311.4,1686,30.9,106,0,0,0,0,0,0,1
5,299.0,310.2,1657,32.3,64,0,0,0,0,0,0,2
