In [2]:
import os
from pathlib import Path

In [3]:
# Show the directory the kernel starts in before any chdir is performed.
%pwd

'/home/priyanshu1303d/Projects/DeepQA_PyTorch/research'

In [4]:
# Step up from the notebook folder to the project root so that the relative
# paths used in later cells (config/config.yaml, artifacts/...) resolve.
# The guard makes the cell idempotent: once config/config.yaml is reachable
# from the current directory, re-running the cell no longer climbs further.
if not Path("config/config.yaml").is_file():
    os.chdir("../")

In [5]:
# Confirm the working directory after the chdir above.
%pwd

'/home/priyanshu1303d/Projects/DeepQA_PyTorch'

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Peek at the ingested question/answer CSV (path is relative to the current
# working directory) and display its first ten rows.
raw_csv = Path('artifacts/data_ingestion/Dataset/100_Unique_QA_Dataset.csv')
df = pd.read_csv(raw_csv)
df.head(10)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
5,Who painted the Mona Lisa?,Leonardo-da-Vinci
6,What is the square root of 64?,8
7,What is the chemical symbol for gold?,Au
8,Which year did World War II end?,1945
9,What is the longest river in the world?,Nile


In [8]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

question    0
answer      0
dtype: int64

In [9]:
# Re-check the working directory; the relative paths used in later cells depend on it.
%pwd

'/home/priyanshu1303d/Projects/DeepQA_PyTorch'

In [10]:
from dataclasses import dataclass


In [42]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations for the data-transformation stage."""

    # Stage working directory; ConfigurationManager creates it.
    root_dir: Path
    # CSV file that DataTransformation.load_dataset reads.
    data_path: Path
    # Directory that receives preprocessed_data.<format>.
    output_dir: Path
    # Used by DataTransformation.save_dataset as the DIRECTORY holding vocab.json.
    vocab_file_path: Path

In [43]:
from DeepQA.utils.common import read_yaml , create_directories , get_size
from DeepQA.logging import logger
from DeepQA.constants import *

In [44]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects.

    On construction both YAML files are loaded with ``read_yaml`` and the
    top-level artifacts directory named in the config is created.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load the configuration and parameter files.

        Args:
            config_filepath: Location of the main config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Location of the params YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        # The loaded config is accessed by attribute (e.g. ``.artifacts_root``).
        self.config = read_yaml(config_filepath)
        # Loaded for completeness; not used by the code shown in this notebook.
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` and returns a
        ``DataTransformationConfig`` whose fields are ``pathlib.Path`` objects,
        matching the types the dataclass declares. ``Path(...)`` leaves values
        that already are paths unchanged.

        Returns:
            DataTransformationConfig: paths taken from the
            ``data_transformation`` section of the config file.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            vocab_file_path=Path(section.vocab_file_path),
            output_dir=Path(section.output_dir),
        )


In [46]:
import re
import pickle
import torch
import json

In [64]:
class DataTransformation:
    """Converts a question/answer table into vocabulary-indexed sequences.

    The vocabulary starts with four reserved entries (``<PAD>``, ``<UNK>``,
    ``<SOS>``, ``<EOS>`` at indices 0-3) and grows as ``build_vocab`` meets
    new tokens. Paths are taken from the ``config`` object's ``data_path``,
    ``output_dir`` and ``vocab_file_path`` attributes.
    """

    # File formats accepted by save_dataset.
    _SUPPORTED_FORMATS = ("csv", "json", "pkl", "pt")

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration and seed the vocabulary with the reserved tokens."""
        self.config = config
        self.vocab = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<SOS>': 2,
            '<EOS>': 3
        }

    def tokenize(self, text: str):
        """Lower-case ``text``, drop punctuation and split on whitespace.

        Cells that are not strings (for example an answer column that pandas
        parsed as numbers) are converted with ``str()`` instead of raising.
        Missing values (``None``, ``pd.NA``, float NaN) produce an empty list.

        Returns:
            list[str]: the cleaned tokens, possibly empty.
        """
        if not isinstance(text, str):
            if text is None or text is pd.NA:
                return []
            if isinstance(text, float):
                if text != text:  # NaN is the only float that differs from itself
                    return []
                if text.is_integer():
                    # 100.0 -> 100; otherwise removing "." below would yield "1000".
                    text = int(text)
            text = str(text)

        cleaned = re.sub(r"[^\w\s]", "", text.lower())  # Remove punctuation
        return cleaned.split()

    def build_vocab(self, dataset):
        """Add every unseen token of the 'question' and 'answer' columns to the vocabulary.

        Rows are visited in order, question tokens before answer tokens, and each
        new token receives the next free index. Calling this again with the same
        data leaves the vocabulary unchanged.
        """
        for question, answer in zip(dataset['question'], dataset['answer']):
            for token in self.tokenize(question) + self.tokenize(answer):
                # The default is only stored when the token is not present yet.
                self.vocab.setdefault(token, len(self.vocab))

    def text_to_indices(self, text: str):
        """Convert one sentence into vocabulary indices; unknown tokens map to ``<UNK>``."""
        unknown = self.vocab['<UNK>']
        return [self.vocab.get(token, unknown) for token in self.tokenize(text)]

    def df_to_indices(self, df: pd.DataFrame):
        """Return a copy of ``df`` with 'question_indices' and 'answer_indices' columns.

        The input frame is not modified; use the returned frame.
        """
        return df.assign(
            question_indices=df['question'].apply(self.text_to_indices),
            answer_indices=df['answer'].apply(self.text_to_indices),
        )

    def load_dataset(self):
        """Read the CSV file at ``config.data_path`` into a DataFrame."""
        return pd.read_csv(Path(self.config.data_path))  # Modify this if using a different format

    def save_dataset(self, df: pd.DataFrame, format: str = "csv"):
        """Write ``df`` to ``config.output_dir`` and the vocabulary next to it.

        Args:
            df: Frame to persist as ``preprocessed_data.<format>``.
            format: One of 'csv', 'json', 'pkl' or 'pt'.

        Raises:
            ValueError: if ``format`` is not supported. The check happens before
                any directory or file is created.
        """
        if format not in self._SUPPORTED_FORMATS:
            raise ValueError("Unsupported format! Choose from 'csv', 'json', 'pkl', or 'pt'.")

        output_path = Path(self.config.output_dir)
        output_path.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

        file_path = output_path / f"preprocessed_data.{format}"

        if format == "csv":
            df.to_csv(file_path, index=False)
        elif format == "json":
            df.to_json(file_path, orient="records", lines=True)
        elif format == "pkl":
            with open(file_path, "wb") as f:
                pickle.dump(df, f)
        else:  # "pt" - the only remaining supported format
            torch.save(df, file_path)

        print(f"Dataset saved at: {file_path}")

        self._save_vocab()

    def _save_vocab(self):
        """Write the current vocabulary as ``vocab.json`` into the vocab directory."""
        # NOTE: despite its name, config.vocab_file_path is used as a directory.
        vocab_dir = Path(self.config.vocab_file_path)
        vocab_dir.mkdir(parents=True, exist_ok=True)

        vocab_file_path = vocab_dir / "vocab.json"

        with open(vocab_file_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=4)

        print(f"Vocabulary saved at: {vocab_file_path}")


In [66]:
# Run the data-transformation stage end to end. Any failure propagates
# unchanged with its original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation()
data_transformation = DataTransformation(data_transformation_config)

# Read the CSV named by the stage configuration.
df = data_transformation.load_dataset()

# Build vocabulary
data_transformation.build_vocab(df)

# Convert text to indices
df = data_transformation.df_to_indices(df)

# Persist the indexed dataset (default format: csv) together with the vocabulary.
data_transformation.save_dataset(df)

[2025-04-03 10:58:27,574 : INFO : common  : yaml file config/config.yaml was read succesfully]
[2025-04-03 10:58:27,577 : INFO : common  : yaml file params.yaml was read succesfully]
[2025-04-03 10:58:27,578 : INFO : common  : Created directory at : artifacts]
[2025-04-03 10:58:27,579 : INFO : common  : Created directory at : artifacts/data_transformation]
Dataset saved at: artifacts/data_transformation/Preprocessed_Data/preprocessed_data.csv
Vocabulary saved at: artifacts/data_transformation/Vocab/vocab.json


In [None]:
""