# Data Preprocessing
We use the SocratiQ dataset: https://github.com/NUS-IDS/eacl23_soqg/tree/main

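In this section, we merge the three raw training chunks of this dataset and split them into train (roughly 70%), validation (roughly 10%), and test (roughly 20%) sets, producing a corpus for training an LLM to generate accurate critical questions.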

In [1]:
import logging
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

# Inputs: the three raw training chunks that are concatenated before splitting.
socraticQ_chunk_1_path = 'Data/Raw/SocraticQ/train_chunk_I.csv'
socraticQ_chunk_2_path = 'Data/Raw/SocraticQ/train_chunk_II.csv'
socraticQ_chunk_3_path = 'Data/Raw/SocraticQ/train_chunk_III.csv'

# Outputs: one CSV per split.
train_path = 'Data/Processed/SocraticQ/train.csv'
val_path = 'Data/Processed/SocraticQ/validation.csv'
test_path = 'Data/Processed/SocraticQ/test.csv'

################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create the file handler only if none is attached yet, so that re-running this
# cell in the same kernel does not attach a second handler and duplicate every
# log line.
if not logger.handlers:
    log_file = Path('Logs/data_preprocessing.log')
    # FileHandler opens the file but does not create missing parent
    # directories; without this a fresh checkout fails with FileNotFoundError.
    log_file.parent.mkdir(parents=True, exist_ok=True)
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)


# Mark the start of the preprocessing run in the log
logger.info("--------  Start with Data Preprocessing  -------------")

In [3]:
# Load the chunks
chunk1 = pd.read_csv(socraticQ_chunk_1_path)
chunk2 = pd.read_csv(socraticQ_chunk_2_path)
chunk3 = pd.read_csv(socraticQ_chunk_3_path)

# Concatenate the chunks; ignore_index gives the combined frame a fresh
# 0..n-1 index instead of repeating each chunk's own row numbers.
full_data = pd.concat([chunk1, chunk2, chunk3], ignore_index=True)

# Single seed shared by both splits and the log message below, so the logged
# value can never drift from the one actually used.
split_seed = 42

# Split the data in two steps:
#   1) hold out 30% of all rows (temp_data), leaving ~70% for training;
#   2) send 2/3 of that hold-out to test and the remaining 1/3 to validation,
#      i.e. roughly 10% validation and 20% test of the full data set.
train_data, temp_data = train_test_split(full_data, test_size=0.3, random_state=split_seed)
val_data, test_data = train_test_split(temp_data, test_size=2/3, random_state=split_seed)

# Save the splits. to_csv does not create missing directories, so make sure
# each output folder exists first.
for out_path in (train_path, val_path, test_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

train_data.to_csv(train_path, index=False)
val_data.to_csv(val_path, index=False)
test_data.to_csv(test_path, index=False)
logger.info(
    "Created train, validation and test data sets successfully with random_state=%s",
    split_seed,
)
logger.info("--------  Finish Data Preprocessing  -------------")