In [1]:
import pandas as pd
import re
import logging

# Set up logging
# Configure the root logger to emit INFO-and-above records formatted as
# "<timestamp> - <level> - <message>".
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so re-running this cell (or running it in a
# kernel that pre-configured logging) may leave the format unchanged --
# confirm whether force=True is wanted here.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by preprocess_mental_health_data for progress messages.
logger = logging.getLogger(__name__)

def clean_text(text):
    """Return a normalized, lowercase version of ``text``.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ?`` is dropped; runs of whitespace (including tabs and newlines)
    collapse to a single space and leading/trailing whitespace is trimmed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # Strip everything outside the allowed character set first ...
        stripped = re.sub(r'[^\w\s.,!?]', '', text)
        # ... then squeeze whitespace by splitting and re-joining on one space.
        collapsed = ' '.join(stripped.split())
        return collapsed.lower()
    return ''


In [2]:

def row_with_min_words(x, threshold):
    """Tell whether ``x`` contains strictly more than ``threshold`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``.

    Args:
        x: Text to measure; must provide a ``split()`` method (e.g. ``str``).
        threshold: Word count that must be exceeded.

    Returns:
        ``True`` when the word count is greater than ``threshold``, else ``False``.
    """
    word_count = len(x.split())
    return word_count > threshold

In [3]:


def preprocess_mental_health_data(input_file, output_file,
                                  min_context_words=5, min_response_words=50):
    """Preprocess mental health dataset for instruction-response training.

    Pipeline, in order:

    1. Read ``input_file`` as CSV.
    2. Drop rows whose ``Context`` or ``Response`` is missing.
    3. Clean both columns with ``clean_text``.
    4. Keep only rows whose cleaned context has more than
       ``min_context_words`` words and whose cleaned response has more than
       ``min_response_words`` words (see ``row_with_min_words``).
    5. Drop rows whose cleaned response contains any digit.
    6. Write the ``instruction`` / ``response`` columns to ``output_file``
       and log the first rows of the result.

    Args:
        input_file: Path of the CSV file to read; must contain ``Context``
            and ``Response`` columns.
        output_file: Path the cleaned CSV is written to (without the index).
        min_context_words: Word count a cleaned context must exceed.
            Defaults to 5.
        min_response_words: Word count a cleaned response must exceed.
            Defaults to 50.

    Returns:
        None. If ``input_file`` does not exist an error is logged and the
        function returns without writing anything.

    Raises:
        KeyError: If ``Context`` or ``Response`` is absent from the input.
    """
    logger.info('Starting data preprocessing...')

    # Load dataset
    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        logger.error(f'Input file {input_file} not found.')
        return
    logger.info(f'Loaded dataset with {len(df)} rows.')

    # Fail early, with a readable message, when the expected columns are
    # absent (dropna would otherwise raise a bare KeyError further down).
    required_columns = ['Context', 'Response']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        message = f'Input file {input_file} is missing required columns: {missing_columns}'
        logger.error(message)
        raise KeyError(message)

    # Handle missing values
    initial_len = len(df)
    df = df.dropna(subset=required_columns).reset_index(drop=True)
    logger.info(f'Removed {initial_len - len(df)} rows with missing values.')

    # Clean Context and Response columns
    df['Context_clean'] = df['Context'].apply(clean_text)
    df['Response_clean'] = df['Response'].apply(clean_text)

    # Validate text lengths. The masks are kept as local Series (aligned with
    # df's freshly reset index) rather than stored as throw-away columns.
    context_is_valid = df['Context_clean'].apply(
        lambda x: row_with_min_words(x, min_context_words))
    response_is_valid = df['Response_clean'].apply(
        lambda x: row_with_min_words(x, min_response_words))

    # Filter out invalid rows
    initial_len = len(df)
    df = df[context_is_valid & response_is_valid].reset_index(drop=True)
    logger.info(f'Removed {initial_len - len(df)} rows with invalid context or response lengths.')

    # Remove rows whose response contains digits. Despite the "numeric-heavy"
    # wording of the log message, a single digit anywhere in the cleaned
    # response is enough for the row to be dropped.
    has_digits = df['Response_clean'].str.contains(r'\d+', regex=True, na=False)
    initial_len = len(df)
    df = df[~has_digits].reset_index(drop=True)
    logger.info(f'Removed {initial_len - len(df)} rows with numeric-heavy responses.')

    # Create instruction-response format for training
    df['instruction'] = df['Context_clean'].apply(lambda x: f'Provide a supportive response to the following mental health concern: {x}')
    df['response'] = df['Response_clean']
    training_pairs = df[['instruction', 'response']]

    # Save cleaned dataset
    training_pairs.to_csv(output_file, index=False)
    logger.info(f'Cleaned dataset saved to {output_file}.')

    # Log sample of cleaned data
    logger.info('Sample of cleaned dataset:')
    logger.info(training_pairs.head().to_string())

In [4]:
def main():
    """Run the preprocessing pipeline on the default dataset paths.

    Reads ``./mental_health_dataset.csv`` and writes
    ``./cleaned_mental_health_dataset.csv``; both paths are relative to the
    current working directory.
    """
    preprocess_mental_health_data(
        input_file='./mental_health_dataset.csv',
        output_file='./cleaned_mental_health_dataset.csv',
    )

In [5]:
# Execute the pipeline; progress and a sample of the result are reported via the logger.
main()

2025-05-19 15:27:10,368 - INFO - Starting data preprocessing...
2025-05-19 15:27:10,534 - INFO - Loaded dataset with 3000 rows.
2025-05-19 15:27:10,542 - INFO - Removed 0 rows with missing values.
2025-05-19 15:27:10,647 - INFO - Removed 0 rows with invalid context or response lengths.
2025-05-19 15:27:10,703 - INFO - Removed 0 rows with numeric-heavy responses.
2025-05-19 15:27:10,752 - INFO - Cleaned dataset saved to ./cleaned_mental_health_dataset.csv.
2025-05-19 15:27:10,752 - INFO - Sample of cleaned dataset:
2025-05-19 15:27:10,758 - INFO -                                                                                                                                   instruction                                                                                                                                                                                                                                                                                                                  