# Data Exploration for NeMo QA Chatbot

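This notebook walks through the data curation process for the NeMo QA Chatbot: loading the raw documents, inspecting their length distribution, processing and filtering them, generating Q&A pairs, filtering those pairs for quality, and writing the training, validation, and test splits.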

In [None]:
import os
import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add parent directory to path
sys.path.append(os.path.abspath('..'))

# Import NeMo QA modules
from nemo_qa.curator.document_processor import process_documents
from nemo_qa.curator.qa_generator import generate_qa_pairs
from nemo_qa.curator.quality_filters import filter_qa_pairs

## Load and Explore Documents

Let's load the raw documents and explore their content.

In [None]:
# Replace with your data path
data_dir = '../data/raw'

# Load every raw document found directly inside data_dir:
#   *.json  -> the whole file is parsed as one JSON value and stored as one document
#   *.jsonl -> every non-blank line is parsed as its own JSON document
documents = []
# sorted() makes the load order reproducible between runs and machines;
# os.listdir() returns directory entries in arbitrary order.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(('.json', '.jsonl')):
        continue
    file_path = os.path.join(data_dir, filename)
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        if filename.endswith('.json'):
            documents.append(json.load(f))
        else:
            for line in f:
                line = line.strip()
                # Skip blank lines (e.g. an extra newline at the end of the
                # file); json.loads('') would raise JSONDecodeError.
                if line:
                    documents.append(json.loads(line))

print(f'Loaded {len(documents)} documents')

## Document Statistics

Let's measure the length (in characters) of each document's `text` field, plot the distribution, and print summary statistics.

In [None]:
# Calculate document statistics
# One row per loaded document: its position in `documents` and the character
# count of its 'text' field (a document without a 'text' key counts as 0).
# assumes every entry in `documents` is a dict -- TODO confirm for *.json inputs
text_lengths = []
for document in documents:
    text_lengths.append(len(document.get('text', '')))

doc_stats = pd.DataFrame({
    'document_id': list(range(len(text_lengths))),
    'text_length': text_lengths,
})

# Plot document length distribution
length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(doc_stats['text_length'], bins=20, ax=length_ax)
length_ax.set_title('Document Length Distribution')
length_ax.set_xlabel('Text Length (characters)')
length_ax.set_ylabel('Count')
plt.show()

# Print statistics
lengths = doc_stats['text_length']
print(f"Mean document length: {lengths.mean():.2f} characters")
print(f"Median document length: {lengths.median():.2f} characters")
print(f"Min document length: {lengths.min()} characters")
print(f"Max document length: {lengths.max()} characters")

## Process Documents

Now let's process the documents using the document processor.

In [None]:
# Set paths
processed_dir = '../data/processed'

# Define filters
# Each filter is a dict whose 'name' selects the filter and whose remaining
# keys are its settings.
# NOTE(review): the unit of min_length/max_length (characters vs. tokens) is
# decided inside process_documents -- confirm there.
length_filter = {'name': 'length_filter', 'min_length': 100, 'max_length': 2000}
language_filter = {'name': 'language_filter', 'languages': ['en']}
filters = [length_filter, language_filter]

# Process documents: read from the raw data directory, write to processed_dir.
stats = process_documents(input_dir=data_dir, output_dir=processed_dir, filters=filters)

# The returned stats are read by the 'input_files' / 'output_files' keys.
files_in = stats['input_files']
files_out = stats['output_files']
print(f"Processed {files_in} documents to {files_out} documents")

## Generate Q&A Pairs

Now let's generate Q&A pairs from the processed documents.

In [None]:
# Set paths
qa_dir = '../data/qa_pairs'

# Prompt templates handed to the generator. The {text} and {question}
# placeholders are left unformatted here; presumably generate_qa_pairs fills
# them in -- verify against its implementation.
question_prompt = "Generate a detailed question based on this text: {text}"
answer_prompt = (
    "Answer the following question based on this text:\n"
    "Question: {question}\n"
    "Text: {text}\n"
    "Answer:"
)

# Generate Q&A pairs
# NOTE(review): every other path in this notebook is relative to the parent
# directory ('../data/...'), but model_path is not. Confirm whether it should
# be '../models/base/llama3-8b' or whether generate_qa_pairs resolves it itself.
qa_pairs = generate_qa_pairs(
    model_path='models/base/llama3-8b',
    input_dir=processed_dir,
    output_dir=qa_dir,
    question_template=question_prompt,
    answer_template=answer_prompt,
)

n_generated = len(qa_pairs)
print(f"Generated {n_generated} Q&A pairs")

## Analyze Q&A Pairs

Let's analyze the generated Q&A pairs.

In [None]:
# Convert to DataFrame and add length columns in one expression, so that
# re-running the cell always rebuilds qa_df from qa_pairs.
# assumes qa_pairs is a list of records with 'question' and 'answer' string
# fields -- TODO confirm against generate_qa_pairs
# .str.len() is vectorised and yields NaN for a missing question/answer,
# where .apply(len) would raise TypeError on None/NaN.
qa_df = pd.DataFrame(qa_pairs).assign(
    question_length=lambda frame: frame['question'].str.len(),
    answer_length=lambda frame: frame['answer'].str.len(),
)

# Plot question and answer length distributions side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = [('question_length', 'Question'), ('answer_length', 'Answer')]
for panel_ax, (length_column, label) in zip(axes, panels):
    sns.histplot(qa_df[length_column], bins=20, ax=panel_ax)
    panel_ax.set_title(f'{label} Length Distribution')
    panel_ax.set_xlabel(f'{label} Length (characters)')
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Print statistics (Series.mean() skips NaN lengths by default)
print(f"Mean question length: {qa_df['question_length'].mean():.2f} characters")
print(f"Mean answer length: {qa_df['answer_length'].mean():.2f} characters")

## Filter Q&A Pairs

Now let's filter the Q&A pairs based on quality.

In [None]:
# Set paths
filtered_dir = '../data/qa_pairs/filtered'

# Quality thresholds, gathered in one place so they are easy to tune.
# They are forwarded to filter_qa_pairs as keyword arguments, in this order.
# NOTE(review): the exact meaning of each threshold (length units, how the
# relevance score and diversity clusters are computed) is defined inside
# filter_qa_pairs -- confirm there.
quality_thresholds = {
    'min_question_length': 10,
    'max_question_length': 200,
    'min_answer_length': 50,
    'max_answer_length': 1000,
    'min_relevance_score': 0.3,
    'diversity_clusters': 10,
}

# Filter Q&A pairs
filtered_pairs = filter_qa_pairs(
    qa_pairs=qa_pairs,
    output_dir=filtered_dir,
    **quality_thresholds,
)

n_before = len(qa_pairs)
n_after = len(filtered_pairs)
print(f"Filtered from {n_before} to {n_after} Q&A pairs")

## Create Dataset Splits

Finally, let's create the dataset splits for training, validation, and testing.

In [None]:
from sklearn.model_selection import train_test_split

# Create dataset splits
# First hold out 20% of the filtered pairs, then cut that hold-out in half:
# roughly 80% train / 10% validation / 10% test. The fixed random_state keeps
# both splits reproducible.
train_pairs, temp_pairs = train_test_split(filtered_pairs, test_size=0.2, random_state=42)
val_pairs, test_pairs = train_test_split(temp_pairs, test_size=0.5, random_state=42)

split_sizes = (
    ('Training', train_pairs),
    ('Validation', val_pairs),
    ('Test', test_pairs),
)
for split_label, split_pairs in split_sizes:
    print(f"{split_label} set: {len(split_pairs)} pairs")

# Save dataset splits
datasets_dir = '../data/datasets'
os.makedirs(datasets_dir, exist_ok=True)

# One JSON Lines file per split: each pair is serialised onto its own line.
split_files = (
    ('train.jsonl', train_pairs),
    ('val.jsonl', val_pairs),
    ('test.jsonl', test_pairs),
)
for split_filename, split_pairs in split_files:
    with open(os.path.join(datasets_dir, split_filename), 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in split_pairs)

print(f"Dataset splits saved to {datasets_dir}")