# 01 - Data Exploration

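Explore the Qatar government services dataset: how many documents each category contains, how long the documents are, and what a sample document looks like.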

In [None]:
# Make the parent directory importable so the `src` package below resolves.
# NOTE(review): presumably this notebook lives one level below the project
# root - confirm. This must run before the `src` import.
import sys
sys.path.append('..')

from pathlib import Path
from src.preprocessing import ArabicPreprocessor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set_style('whitegrid')

## Load Documents

In [None]:
# Point at the data directory (relative to this notebook) and let the
# project's preprocessor load every document it finds there.
# Later cells read the 'category', 'filename' and 'text' keys of each item.
data_dir = Path('../data')
preprocessor = ArabicPreprocessor()
documents = preprocessor.load_all_documents(data_dir)

print(f"Loaded {len(documents)} documents")

## Document Statistics

In [None]:
# Create DataFrame
# One row per document: its category, filename, total character count, and
# the number of characters that fall in the Unicode "Arabic" block
# (U+0600-U+06FF).
#
# `columns` is passed explicitly so the schema is stable: with zero loaded
# documents the frame would otherwise have no columns at all, and the later
# df['category'] / df['length'] lookups would raise KeyError.
df = pd.DataFrame(
    [
        {
            'category': doc['category'],
            'filename': doc['filename'],
            'length': len(doc['text']),
            # Count matches lazily instead of building a throwaway list.
            'arabic_chars': sum(
                1 for ch in doc['text'] if '\u0600' <= ch <= '\u06FF'
            ),
        }
        for doc in documents
    ],
    columns=['category', 'filename', 'length', 'arabic_chars'],
)

df.head()

In [None]:
# Tabulate how many documents fall under each category
print("Documents per category:")
category_counts = df['category'].value_counts()
print(category_counts)

In [None]:
# Bar chart of the number of documents in each category,
# drawn on an explicit figure/axes pair rather than pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 6))
df['category'].value_counts().plot(kind='bar', ax=ax)

ax.set_title('Documents per Category')
ax.set_xlabel('Category')
ax.set_ylabel('Count')

# Tilt the category labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics (as produced by describe()) for document length in characters
print("\nDocument length statistics:")
length_summary = df['length'].describe()
print(length_summary)

In [None]:
# Two side-by-side panels: the overall length histogram (left) and the
# mean length per category (right), each drawn on its own explicit axes.
fig, (hist_ax, mean_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: distribution of document lengths
hist_ax.hist(df['length'], bins=20, edgecolor='black')
hist_ax.set_title('Document Length Distribution')
hist_ax.set_xlabel('Length (characters)')
hist_ax.set_ylabel('Count')

# Right panel: average length within each category
df.groupby('category')['length'].mean().plot(kind='bar', ax=mean_ax)
mean_ax.set_title('Average Document Length by Category')
mean_ax.set_xlabel('Category')
mean_ax.set_ylabel('Average Length')
plt.setp(mean_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

## Sample Documents

In [None]:
# Show sample document
# Guard against an empty corpus: indexing documents[0] directly would raise
# IndexError if nothing was loaded from the data directory.
sample_doc = documents[0] if len(documents) > 0 else None

if sample_doc is None:
    print("No documents loaded; nothing to preview.")
else:
    sample_text = sample_doc['text']
    # Only append the ellipsis when the preview actually cuts the text short,
    # so short documents are not shown as if they had been truncated.
    truncation_mark = "..." if len(sample_text) > 500 else ""

    print(f"Category: {sample_doc['category']}")
    print(f"Filename: {sample_doc['filename']}")
    print(f"Length: {len(sample_text)} characters")
    print(f"\nFirst 500 characters:\n{sample_text[:500]}{truncation_mark}")

## Next Steps

1. Experiment with chunking strategies (notebook 02)
2. Test retrieval performance (notebook 03)