In [2]:
import sys
# Print the path of the Python interpreter running this kernel
# (handy for checking which environment the notebook is using).
print(sys.executable)

/Users/pranavpant/Desktop/code /RAG/venv/bin/python


In [3]:
import os
import json
import time
import random
import pandas as pd
import numpy as np
import wikipediaapi
from datasets import load_dataset, concatenate_datasets
from tqdm import tqdm
from datetime import datetime
import urllib.parse

# --- Output locations ---
# DATA_DIR receives the JSON artifacts, LOGS_DIR the extraction log CSVs.
DATA_DIR = "./data/raw"
LOGS_DIR = "./data/logs"

# Create both folders up front; exist_ok makes re-running this cell harmless.
for _folder in (DATA_DIR, LOGS_DIR):
    os.makedirs(_folder, exist_ok=True)

# User-Agent string handed to the Wikipedia API client.
# NOTE(review): the contact address looks like a placeholder — confirm before a full run.
WIKI_USER_AGENT = 'RAG_Portfolio_Builder/1.0 (your_email@example.com)'
print("‚úÖ Environment Ready.")

‚úÖ Environment Ready.


In [3]:
# 1. Load both splits
print("üì• Loading SQuAD v2.0 (Train & Validation)...")
ds_train = load_dataset("squad_v2", split="train")
ds_valid = load_dataset("squad_v2", split="validation")

# 2. Convert to pandas for easier tagging
df_train = ds_train.to_pandas()
df_valid = ds_valid.to_pandas()

# 3. Record which split every row came from
df_train['split'] = 'train'
df_valid['split'] = 'validation'

# 4. Concatenate into one frame with a fresh 0..N-1 index
df_combined = pd.concat([df_train, df_valid], ignore_index=True)

print(f"‚úÖ Merged Dataset Created.")
print(f"   - Train Size: {len(df_train)}")
print(f"   - Valid Size: {len(df_valid)}")
print(f"   - Total Q&A Pairs: {len(df_combined)}")

# 5. Group ground-truth contexts and question counts by article title.
#    squad_metadata maps title -> {"squad_contexts": [...], "qas_count": int}.
squad_metadata = {}

print("üîÑ Organizing Metadata by Article Title...")
# Only 'title' and 'context' are needed, so iterate those two columns directly
# instead of materialising a Series per row with iterrows().
for title, context in tqdm(
    zip(df_combined['title'], df_combined['context']),
    total=len(df_combined),
    desc="Processing Rows",
):
    entry = squad_metadata.get(title)
    if entry is None:
        entry = {"squad_contexts": {}, "qas_count": 0}
        squad_metadata[title] = entry

    # A dict used as an ordered set: duplicates collapse, first-seen order is
    # kept. A plain set would make the saved context order vary between runs.
    entry["squad_contexts"][context] = None
    entry["qas_count"] += 1

# Replace each ordered-set dict with a plain list (JSON-serialisable, stable order)
for entry in squad_metadata.values():
    entry["squad_contexts"] = list(entry["squad_contexts"])

unique_titles = list(squad_metadata.keys())
print(f"üìö Found {len(unique_titles)} unique articles across all splits.")

üì• Loading SQuAD v2.0 (Train & Validation)...
‚úÖ Merged Dataset Created.
   - Train Size: 130319
   - Valid Size: 11873
   - Total Q&A Pairs: 142192
üîÑ Organizing Metadata by Article Title...


Processing Rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 142192/142192 [00:01<00:00, 114998.61it/s]

üìö Found 477 unique articles across all splits.





In [4]:
# Initialize Wikipedia API
wiki = wikipediaapi.Wikipedia(
    user_agent=WIKI_USER_AGENT,
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)

full_knowledge_base = []  # one entry per article that was fetched successfully
process_logs = []         # one entry per requested title, whatever the outcome
checkpoint_path = f"{DATA_DIR}/knowledge_base_CHECKPOINT.json"

print(f"üöÄ Starting Extraction for {len(unique_titles)} articles...")
print("‚òïÔ∏è Estimated time: 15-20 minutes.")

for i, title in enumerate(tqdm(unique_titles, desc="Fetching Articles", unit="article")):

    start_time = time.time()

    # Defaults describe a failed fetch; they are overwritten as steps succeed
    # so the log row is always complete, even when an exception is raised.
    status = "FAILED"
    url = "N/A"
    word_count = 0
    char_count = 0
    est_chunks = 0
    squad_qas = 0
    error_details = "None"

    # Title shown to humans; differs from `title` only when the URL-decoded
    # fallback lookup is the one that succeeds.
    final_display_title = title

    try:
        # 1. Retrieve metadata gathered from SQuAD for this title
        if title in squad_metadata:
            squad_qas = squad_metadata[title]["qas_count"]

        # 2. Attempt 1: fetch with the original SQuAD title
        page = wiki.page(title)

        # 3. Attempt 2: fallback to the percent-decoded title if attempt 1 failed
        if not page.exists():
            decoded_title = urllib.parse.unquote(title)

            # Only retry when decoding actually changed the string
            if decoded_title != title:
                fallback_page = wiki.page(decoded_title)
                if fallback_page.exists():
                    page = fallback_page
                    final_display_title = decoded_title
                    status = "SUCCESS_FALLBACK"
                    # tqdm.write prints above the progress bar instead of
                    # breaking it into a new bar the way print() does.
                    tqdm.write(f"Title: {title}, Decoded: {decoded_title} and Status: {status}")

        # 4. Process the page (whether it came from attempt 1 or attempt 2)
        if page.exists():
            if status == "FAILED":
                status = "SUCCESS"

            url = page.fullurl
            text_content = page.text

            # Size metrics; est_chunks assumes roughly 500 words per chunk
            word_count = len(text_content.split())
            char_count = len(text_content)
            est_chunks = max(1, word_count // 500)

            doc_entry = {
                "id": title,                  # CRITICAL: keep the original encoded ID so it matches SQuAD questions
                "title": final_display_title, # human-readable title
                "source_url": url,
                "full_text": text_content,
                "summary": page.summary,
                "squad_ground_truth": {
                    "original_contexts": squad_metadata.get(title, {}).get("squad_contexts", []),
                    "qas_count": squad_qas
                },
                "fetched_at": datetime.now().isoformat()
            }
            full_knowledge_base.append(doc_entry)
        else:
            status = "NOT_FOUND"
            error_details = "Page.exists() returned False (tried raw & decoded)"

    except Exception as e:
        # Best-effort crawl: record the failure in the log and move on.
        status = "ERROR"
        error_details = str(e)
        tqdm.write(f"Error fetching {title}: {e}")

    # Duration of this article's fetch + processing (excludes the sleep below)
    end_time = time.time()
    duration = round(end_time - start_time, 2)

    # Append detailed log row
    process_logs.append({
        "title": title,  # the requested title, not the decoded one
        "status": status,
        "url": url,
        "word_count": word_count,
        "char_count": char_count,
        "estimated_chunks": est_chunks,
        "squad_qas_available": squad_qas,
        "fetch_duration_sec": duration,
        "error_details": error_details
    })

    # Polite pause between requests
    time.sleep(1.5)

    # Checkpoint every 50 articles so an interrupted run keeps partial results
    if (i + 1) % 50 == 0:
        with open(checkpoint_path, "w", encoding='utf-8') as f:
            json.dump(full_knowledge_base, f, ensure_ascii=False, indent=4)

# Final save: knowledge base as JSON
kb_path = f"{DATA_DIR}/knowledge_base_raw.json"
with open(kb_path, "w", encoding='utf-8') as f:
    json.dump(full_knowledge_base, f, ensure_ascii=False, indent=4)

# Final save: per-title logs as a timestamped CSV
df_logs = pd.DataFrame(process_logs)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_path = f"{LOGS_DIR}/extraction_log_combined_{timestamp}.csv"
df_logs.to_csv(log_path, index=False)

print("="*40)
print(f"‚ú® Knowledge Base Saved: {kb_path}")
print(f"üìã Detailed Logs Saved:  {log_path}")
print("="*40)
# Show success vs fallback stats
print(df_logs['status'].value_counts())
print("-" * 40)
print(df_logs[['title', 'status', 'word_count', 'fetch_duration_sec']].head())

üöÄ Starting Extraction for 477 articles...
‚òïÔ∏è Estimated time: 15-20 minutes.


Fetching Articles:  37%|‚ñà‚ñà‚ñà‚ñã      | 175/477 [05:47<10:55,  2.17s/article]

Title: Bill_%26_Melinda_Gates_Foundation, Decoded: Bill_&_Melinda_Gates_Foundation and Status: SUCCESS_FALLBACK


Fetching Articles:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 241/477 [08:00<07:42,  1.96s/article]

Title: Molotov%E2%80%93Ribbentrop_Pact, Decoded: Molotov–Ribbentrop_Pact and Status: SUCCESS_FALLBACK


Fetching Articles:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 289/477 [09:36<06:20,  2.02s/article]

Title: St._John%27s,_Newfoundland_and_Labrador, Decoded: St._John's,_Newfoundland_and_Labrador and Status: SUCCESS_FALLBACK


Fetching Articles:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 347/477 [11:34<04:34,  2.11s/article]

Title: Seven_Years%27_War, Decoded: Seven_Years'_War and Status: SUCCESS_FALLBACK


Fetching Articles:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 396/477 [13:12<02:38,  1.96s/article]

Title: Kievan_Rus%27, Decoded: Kievan_Rus' and Status: SUCCESS_FALLBACK


Fetching Articles:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 417/477 [13:55<02:03,  2.05s/article]

Title: Bras%C3%ADlia, Decoded: Brasília and Status: SUCCESS_FALLBACK


Fetching Articles:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 421/477 [14:02<01:49,  1.95s/article]

Title: Jehovah%27s_Witnesses, Decoded: Jehovah's_Witnesses and Status: SUCCESS_FALLBACK


Fetching Articles:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 424/477 [14:09<01:47,  2.03s/article]

Title: Financial_crisis_of_2007%E2%80%9308, Decoded: Financial_crisis_of_2007–08 and Status: SUCCESS_FALLBACK


Fetching Articles:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 430/477 [14:20<01:31,  1.95s/article]

Title: Saint_Barth%C3%A9lemy, Decoded: Saint_Barthélemy and Status: SUCCESS_FALLBACK


Fetching Articles: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 477/477 [15:54<00:00,  2.00s/article]

‚ú® Knowledge Base Saved: ./data/raw/knowledge_base_raw.json
üìã Detailed Logs Saved:  ./data/logs/extraction_log_combined_20251215_185559.csv
status
SUCCESS             468
SUCCESS_FALLBACK      9
Name: count, dtype: int64
----------------------------------------
                                            title   status  word_count  \
0                                         Beyoncé  SUCCESS        9446   
1                                 Frédéric_Chopin  SUCCESS       10662   
2  Sino-Tibetan_relations_during_the_Ming_dynasty  SUCCESS       10961   
3                                            IPod  SUCCESS        5999   
4          The_Legend_of_Zelda:_Twilight_Princess  SUCCESS        5065   

   fetch_duration_sec  
0                0.54  
1                0.51  
2                0.38  
3                0.58  
4                0.38  





In [5]:
# --- Helper: Fix for "ndarray is not JSON serializable" ---
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy arrays and scalars to native Python values."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Handles ``np.ndarray`` (-> list), ``np.bool_`` (-> bool),
        NumPy integers (-> int) and NumPy floats (-> float). Anything else is
        passed to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Booleans must stay booleans: encoding them through int() would emit
        # 1/0 in the JSON instead of true/false.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)
# ----------------------------------------------------------

print("üîç Building Master Evaluation Set...")

# 1. Analyze Knowledge Base
# We load the KB to filter our Q&A, but also to calculate stats
if os.path.exists(kb_path):
    with open(kb_path, "r", encoding='utf-8') as f:
        kb_data = json.load(f)
    
    # Calculate KB Metrics
    kb_titles = {doc['id'] for doc in kb_data}
    kb_total_articles = len(kb_data)
    # Count total words across all articles in the KB
    kb_total_words = sum(len(doc.get('full_text', "").split()) for doc in kb_data)
    # Get file size in MB
    kb_size_mb = os.path.getsize(kb_path) / (1024 * 1024)
else:
    print(f"‚ö†Ô∏è Warning: Knowledge base not found at {kb_path}. Master set will be empty.")
    kb_data = []
    kb_titles = set()
    kb_total_articles = 0
    kb_total_words = 0
    kb_size_mb = 0

# 2. Filter the Combined DataFrame
# Keep only questions where the corresponding article exists in our downloaded KB
df_final_eval = df_combined[df_combined['title'].isin(kb_titles)].copy()

# 3. Add 'is_impossible' helper
df_final_eval['is_impossible'] = df_final_eval['answers'].apply(lambda x: len(x['text']) == 0)

# 4. Convert ALL columns to dictionary
eval_data = df_final_eval.to_dict(orient='records')

# 5. Save
eval_path = f"{DATA_DIR}/squad_eval_set_all.json"
with open(eval_path, "w", encoding='utf-8') as f:
    json.dump(eval_data, f, ensure_ascii=False, indent=4, cls=NumpyEncoder)

# 6. Calculate Final Exam Metrics
exam_total_q = len(eval_data)
exam_train_q = len([x for x in eval_data if x.get('split')=='train'])
exam_val_q = len([x for x in eval_data if x.get('split')=='validation'])
exam_unique_titles = len(set(x['title'] for x in eval_data))
exam_size_mb = os.path.getsize(eval_path) / (1024 * 1024) if os.path.exists(eval_path) else 0

# 7. Print Detailed Report
print("="*60)
print(f"üéâ MASTER DATASET GENERATION COMPLETE")
print("="*60)

print(f"üìö KNOWLEDGE BASE METRICS")
print(f"   - File Path:      {kb_path}")
print(f"   - Size on Disk:   {kb_size_mb:.2f} MB")
print(f"   - Total Articles: {kb_total_articles}")
print(f"   - Total Words:    {kb_total_words:,.0f} words")
print(f"   - Unique Titles:  {len(kb_titles)}")

print("-" * 40)

print(f"üìù MASTER EXAM (EVALUATION SET) METRICS")
print(f"   - File Path:      {eval_path}")
print(f"   - Size on Disk:   {exam_size_mb:.2f} MB")
print(f"   - Total Q&A Pairs:{exam_total_q}")
print(f"   - Covered Titles: {exam_unique_titles} (Matched against KB)")
print(f"     ‚Ä¢ Train Split:  {exam_train_q}")
print(f"     ‚Ä¢ Val Split:    {exam_val_q}")

print("="*60)

üîç Building Master Evaluation Set...
üéâ MASTER DATASET GENERATION COMPLETE
üìö KNOWLEDGE BASE METRICS
   - File Path:      ./data/raw/knowledge_base_raw.json
   - Size on Disk:   41.24 MB
   - Total Articles: 477
   - Total Words:    4,087,337 words
   - Unique Titles:  477
----------------------------------------
üìù MASTER EXAM (EVALUATION SET) METRICS
   - File Path:      ./data/raw/squad_eval_set_all.json
   - Size on Disk:   157.80 MB
   - Total Q&A Pairs:142192
   - Covered Titles: 477 (Matched against KB)
     ‚Ä¢ Train Split:  130319
     ‚Ä¢ Val Split:    11873


In [6]:
def validate_dataset(file_path, dataset_name, expected_columns, min_records=1,
                     sample_size=5000, seed=None):
    """Sanity-check a JSON dataset file and print a human-readable report.

    The file must contain a JSON list of objects. Up to ``sample_size``
    randomly chosen records are inspected for missing keys, nulls and blank
    strings in ``expected_columns``.

    Args:
        file_path: Path of the JSON file to inspect.
        dataset_name: Label used in the printed report.
        expected_columns: Keys every record is expected to contain.
        min_records: Fewer records than this triggers a warning (not a failure).
        sample_size: Maximum number of records to inspect.
        seed: Optional seed for the sampling; ``None`` uses the global
            ``random`` state (the original behaviour).

    Returns:
        ``True`` when no failure was found, ``False`` otherwise. Failures are:
        file missing or unreadable, top level not a list, a sampled record that
        is not an object, a missing expected key, a malformed ``answers`` dict,
        or a blank ``context`` / ``full_text``. Nulls and other blank strings
        only produce warnings.
    """
    print(f"üî¨ INSPECTING: {dataset_name}")
    print(f"   Path: {file_path}")

    # 1. Check file existence
    if not os.path.exists(file_path):
        print(f"   ‚ùå FATAL: File not found at {file_path}")
        return False

    # 2. Check JSON validity
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"   ‚ùå FATAL: Invalid JSON format. {str(e)}")
        return False
    except Exception as e:
        print(f"   ‚ùå FATAL: Error reading file. {str(e)}")
        return False

    # 3. Check data type (should be a list of dictionaries)
    if not isinstance(data, list):
        print(f"   ‚ùå FAIL: Expected a list of records, got {type(data)}")
        return False

    # 4. Check volume
    count = len(data)
    print(f"   üìä Found {count} records.")
    if count < min_records:
        print(f"   ‚ö†Ô∏è WARNING: Dataset is suspiciously small (< {min_records} records).")

    # 5. Schema & content validation
    missing_keys = set()
    null_values = set()
    empty_strings = set()

    # Inspect a bounded random sample so huge datasets do not stall the check.
    rng = random if seed is None else random.Random(seed)
    n_checked = min(count, max(0, sample_size))
    sample_indices = rng.sample(range(count), n_checked)

    for i in sample_indices:
        item = data[i]

        # A non-dict record would either raise on the `in` test below (e.g. an
        # int) or be silently substring-matched (a str); report it explicitly.
        if not isinstance(item, dict):
            print(f"   ‚ùå FAIL: Record {i} is not a JSON object (got {type(item).__name__}).")
            return False

        for col in expected_columns:
            # Missing key: nothing further to inspect for this column
            if col not in item:
                missing_keys.add(col)
                continue

            val = item[col]
            if val is None:
                null_values.add(col)

            # Blank strings (critical for text/context)
            if isinstance(val, str) and len(val.strip()) == 0:
                empty_strings.add(col)

            # SQuAD-specific: an 'answers' dict must hold a list under 'text'
            if col == 'answers' and isinstance(val, dict):
                if 'text' not in val or not isinstance(val['text'], list):
                    print(f"   ‚ùå FAIL: Record {i} has invalid 'answers' format.")
                    return False

    # 6. Final report
    is_valid = True

    if missing_keys:
        print(f"   ‚ùå FAIL: Missing columns in some records: {missing_keys}")
        is_valid = False

    if null_values:
        print(f"   ‚ö†Ô∏è WARNING: Found NULL values in columns: {null_values}")

    if empty_strings:
        print(f"   ‚ö†Ô∏è WARNING: Found EMPTY STRINGS in columns: {empty_strings}")
        # An empty context/text makes the record unusable, so it is a failure
        if 'context' in empty_strings or 'full_text' in empty_strings:
            print(f"   ‚ùå CRITICAL: Some records have empty context/text!")
            is_valid = False

    if is_valid:
        print(f"   ‚úÖ SUCCESS: {dataset_name} looks healthy and ready for RAG.")
    else:
        print(f"   üõë FAILED: {dataset_name} needs fixing.")

    print("-" * 50)
    return is_valid

# ==========================================
# RUN CHECKS
# ==========================================

# Knowledge base: every record needs its identifier, display title, body text
# and source URL.
kb_valid = validate_dataset(
    kb_path,
    "Wikipedia Knowledge Base",
    ['id', 'title', 'full_text', 'source_url'],
)

# Evaluation set: the SQuAD fields plus the 'split' and 'is_impossible'
# columns added while building it.
squad_valid = validate_dataset(
    eval_path,
    "SQuAD Master Eval Set",
    ['id', 'title', 'context', 'question', 'answers', 'split', 'is_impossible'],
)

# Both checks always run; the verdict is reported only afterwards.
if not (kb_valid and squad_valid):
    print("\n‚ö†Ô∏è ISSUES DETECTED. Please review the errors above.")
else:
    print("\nüöÄ ALL SYSTEMS GO! Datasets are verified.")

üî¨ INSPECTING: Wikipedia Knowledge Base
   Path: ./data/raw/knowledge_base_raw.json
   üìä Found 477 records.
   ‚úÖ SUCCESS: Wikipedia Knowledge Base looks healthy and ready for RAG.
--------------------------------------------------
üî¨ INSPECTING: SQuAD Master Eval Set
   Path: ./data/raw/squad_eval_set_all.json
   üìä Found 142192 records.
   ‚úÖ SUCCESS: SQuAD Master Eval Set looks healthy and ready for RAG.
--------------------------------------------------

üöÄ ALL SYSTEMS GO! Datasets are verified.


In [4]:
# Files
kb_path = "./data/raw/knowledge_base_raw.json"
eval_path = "./data/raw/squad_eval_set_all.json"
output_path = "./data/raw/knowledge_base_augmented.json"

print("üîÑ Starting Knowledge Base Augmentation...")

# 1. Load datasets
with open(kb_path, "r", encoding="utf-8") as f:
    kb_data = json.load(f)

with open(eval_path, "r", encoding="utf-8") as f:
    squad_data = json.load(f)

# 2. Map SQuAD contexts by title
# Question answered here: "For the article 'Beyoncé', what are all the
# paragraphs SQuAD expects?"
print("   ‚Ä¢ Mapping SQuAD contexts...")
squad_contexts_map = {}    # title -> set of contexts (de-duplication)
squad_context_order = {}   # title -> same contexts as a list, first-seen order

for row in squad_data:
    title = row['title']
    context = row['context']

    seen_contexts = squad_contexts_map.setdefault(title, set())
    if context not in seen_contexts:
        seen_contexts.add(context)
        # Iterating the set directly would give an order that changes between
        # runs; the list keeps the augmented file reproducible.
        squad_context_order.setdefault(title, []).append(context)

# 3. Augment knowledge base
# Walk the downloaded articles and append every SQuAD context whose text is
# not already present in the article body.
stats = {
    "articles_processed": 0,
    "contexts_checked": 0,
    "contexts_missing": 0,
    "contexts_added": 0
}

augmented_kb = []

print(f"   ‚Ä¢ Augmenting {len(kb_data)} articles...")

for article in tqdm(kb_data):
    title = article['id']  # 'id' holds the original SQuAD title used for matching
    current_text = article['full_text']

    # Contexts SQuAD expects for this title (empty when the title is unknown)
    required_contexts = squad_context_order.get(title, [])

    # Exact-substring check against the fetched text; the context is stripped
    # so leading/trailing whitespace alone does not count as a mismatch.
    missing_contexts = []
    for ctx in required_contexts:
        stats['contexts_checked'] += 1
        if ctx.strip() not in current_text:
            missing_contexts.append(ctx)
    stats['contexts_missing'] += len(missing_contexts)

    if missing_contexts:
        # Missing contexts go at the end of the article, each separated by a
        # blank line. No marker text is inserted; the 'augmented' flags below
        # are the only record that the article was modified.
        article['full_text'] = current_text + "\n\n" + "\n\n".join(missing_contexts)
        stats['contexts_added'] += len(missing_contexts)

        article['augmented'] = True
        article['augmented_count'] = len(missing_contexts)

    augmented_kb.append(article)
    stats['articles_processed'] += 1

# 4. Save
print(f"   ‚Ä¢ Saving Augmented KB to {output_path}...")
with open(output_path, "w", encoding='utf-8') as f:
    json.dump(augmented_kb, f, ensure_ascii=False, indent=4)

# 5. Report
print("="*50)
print("üöÄ AUGMENTATION COMPLETE")
print("="*50)
print(f"üìö Articles Processed: {stats['articles_processed']}")
print(f"üîç Contexts Checked:   {stats['contexts_checked']}")
print(f"‚ö†Ô∏è Contexts Missing:   {stats['contexts_missing']} (These would have caused RAG failure!)")
print(f"‚úÖ Contexts Injected:  {stats['contexts_added']}")
print("="*50)
print(f"The file '{output_path}' now contains the definitive")
print("source text tailored for your SQuAD evaluation.")

üîÑ Starting Knowledge Base Augmentation...
   ‚Ä¢ Mapping SQuAD contexts...
   ‚Ä¢ Augmenting 477 articles...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 477/477 [00:00<00:00, 2291.05it/s]


   ‚Ä¢ Saving Augmented KB to ./data/raw/knowledge_base_augmented.json...
üöÄ AUGMENTATION COMPLETE
üìö Articles Processed: 477
üîç Contexts Checked:   20233
‚ö†Ô∏è Contexts Missing:   18337 (These would have caused RAG failure!)
‚úÖ Contexts Injected:  18337
The file './data/raw/knowledge_base_augmented.json' now contains the definitive
source text tailored for your SQuAD evaluation.
