# Model Inference
This notebook loads the augmented questions and runs inference on each of them with 20 Hugging Face language models.
All model outputs (with metadata) are saved in `data/model_outputs.csv` for later analysis.


In [7]:
import importlib.util
import sys
import os
import csv

# Environment diagnostics: look for anything on sys.path that could shadow the
# real pandas package. These checks deliberately run BEFORE the project import
# below. NOTE(review): the traceback captured for this cell shows transformers
# failing on a partially initialised pandas, and the project import is the
# likely trigger (confirm). If the checks ran after it, they would never
# execute in exactly the situation they are meant to diagnose.
print("DEBUG - sys.path:", sys.path)
for path in sys.path:
    pandas_file = os.path.join(path, 'pandas.py')
    pandas_lib = os.path.join(path, 'pandas', '__init__.py')
    if os.path.exists(pandas_file):
        print(f"WARNING - Found a conflicting pandas.py file at: {pandas_file}")
    if os.path.exists(pandas_lib):
        print(f"DEBUG - Found pandas library at: {pandas_lib}")

# Report which location `import pandas` would resolve to. find_spec() on a
# top-level name locates the module without importing it, so this stays safe
# even when importing pandas itself is what is broken.
try:
    pandas_spec = importlib.util.find_spec("pandas")
except (ImportError, ValueError) as spec_error:
    print(f"WARNING - Could not resolve pandas: {spec_error}")
else:
    print("DEBUG - pandas resolves to:", pandas_spec.origin if pandas_spec else None)

# Third-party and project imports come last so the diagnostics above are
# printed even if one of these imports raises.
from tqdm import tqdm
from src.hf_model_inference import load_model_pipeline, query_model

# Load the augmented questions CSV. Each row is kept as a dict keyed by the
# CSV header, with `error_count` converted from text to int.
augmented_questions = []
csv_file_path = 'data/augmented_questions.csv'
if not os.path.exists(csv_file_path):
    raise FileNotFoundError(f"CSV file not found at: {csv_file_path}")

# newline='' hands newline handling to the csv module, so a quoted field that
# contains a line break is parsed as a single field (as the csv docs require).
with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        try:
            row['error_count'] = int(row['error_count'])
        except (KeyError, TypeError, ValueError) as exc:
            # KeyError: column absent; TypeError: short row (value is None);
            # ValueError: non-integer text. Report where it happened.
            raise ValueError(
                f"Invalid or missing 'error_count' at line {reader.line_num} "
                f"of {csv_file_path}: {row.get('error_count')!r}"
            ) from exc
        augmented_questions.append(row)

print(f"Loaded augmented dataset: {len(augmented_questions)} rows")

# Model identifiers to evaluate, one family per line.
# The order here is the order in which models are loaded and queried below.
model_names = [
    "gpt2",
    "EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B",
    "microsoft/DialoGPT-small", "microsoft/DialoGPT-medium", "microsoft/DialoGPT-large",
    "facebook/blenderbot-400M-distill", "facebook/blenderbot-90M",
    "gpt2-medium", "gpt2-large", "distilgpt2",
    "t5-small", "t5-base", "t5-large",
    "bigscience/bloom-560m", "bigscience/bloom-1b1", "bigscience/bloom-3b",
    "EleutherAI/gpt-j-6B",
    "ctrl",
]

# Build one pipeline per model name. A model that fails to load is reported
# and simply left out of the mapping, so later steps only see usable pipelines.
model_pipelines = {}
for checkpoint in model_names:
    try:
        print(f"Loading model: {checkpoint}")
        loaded_pipeline = load_model_pipeline(checkpoint)
    except Exception as load_error:
        print(f"Failed to load model {checkpoint}: {load_error}")
    else:
        model_pipelines[checkpoint] = loaded_pipeline

# Query every loaded pipeline with every question variant and collect one
# record per (question, model) pair.
outputs = []
for question_row in tqdm(augmented_questions, total=len(augmented_questions)):
    variant_text = question_row['variant_question']
    # Fields that are identical for every model answering this question.
    shared_fields = {
        'original_question': question_row['original_question'],
        'variant_question': variant_text,
        'error_count': question_row['error_count'],
    }
    for model_name, pipe in model_pipelines.items():
        try:
            output_text = query_model(pipe, variant_text)
        except Exception as query_error:
            # Best effort: keep the run going and store the failure text
            # in place of the model output.
            output_text = f"Error: {query_error}"
        outputs.append({
            **shared_fields,
            'model_name': model_name,
            'model_output': output_text,
        })

# Persist all collected records to a single CSV with a header row.
output_csv_path = 'data/model_outputs.csv'
fieldnames = ['original_question', 'variant_question', 'error_count', 'model_name', 'model_output']
with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(outputs)

print(f"Model outputs saved at {output_csv_path}")

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)