# CQ-Gens Task
This notebook demonstrates a system to generate critical questions (CQs) for interventions using advanced NLP techniques. The workflow includes:
- Loading and normalizing a dataset.
- Utilizing LLaMA and SentenceTransformer models for question generation and ranking.
- Adaptive generation with meta-evaluation.
- External evaluation of predictions.

Let's start by setting up the environment and loading necessary libraries.

# Load model: deep-hermes-3-llama-3-8b-preview.Q4_K_M.gguf

In [None]:
!pip install llama-cpp-python --no-cache-dir
!pip install evaluate --no-cache-dir

# Import Libraries and Setup (Code)

In [None]:
import json
import subprocess
import logging
import re
import nltk
from nltk.tokenize import sent_tokenize
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer, util

# Download required NLTK data if needed
nltk.download('punkt', quiet=True)

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

## Dataset Loading

Load the sample dataset from a JSON file stored in Kaggle's input directory. Error handling ensures robustness if the file is missing or corrupted.

In [None]:
# Load dataset
try:
    with open("/kaggle/input/dataset-cri/sample.json", "r") as f:
        sample_data = json.load(f)
    logging.info("Dataset loaded successfully.")
except Exception as e:
    logging.error(f"Failed to load dataset: {e}")
    sample_data = {}

## Text Normalization

This section defines and applies an enhanced punctuation normalization function using NLTK's sentence tokenization. It ensures proper sentence endings and checks punctuation ratio.

In [None]:
def enhanced_normalize_text(text, min_punctuation_ratio=0.4):
    """
    Normalize text by using sentence tokenization to ensure proper punctuation.
    Adds a period at the end of sentences if missing.
    """
    if not text:
        return text
    sentences = sent_tokenize(text)
    punctuation_chars = set(".,;:!?")
    norm_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and sentence[-1] not in punctuation_chars:
            sentence += "."
        norm_sentences.append(sentence)
    normalized_text = " ".join(norm_sentences)
    count = sum(1 for char in normalized_text if char in punctuation_chars)
    ratio = count / len(normalized_text) if normalized_text else 0
    if ratio < min_punctuation_ratio:
        logging.warning(f"Low punctuation ratio ({ratio}) detected after normalization.")
    return normalized_text

# Apply normalization to interventions
for key, data in sample_data.items():
    sample_data[key]["intervention"] = enhanced_normalize_text(data.get("intervention", ""))

## Model Loading

Load the LLaMA generation model and SentenceTransformer models for similarity, coherence, and relevance scoring. Error handling ensures the pipeline stops if models fail to load.

In [None]:
# Load generation model
gen_model_path = "/kaggle/input/model-llama/DeepHermes-3-Llama-3-8B-q4.gguf"
try:
    gen_model = Llama(model_path=gen_model_path, n_ctx=4096)
    logging.info("Generation model loaded successfully.")
except Exception as e:
    logging.error(f"Error loading generation model: {e}")
    raise

# Load SentenceTransformer models
try:
    sim_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    coherence_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    relevance_model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
    logging.info("SentenceTransformer models loaded successfully.")
except Exception as e:
    logging.error(f"Error loading SentenceTransformer models: {e}")
    raise

## Helper Functions

Define utility functions for text processing, question extraction, and quality checks to support the generation pipeline.

In [None]:
def extract_numbered_questions(text):
    """Extracts numbered sentences ending with a question mark using regex."""
    pattern = re.compile(r"(?m)^\s*(\d+)\.\s*(.+\?)\s*$")
    matches = pattern.findall(text)
    if matches:
        return [f"{num}. {q.strip()}" for num, q in matches]
    lines = text.split("\n")
    return [line.strip() for line in lines if line.strip().endswith("?")]

def is_interrogative(sentence):
    """Checks if a sentence is interrogative (ends with a question mark)."""
    return sentence.strip().endswith("?")

def post_process_cqs(cqs):
    """Validate and clean generated critical questions."""
    processed = [cq for cq in cqs if re.match(r"^\d+\.\s+", cq) and len(cq.split()) > 5 and is_interrogative(cq)]
    if not processed:
        joined = "\n".join(cqs)
        processed = extract_numbered_questions(joined)
    return processed if processed else cqs

def heuristic_quality_check(cqs):
    """Check that questions are diverse (at least two distinct questions)."""
    if len(cqs) < 2:
        return False
    words_list = [set(cq.lower().split()) for cq in cqs]
    for i in range(len(words_list)):
        for j in range(i+1, len(words_list)):
            overlap = words_list[i].intersection(words_list[j])
            if len(overlap) / min(len(words_list[i]), len(words_list[j])) > 0.6:
                return False
    return True

## Question Generation

Implement functions to generate critical questions using few-shot prompting, rank them with semantic models, and perform meta-evaluation.

In [None]:
def generate_cqs(text, variation=0, temperature=0.8):
    """Generate exactly three critical questions with few-shot examples."""
    few_shot_example = "Example:\n1. How does the argument define its key terms?\n2. What assumptions are being made without evidence?\n3. How might alternative perspectives challenge the argument?\n\n"
    prompts = [
        f"Read the following argument:\n{text}\n\n{few_shot_example}Now, generate exactly three critical questions... Start with '1.'",
        f"Consider the argument below:\n{text}\n\n{few_shot_example}Produce exactly three incisive critical questions... Begin with '1.'"
    ]
    prompt = prompts[variation % len(prompts)]
    try:
        output = gen_model(prompt, max_tokens=250, temperature=temperature, top_p=0.9, stop=["4."])
        generated_text = output["choices"][0]["text"].strip()
        cqs = extract_numbered_questions(generated_text)
        cqs = post_process_cqs(cqs)
        logging.info(f"Generated CQs: {cqs}")
        return cqs[:3] if cqs else []
    except Exception as e:
        logging.error(f"Error generating CQs for '{text[:50]}...': {e}")
        return []

def rank_cqs(intervention, cqs):
    """Rank CQs using similarity, coherence, and relevance scores."""
    if not cqs:
        return []
    try:
        intervention_emb_sim = sim_model.encode(intervention, convert_to_tensor=True)
        cq_embs_sim = sim_model.encode(cqs, convert_to_tensor=True)
        sim_scores = [util.pytorch_cos_sim(intervention_emb_sim, cq_emb)[0].item() for cq_emb in cq_embs_sim]
        
        intervention_emb_coh = coherence_model.encode(intervention, convert_to_tensor=True)
        cq_embs_coh = coherence_model.encode(cqs, convert_to_tensor=True)
        coherence_scores = [float(util.pytorch_cos_sim(intervention_emb_coh, coh_emb)[0].item()) for coh_emb in cq_embs_coh]

        intervention_emb_rel = relevance_model.encode(intervention, convert_to_tensor=True)
        cq_embs_rel = relevance_model.encode(cqs, convert_to_tensor=True)
        relevance_scores = [float(util.pytorch_cos_sim(intervention_emb_rel, rel_emb)[0].item()) for rel_emb in cq_embs_rel]

        final_scores = [0.4 * sim + 0.3 * coh + 0.3 * rel for sim, coh, rel in zip(sim_scores, coherence_scores, relevance_scores)]
        ranked = sorted(zip(final_scores, cqs), key=lambda x: x[0], reverse=True)
        ranked_cqs = [cq for score, cq in ranked]
        logging.info(f"Ranked CQs: {ranked_cqs}")
        return ranked_cqs[:3]
    except Exception as e:
        logging.error(f"Error ranking CQs for '{intervention[:50]}...': {e}")
        return []

def meta_evaluate_cqs(intervention, cqs):
    """Evaluate CQ effectiveness with a confidence score (1-5)."""
    if not cqs:
        return False
    eval_prompt = f"Review the argument: {intervention}\n\nCritical Questions:\n" + "\n".join([f"- {cq}" for cq in cqs]) + "\n\nScore (1-5) how effectively these questions challenge the argument. Provide only a number."
    try:
        eval_output = gen_model(eval_prompt, max_tokens=50, temperature=0.3, top_p=0.9)
        eval_response = eval_output["choices"][0]["text"].strip()
        score = float(eval_response) if eval_response.isdigit() else 0
        logging.info(f"Meta evaluation score: {score}")
        return score >= 3.5 and heuristic_quality_check(cqs)
    except Exception as e:
        logging.error(f"Error during meta evaluation for '{intervention[:50]}...': {e}")
        return False

## Adaptive Generation

Implement an adaptive loop to generate and refine CQs with dynamic parameter adjustments, returning the best set based on meta-evaluation.

In [None]:
def adaptive_generation(intervention, max_attempts=5):
    """Adaptive generation with parameter adjustments."""
    attempt = 0
    temperature = 0.8
    variation = 0
    best_cqs = []
    best_score = 0
    while attempt < max_attempts:
        logging.info(f"Attempt {attempt+1} for '{intervention[:50]}...' (temp={temperature}, variation={variation})")
        generated_cqs = generate_cqs(intervention, variation, temperature)
        if not generated_cqs:
            temperature = max(0.5, temperature - 0.1)
            variation += 1
            attempt += 1
            continue
        ranked_cqs = rank_cqs(intervention, generated_cqs)
        eval_prompt = f"Review: {intervention}\n\nQuestions:\n" + "\n".join([f"- {cq}" for cq in ranked_cqs]) + "\n\nScore (1-5):"
        try:
            eval_output = gen_model(eval_prompt, max_tokens=50, temperature=0.3, top_p=0.9)
            current_score = float(eval_output["choices"][0]["text"].strip()) if eval_output["choices"][0]["text"].strip().isdigit() else 0
        except Exception as e:
            current_score = 0
        if current_score >= 3.5 and heuristic_quality_check(ranked_cqs):
            return ranked_cqs
        if current_score > best_score:
            best_score = current_score
            best_cqs = ranked_cqs
        temperature = max(0.5, temperature - 0.1) if temperature > 0.5 else 0.8
        variation += 1 if temperature <= 0.5 else 0
        attempt += 1
    return best_cqs if best_cqs else ["Could not generate valid questions."]

## Prediction Generation

Process all interventions in the dataset to generate and store predictions in a JSON file.

In [None]:
def generate_predictions(dataset):
    """Generate predictions for all interventions."""
    predictions = {}
    for intervention_id, data in dataset.items():
        intervention_text = data["intervention"]
        best_cqs = adaptive_generation(intervention_text)
        if not best_cqs:
            logging.error(f"No questions for intervention {intervention_id}")
            best_cqs = ["Could not generate valid questions."]
        predictions[intervention_id] = {
            "intervention_id": intervention_id,
            "intervention": intervention_text,
            "dataset": data.get("dataset", "unknown"),
            "cqs": [{"id": i, "cq": cq} for i, cq in enumerate(best_cqs)]
        }
    return predictions

# Execute predictions
if __name__ == "__main__":
    predictions = generate_predictions(sample_data)

## Save Predictions and Evaluation

Save the generated predictions to a JSON file and run an external evaluation script to assess quality.

In [None]:
# Save predictions
try:
    with open("predictions.json", "w") as f:
        json.dump(predictions, f, indent=2)
    logging.info("Predictions saved successfully in 'predictions.json'.")
except Exception as e:
    logging.error(f"Error saving predictions: {e}")

# Run external evaluation
try:
    result = subprocess.run([
        "python", "/kaggle/input/evaluation/evaluation.py",
        "--input_path", "/kaggle/input/dataset-cri/sample.json",
        "--submission_path", "predictions.json",
        "--metric", "similarity",
        "--threshold", "0.6"
    ], check=True, capture_output=True, text=True)
    logging.info("Evaluation completed successfully.")
    print(result.stdout)
except Exception as e:
    logging.error(f"Evaluation error: {e}")

## Open the prediction file

In [None]:
import json

with open("predictions.json", "r") as f:
    predictions = json.load(f)

# Print the entire JSON content (for smaller files):
print(json.dumps(predictions, indent=4)) #indent=4 makes it easier to read. 

# Or, print specific parts of the JSON:
for intervention_id, data in predictions.items():
    print(f"Intervention ID: {intervention_id}")
    print(f"Intervention: {data['intervention']}")
    for cq in data['cqs']:
        print(f"  CQ {cq['id']}: {cq['cq']}")
    print("-" * 20)

## Validation phase: Generate predictions on validation dataset using tuned parameters

In [3]:

# Load dataset
try:
    with open("/kaggle/input/dataset-cri/validation.json", "r") as f:
        sample_data = json.load(f)
    logging.info("Dataset loaded successfully.")
except Exception as e:
    logging.error(f"Failed to load dataset: {e}")
    sample_data = {}
    
logging.info("Generating predictions on validation dataset using tuned parameters.")
validation_predictions = generate_predictions(validation_data, tuned_config, use_adaptive=False)
with open("validation_predictions.json", "w") as f:
    json.dump(validation_predictions, f, indent=2)

try:
    result_val = subprocess.run([
        "python",
        "/kaggle/input/evaluation/evaluation.py",
        "--input_path",
        "/kaggle/input/dataset-cri/validation.json",
        "--submission_path",
        "validation_predictions.json",
        "--metric",
        "similarity",
        "--threshold",
        str(threshold)
    ], check=True, capture_output=True, text=True)
    val_output = result_val.stdout.strip()
    logging.info("Validation evaluation completed successfully.")
    logging.info(f"Validation evaluation output: {val_output}")
except Exception as e:
    logging.error(f"Validation evaluation error: {str(e)}")

# ---------------------------
# Final Output: Tuned parameters and evaluation scores
# ---------------------------
print("Training complete. Tuned parameters:")
print(tuned_config)
print(f"Train Evaluation Score: {tuned_train_score}")
print("Validation predictions generated and evaluated. Check logs for detailed evaluation output.")

NameError: name 'validation_data' is not defined