In [None]:
# Install required libraries with compatible versions
!pip install torch==2.5.1 transformers==4.44.2 tokenizers==0.19.1 peft==0.12.0 datasets==2.21.0 sentencepiece==0.2.0 accelerate==0.34.2 nltk==3.9.1 evaluate==0.4.3 rouge_score==0.1.2 bert-score==0.3.13 sacrebleu==2.4.3 numpy==1.26.4 fsspec==2024.6.1 spacy==3.7.6 --force-reinstall --no-cache-dir --quiet

# Set environment variables
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Import libraries
import json
import numpy as np
import torch
import nltk
import spacy
from datasets import load_dataset
import evaluate
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
    GenerationConfig
)
from peft import LoraConfig, get_peft_model, TaskType
# The pinned nltk==3.9.1 loads its sentence tokenizer from the "punkt_tab"
# resource; downloading only the legacy "punkt" package makes
# nltk.sent_tokenize (used for ROUGE post-processing) raise LookupError.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
# spaCy pipeline used by parse_question for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

# Set random seed for reproducibility
def seed_everything(seed=42):
    """Seed NumPy, PyTorch and the transformers ``set_seed`` helper.

    When CUDA is available, ``torch.cuda.manual_seed`` is called as well and
    cuDNN is switched to deterministic mode.
    """
    for seeder in (np.random.seed, torch.manual_seed, set_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Initialize model and tokenizer; config, tokenizer and weights all come from
# the same "t5-base" checkpoint name
model_name = "t5-base"
config = AutoConfig.from_pretrained(model_name)
# model_max_length=512 is the same limit preprocess_examples uses for inputs
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
# Resize the embedding matrix to len(tokenizer)
model.resize_token_embeddings(len(tokenizer))

# Explicitly enable gradient checkpointing with use_reentrant=False
# (the same kwargs are passed again via Seq2SeqTrainingArguments further down)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# Apply LoRA for parameter efficiency: r=8, alpha=32, dropout=0.1 on the
# modules named "q" and "v"
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"]
)
# `model` is rebound to the PEFT-wrapped model; everything below uses the wrapper
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Debug dataset: list the Kaggle input folders and preview the first three
# training records before loading anything.
print("Available datasets in /kaggle/input/:")
print(os.listdir('/kaggle/input/'))
print("Files in /kaggle/input/feta-dataset/:")
print(os.listdir('/kaggle/input/feta-dataset/'))
# Open with an explicit UTF-8 encoding so the preview does not depend on the
# platform's locale default.
with open('/kaggle/input/feta-dataset/fetaQA-v1_train.jsonl', 'r', encoding='utf-8') as f:
    print("First 3 lines of fetaQA-v1_train.jsonl:")
    for i, line in enumerate(f):
        if i >= 3:
            break
        try:
            parsed = json.loads(line.strip())
        except json.JSONDecodeError as e:
            # Show the raw line so a malformed record can be inspected.
            print(f"Line {i+1} (raw, failed to parse):", line.strip(), f"Error: {e}")
        else:
            print(f"Line {i+1} (parsed):", parsed)

# Read the three FeTaQA JSONL splits from the Kaggle input directory
feta_split_files = {
    'train': '/kaggle/input/feta-dataset/fetaQA-v1_train.jsonl',
    'valid': '/kaggle/input/feta-dataset/fetaQA-v1_dev.jsonl',
    'test': '/kaggle/input/feta-dataset/fetaQA-v1_test.jsonl',
}
dataset = load_dataset('json', data_files=feta_split_files)
first_train_example = dataset['train'][0]
print("Sample train example:", first_train_example)

# Question Parsing Module
def parse_question(question):
    """Extract content keywords and a coarse intent label from a question.

    Args:
        question: The natural-language question text.

    Returns:
        dict with
          * "keywords": lower-cased text of every token tagged NOUN, PROPN
            or NUM by the spaCy pipeline ``nlp``;
          * "intent": "aggregate" when the question contains one of the cue
            words total/sum/average/count as a whole token (surface form or
            lemma), otherwise "lookup".
    """
    aggregate_cues = {"total", "sum", "average", "count"}
    content_pos = {"NOUN", "PROPN", "NUM"}

    doc = nlp(question)
    keywords = [token.text.lower() for token in doc if token.pos_ in content_pos]

    # Match whole tokens rather than substrings of the raw question: a
    # substring test flags "country" (contains "count") or "summer"
    # (contains "sum") as aggregation requests.
    is_aggregate = any(
        token.text.lower() in aggregate_cues or token.lemma_.lower() in aggregate_cues
        for token in doc
    )
    intent = "aggregate" if is_aggregate else "lookup"
    return {"keywords": keywords, "intent": intent}

# Information Retrieval Module
def retrieve_relevant_cells(example):
    """Select the table cells that look relevant to the example's question.

    Columns are kept when their header contains a question keyword (all
    columns if none match). Rows are kept when they are referenced by
    ``highlighted_cell_ids`` or when any of their cells contains a keyword.

    Args:
        example: Mapping with "table_array" (first row is the header),
            "question" and "highlighted_cell_ids" (pairs whose first item is
            a row index into ``table_array``).

    Returns:
        ``(relevant_cells, intent)`` where ``relevant_cells`` is a list of
        ``[row, col]`` indices into ``table_array`` (row 0 being the header)
        in ascending row order, and ``intent`` is the label produced by
        ``parse_question``.
    """
    table_array = example['table_array']
    parsed = parse_question(example['question'])
    intent = parsed["intent"]

    # An empty table has no header row to index; nothing can be relevant.
    if not table_array:
        return [], intent

    # De-duplicate keywords (order kept) so repeated words are not re-tested.
    keywords = list(dict.fromkeys(kw.lower() for kw in parsed["keywords"]))
    header, rows = table_array[0], table_array[1:]

    def mentions_keyword(value):
        text = str(value).lower()
        return any(kw in text for kw in keywords)

    relevant_cols = [i for i, col in enumerate(header) if mentions_keyword(col)]
    if not relevant_cols:
        relevant_cols = list(range(len(header)))

    relevant_rows = set()
    for cell_id in example['highlighted_cell_ids'] or []:
        row_idx = cell_id[0] - 1  # shift from table_array index to body-row index
        # Skip header highlights and ids pointing past the end of the table.
        if 0 <= row_idx < len(rows):
            relevant_rows.add(row_idx)
    relevant_rows.update(
        i for i, row in enumerate(rows) if any(mentions_keyword(cell) for cell in row)
    )

    # sorted() makes the output order independent of set iteration order.
    relevant_cells = [
        [row_idx + 1, col_idx]
        for row_idx in sorted(relevant_rows)
        for col_idx in relevant_cols
    ]
    return relevant_cells, intent

# Reasoning Module
def reason_over_cells(table_array, relevant_cells, intent):
    """Aggregate the numeric values found in the selected cells.

    Args:
        table_array: Table as a list of rows (row 0 is the header).
        relevant_cells: Iterable of ``[row, col]`` indices into ``table_array``.
        intent: Intent label. "aggregate" yields the sum; a label containing
            "average" yields the mean. Any other label disables reasoning.

    Returns:
        ``"Aggregated value: <number>"`` or ``None`` when the intent does not
        ask for aggregation or no selected cell holds a finite number.
    """
    import math  # local import: only this function needs it

    # The original tested `'average' in intent` only after requiring
    # intent == "aggregate", so the averaging branch could never run.
    wants_average = isinstance(intent, str) and "average" in intent
    if intent != "aggregate" and not wants_average:
        return None

    values = []
    for row, col in relevant_cells:
        try:
            value = float(table_array[row][col])
        except (ValueError, TypeError, IndexError):
            # Non-numeric cell, or an index outside a short/ragged row.
            continue
        # "nan"/"inf" parse as floats but would poison the aggregate.
        if math.isfinite(value):
            values.append(value)

    if not values:
        return None
    total = sum(values)
    return f"Aggregated value: {total / len(values) if wants_average else total}"

# Enhanced Table Linearization
def linearize_table_context(example):
    """Flatten a question and its table into one tagged input string.

    Layout: ``[QUESTION] q [INTENT] i [TABLE] h1 | h2 [ROWS] `` followed by
    one ``c1 | c2 [ROW] `` segment per body row, with cells returned by
    ``retrieve_relevant_cells`` wrapped in asterisks, and an optional
    trailing ``[REASONING] ... `` segment when ``reason_over_cells``
    produces output.

    Args:
        example: Mapping with "table_array", "question" and
            "highlighted_cell_ids".

    Returns:
        The linearized string.
    """
    table_array = example['table_array']
    question = example['question']
    relevant_cells, intent = retrieve_relevant_cells(example)
    # A set of (row, col) tuples gives O(1) membership; scanning the list for
    # every cell made the original quadratic in table size.
    marked = {tuple(cell) for cell in relevant_cells}

    header = table_array[0]
    parts = [
        f"[QUESTION] {question} [INTENT] {intent} [TABLE] ",
        " | ".join(str(cell) for cell in header),
        " [ROWS] ",
    ]

    for i, row in enumerate(table_array[1:], 1):
        rendered = [
            f"*{cell}*" if (i, j) in marked else str(cell)
            for j, cell in enumerate(row)
        ]
        parts.append(" | ".join(rendered) + " [ROW] ")

    reasoning_output = reason_over_cells(table_array, relevant_cells, intent)
    if reasoning_output:
        parts.append(f"[REASONING] {reasoning_output} ")

    # Single join instead of repeated string concatenation.
    return "".join(parts)

# Preprocess dataset
def preprocess_examples(examples):
    """Tokenize linearized tables and answers for seq2seq training.

    Args:
        examples: List of example dicts, each with the fields required by
            ``linearize_table_context`` plus "answer".

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an added "labels" entry: answer token ids padded/truncated to 64
        tokens, with pad positions replaced by -100. All values are plain
        Python lists.
    """
    prefix = 'answer: '
    inputs = [prefix + linearize_table_context(example) for example in examples]
    answers = [example['answer'] for example in examples]

    # No return_tensors="pt": the label masking below walks the ids one by
    # one, and iterating a torch tensor yields 0-d tensors, which left
    # "labels" as nested lists of tensor scalars mixed with ints.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    label_ids = tokenizer(
        answers,
        max_length=64,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    # -100 marks positions the loss should skip.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Apply preprocessing
def process_batch(batch):
    """Adapt a column-oriented batch to ``preprocess_examples``.

    Args:
        batch: Mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``preprocess_examples`` returns for the row-oriented batch.
    """
    keys = list(batch.keys())
    # Walk all columns in lockstep instead of indexing each by range(len(...)).
    rows = [dict(zip(keys, values)) for values in zip(*(batch[k] for k in keys))]
    return preprocess_examples(rows)

# Tokenize every split in batches and drop the raw columns afterwards
train_split, valid_split, test_split = dataset['train'], dataset['valid'], dataset['test']
encoded_train_ds = train_split.map(process_batch, batched=True, remove_columns=train_split.column_names)
encoded_val_ds = valid_split.map(process_batch, batched=True, remove_columns=valid_split.column_names)
encoded_test_ds = test_split.map(process_batch, batched=True, remove_columns=test_split.column_names)

# Post-process text for evaluation
def postprocess_text(preds, labels, metric_name):
    """Shape decoded predictions and references for a given metric.

    Both lists are whitespace-stripped. For "rouge" every text is re-joined
    with one sentence per line; for "sacrebleu" each reference is wrapped in
    its own single-item list. Any other metric name gets the stripped texts.

    Returns:
        Tuple ``(preds, labels)``.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    if metric_name == "sacrebleu":
        return stripped_preds, [[reference] for reference in stripped_labels]
    if metric_name != "rouge":
        return stripped_preds, stripped_labels

    def one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text))

    return (
        [one_sentence_per_line(text) for text in stripped_preds],
        [one_sentence_per_line(text) for text in stripped_labels],
    )

# Compute evaluation metrics with robust handling
def compute_metrics(eval_preds):
    """Decode generations and references, dump them to disk and score them.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. If ``predictions`` is
            a tuple, only its first element is used.

    Returns:
        dict with "sacrebleu", the ROUGE scores under the metric's own key
        names, and "bertscore_precision" / "bertscore_recall" /
        "bertscore_f1". A metric that fails is reported as 0.0 instead of
        aborting evaluation.

    Side effects:
        Overwrites ``predictions.json`` inside ``training_args.output_dir``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Map -100 padding to the pad token first, then clip any remaining
    # out-of-range id so decoding cannot fail on it.
    preds = np.where(preds != -100, preds, pad_id)
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    # Decode predictions one by one so a single bad sequence does not abort
    # the whole evaluation.
    decoded_preds = []
    for pred in preds:
        try:
            decoded = tokenizer.decode(pred, skip_special_tokens=True)
            decoded_preds.append(decoded if decoded else "<empty>")
        except Exception as e:
            print(f"Error decoding prediction: {e}")
            decoded_preds.append("<error>")

    # Decode labels
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    # Save predictions and labels for debugging
    save_path = os.path.join(training_args.output_dir, "predictions.json")
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({"predictions": decoded_preds, "labels": decoded_labels}, f, indent=4)

    result = {}
    for metric_name in ["sacrebleu", "rouge", "bertscore"]:
        # Loading and post-processing sit inside the try as well, so a missing
        # metric or tokenizer resource is reported like any other failure.
        try:
            metric = evaluate.load(metric_name)
            decoded_preds_proc, decoded_labels_proc = postprocess_text(
                decoded_preds, decoded_labels, metric_name
            )
            if metric_name == "rouge":
                res = metric.compute(predictions=decoded_preds_proc, references=decoded_labels_proc)
                # The metric's keys already read "rouge1", "rougeL", ...;
                # prefixing them again produced "rougerouge1".
                result.update({k: round(float(v), 4) for k, v in res.items()})
            elif metric_name == "sacrebleu":
                res = metric.compute(predictions=decoded_preds_proc, references=decoded_labels_proc)
                result["sacrebleu"] = round(res["score"], 4)
            elif metric_name == "bertscore":
                res = metric.compute(
                    predictions=decoded_preds_proc,
                    references=decoded_labels_proc,
                    lang="en",
                    model_type="roberta-base",
                )
                result.update({
                    "bertscore_precision": round(float(np.mean(res["precision"])), 4),
                    "bertscore_recall": round(float(np.mean(res["recall"])), 4),
                    "bertscore_f1": round(float(np.mean(res["f1"])), 4),
                })
        except Exception as e:
            print(f"Error computing {metric_name}: {e}")
            if metric_name == "bertscore":
                # "bertscore_f1" is the model-selection metric and must
                # always be present in the result.
                result.update({
                    "bertscore_precision": 0.0,
                    "bertscore_recall": 0.0,
                    "bertscore_f1": 0.0,
                })
            else:
                result[metric_name] = 0.0

    return result

# Training configuration: generation-based evaluation, best checkpoint
# selected by BERTScore F1
training_args = Seq2SeqTrainingArguments(
    # Output location and phases to run
    output_dir="/kaggle/working/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,
    optim="adamw_torch",
    # Batching: 2 examples per device, accumulated over 8 steps
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Memory and precision
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Generation during evaluation
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,  # Use beam search for stable generation
    # Evaluate, save and log on a step schedule
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=500,
    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="bertscore_f1",
)

# Initialize data collator; labels are padded with -100, the same value
# preprocess_examples writes over pad positions in the labels
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)

# Initialize trainer: trains on the encoded train split, evaluates on the
# encoded validation split and scores outputs with compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_ds,
    eval_dataset=encoded_val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on test set and print the metrics attached to the prediction output
test_results = trainer.predict(encoded_test_ds)
print("Test Results:", test_results.metrics)

# Save the model into a subdirectory of the training output directory
trainer.save_model("/kaggle/working/tableqa_model")

In [None]:
# Install required libraries with compatible versions
!pip install torch==2.5.1 torchvision==0.20.1 transformers==4.44.2 tokenizers==0.19.1 peft==0.12.0 datasets==2.21.0 sentencepiece==0.2.0 accelerate==0.34.2 nltk==3.9.1 evaluate==0.4.3 rouge_score==0.1.2 bert-score==0.3.13 sacrebleu==2.4.3 numpy==1.26.4 fsspec==2024.6.1 --force-reinstall --no-cache-dir --quiet

# Import libraries
import numpy as np
import torch
import os
import nltk
import json
from datasets import load_dataset
import evaluate
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)

# The pinned nltk==3.9.1 loads its sentence tokenizer from the "punkt_tab"
# resource; downloading only the legacy "punkt" package makes
# nltk.sent_tokenize (used for ROUGE post-processing) raise LookupError.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set random seed for reproducibility
def seed_everything(seed=42):
    """Seed NumPy, PyTorch and the transformers ``set_seed`` helper.

    When CUDA is available, ``torch.cuda.manual_seed`` is called as well and
    cuDNN is switched to deterministic mode.
    """
    for seeder in (np.random.seed, torch.manual_seed, set_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything()

# Debug dataset path and file content
print("Available datasets in /kaggle/input/:")
print(os.listdir('/kaggle/input/'))
print("Files in /kaggle/input/feta-dataset/:")
print(os.listdir('/kaggle/input/feta-dataset/'))

# Inspect first few lines of the JSONL file. The encoding is passed
# explicitly so the preview does not depend on the platform's locale default.
with open('/kaggle/input/feta-dataset/fetaQA-v1_train.jsonl', 'r', encoding='utf-8') as f:
    print("First 3 lines of fetaQA-v1_train.jsonl:")
    for i, line in enumerate(f):
        if i >= 3:
            break
        try:
            parsed = json.loads(line.strip())
        except json.JSONDecodeError as e:
            # Show the raw line so a malformed record can be inspected.
            print(f"Line {i+1} (raw, failed to parse):", line.strip(), f"Error: {e}")
        else:
            print(f"Line {i+1} (parsed):", parsed)

# Read the three FeTaQA JSONL splits from the Kaggle input directory
feta_split_files = {
    'train': '/kaggle/input/feta-dataset/fetaQA-v1_train.jsonl',
    'valid': '/kaggle/input/feta-dataset/fetaQA-v1_dev.jsonl',
    'test': '/kaggle/input/feta-dataset/fetaQA-v1_test.jsonl',
}
dataset = load_dataset('json', data_files=feta_split_files)

# Show one record to confirm the expected fields are present
first_train_example = dataset['train'][0]
print("Sample train example:", first_train_example)

# Initialize model and tokenizer; config, tokenizer and weights all come from
# the same "t5-base" checkpoint name
model_name = "t5-base"
config = AutoConfig.from_pretrained(model_name)
# model_max_length=1024 is the same limit preprocess_examples uses for inputs
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=1024)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
# Resize the embedding matrix to len(tokenizer)
model.resize_token_embeddings(len(tokenizer))

# Enhanced table linearization
def linearize_table_context(example):
    """Flatten a question and its table into one tagged input string.

    Layout: ``[QUESTION] q [TABLE] h1 | h2 [ROWS] `` followed by one
    ``c1 | c2 [ROW] `` segment per body row. Cells whose ``[row, col]``
    index appears in ``highlighted_cell_ids`` are wrapped in asterisks.

    Args:
        example: Mapping with "table_array" (first row is the header),
            "question" and "highlighted_cell_ids" (``[row, col]`` pairs
            indexing ``table_array``; ``None`` is treated as empty).

    Returns:
        The linearized string. An empty table yields only the question and
        the empty table markers.
    """
    table_array = example['table_array']
    question = example['question']
    # A set of (row, col) tuples gives O(1) membership; scanning the list for
    # every cell made the original quadratic in table size.
    highlighted = {tuple(cell_id) for cell_id in example['highlighted_cell_ids'] or ()}

    # Tolerate an empty table instead of failing on table_array[0].
    header = table_array[0] if table_array else []
    parts = [
        f"[QUESTION] {question} [TABLE] ",
        " | ".join(str(cell) for cell in header),
        " [ROWS] ",
    ]

    # Body rows are numbered from 1 so indices line up with table_array.
    for i, row in enumerate(table_array[1:], 1):
        rendered = [
            f"*{cell}*" if (i, j) in highlighted else str(cell)
            for j, cell in enumerate(row)
        ]
        parts.append(" | ".join(rendered) + " [ROW] ")

    # Single join instead of repeated string concatenation.
    return "".join(parts)

# Preprocess dataset
def preprocess_examples(examples):
    """Tokenize linearized tables and answers for seq2seq training.

    Args:
        examples: List of example dicts, each with the fields required by
            ``linearize_table_context`` plus "answer".

    Returns:
        The tokenizer output for the inputs (padded/truncated to 1024
        tokens) with an added "labels" entry: answer token ids
        padded/truncated to 128 tokens, with pad positions replaced by -100.
        All values are plain Python lists.
    """
    prefix = 'answer: '
    inputs = [prefix + linearize_table_context(example) for example in examples]
    answers = [example['answer'] for example in examples]

    # Tokenize inputs. No return_tensors="pt": the label masking below walks
    # the ids one by one, and iterating a torch tensor yields 0-d tensors,
    # which left "labels" as nested lists of tensor scalars mixed with ints.
    model_inputs = tokenizer(
        inputs,
        max_length=1024,
        padding="max_length",
        truncation=True,
    )

    # Tokenize labels
    label_ids = tokenizer(
        answers,
        max_length=128,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    # Replace padding token id with -100 for loss calculation
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Apply preprocessing with explicit batch processing
def process_batch(batch):
    """Adapt a column-oriented batch to ``preprocess_examples``.

    Prints the batch type, its keys and its first example for debugging.

    Args:
        batch: Mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``preprocess_examples`` returns for the row-oriented batch.
    """
    # Debug batch input
    print("Batch type:", type(batch))
    print("Batch keys:", batch.keys())

    # Convert dictionary of lists to list of dictionaries by walking all
    # columns in lockstep instead of indexing each by range(len(...)).
    keys = list(batch.keys())
    rows = [dict(zip(keys, values)) for values in zip(*(batch[k] for k in keys))]

    # Debug first example in batch; an empty batch has none to show.
    if rows:
        print("First example in batch:", rows[0], type(rows[0]))

    return preprocess_examples(rows)

# Tokenize every split in batches and drop the raw columns afterwards
train_split, valid_split, test_split = dataset['train'], dataset['valid'], dataset['test']
encoded_train_ds = train_split.map(process_batch, batched=True, remove_columns=train_split.column_names)
encoded_val_ds = valid_split.map(process_batch, batched=True, remove_columns=valid_split.column_names)
encoded_test_ds = test_split.map(process_batch, batched=True, remove_columns=test_split.column_names)

# Post-process text for evaluation
def postprocess_text(preds, labels, metric_name):
    """Shape decoded predictions and references for a given metric.

    Both lists are whitespace-stripped. For "rouge" every text is re-joined
    with one sentence per line; for "sacrebleu" each reference is wrapped in
    its own single-item list. Any other metric name gets the stripped texts.

    Returns:
        Tuple ``(preds, labels)``.
    """
    clean_preds = [candidate.strip() for candidate in preds]
    clean_labels = [reference.strip() for reference in labels]

    if metric_name == "sacrebleu":
        # One single-reference list per prediction
        return clean_preds, [[reference] for reference in clean_labels]

    if metric_name == "rouge":
        def split_sentences(text):
            return "\n".join(nltk.sent_tokenize(text))

        clean_preds = [split_sentences(candidate) for candidate in clean_preds]
        clean_labels = [split_sentences(reference) for reference in clean_labels]

    return clean_preds, clean_labels

# Compute evaluation metrics
def compute_metrics(eval_preds):
    """Decode generations and references, dump them to disk and score them.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. If ``predictions`` is
            a tuple, only its first element is used.

    Returns:
        dict with "sacrebleu", the ROUGE scores under the metric's own key
        names, "bertscore_precision" / "bertscore_recall" / "bertscore_f1"
        and "gen_len" (mean number of non-pad tokens per prediction).

    Side effects:
        Overwrites ``predictions.json`` inside ``training_args.output_dir``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Predictions can carry -100 padding just like labels; the tokenizer
    # cannot decode it, and it would also be counted as a real token in
    # gen_len below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Save predictions and labels
    save_path = os.path.join(training_args.output_dir, "predictions.json")
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({"predictions": decoded_preds, "labels": decoded_labels}, f, indent=4)

    result = {}
    for metric_name in ["sacrebleu", "rouge", "bertscore"]:
        metric = evaluate.load(metric_name)
        decoded_preds_proc, decoded_labels_proc = postprocess_text(decoded_preds, decoded_labels, metric_name)

        if metric_name == "rouge":
            res = metric.compute(predictions=decoded_preds_proc, references=decoded_labels_proc)
            # The metric's keys already read "rouge1", "rougeL", ...;
            # prefixing them again produced "rougerouge1".
            result.update({k: round(float(v), 4) for k, v in res.items()})
        elif metric_name == "sacrebleu":
            res = metric.compute(predictions=decoded_preds_proc, references=decoded_labels_proc)
            result["sacrebleu"] = round(res["score"], 4)
        elif metric_name == "bertscore":
            res = metric.compute(predictions=decoded_preds_proc, references=decoded_labels_proc, lang="en")
            result.update({
                "bertscore_precision": round(float(np.mean(res["precision"])), 4),
                "bertscore_recall": round(float(np.mean(res["recall"])), 4),
                "bertscore_f1": round(float(np.mean(res["f1"])), 4),
            })

    # Calculate average generation length over non-pad tokens
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = round(float(np.mean(prediction_lens)), 4)

    return result

# Training configuration: generation-based evaluation, best checkpoint
# selected by BERTScore F1
training_args = Seq2SeqTrainingArguments(
    # Output location and phases to run
    output_dir="/kaggle/working/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,
    optim="adamw_torch",
    # Batching: 4 examples per device, accumulated over 4 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Precision
    fp16=torch.cuda.is_available(),
    # Generation during evaluation
    predict_with_generate=True,
    # Evaluate, save and log on a step schedule
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=500,
    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="bertscore_f1",
)

# Initialize data collator; labels are padded with -100, the same value
# preprocess_examples writes over pad positions in the labels
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)

# Initialize trainer: trains on the encoded train split, evaluates on the
# encoded validation split and scores outputs with compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_ds,
    eval_dataset=encoded_val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on test set and print the metrics attached to the prediction output
test_results = trainer.predict(encoded_test_ds)
print("Test Results:", test_results.metrics)

# Save the model into a subdirectory of the training output directory
trainer.save_model("/kaggle/working/tableqa_model")

In [None]:
# Print a heading and the directory listing for the Kaggle input root and
# for the FeTaQA dataset folder
for heading, folder in (
    ("Available datasets in /kaggle/input/:", '/kaggle/input/'),
    ("Files in /kaggle/input/feta-dataset/:", '/kaggle/input/feta-dataset/'),
):
    print(heading)
    print(os.listdir(folder))