In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lrec-dataset/sentencePair.txt
/kaggle/input/lrec-dataset/sentencePair_neg.txt


In [15]:
# pip install -q openai pandas tenacity

import json
import math
import os
from typing import Any, Dict, List

import pandas as pd
from openai import APIConnectionError, InternalServerError, OpenAI, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

# Runtime configuration; each value can be overridden through an environment variable.
# Chat model passed to the completions endpoint (env: LLM_MODEL).
MODEL = os.environ.get("LLM_MODEL", "gpt-4o-mini")
# Sampling temperature passed to the completions endpoint (env: TEMPERATURE).
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.0"))
# Fed to tenacity's stop_after_attempt on the batch classifier (env: MAX_RETRIES).
MAX_RETRIES = int(os.environ.get("MAX_RETRIES", "5"))

# System prompt sent with every classification request.
# - It describes the input payload, because the user message is a bare JSON
#   object {"sentences": [...]} with no accompanying instructions.
# - It asks for one item per sentence, in input order, with the sentence text
#   copied verbatim, because classify_sentences matches items back to inputs
#   by their text.
# - It no longer points to "earlier guidelines": only this prompt and the
#   user payload are sent, so the model never sees any such guidelines.
# NOTE(review): the prompt gives no definitions for the three labels; if the
# project has annotation guidelines, add them here.
SYSTEM_PROMPT = """You are an expert annotator for argument mining in legal and formal texts.
The user message is a JSON object of the form {"sentences": ["...", ...]}.
Classify EACH input sentence as exactly ONE of:
- Premise
- Conclusion
- Non-argumentative
Return JSON:
{"items":[{"text":"...","label":"Premise|Conclusion|Non-argumentative","confidence":0-1}, ...]}
Return exactly one item per input sentence, in the same order as the input.
Copy each sentence into "text" verbatim, without rewording, trimming or re-punctuating it."""

def _load_sentences_from_pairs(paths: List[str]) -> List[str]:
    sentences = []
    for p in paths:
        df = pd.read_csv(p, sep="\t", header=None, dtype=str, on_bad_lines="skip", quoting=3, engine="python")
        # Try common columns (3 and 6), fall back to all columns as text
        cols = []
        if df.shape[1] >= 4: cols.append(3)
        if df.shape[1] >= 7: cols.append(6)
        if not cols: cols = list(range(df.shape[1]))
        for c in cols:
            sentences.extend(df.iloc[:, c].dropna().astype(str).tolist())
    # unique & stripped
    seen, uniq = set(), []
    for s in sentences:
        s = s.strip()
        if s and s not in seen:
            seen.add(s); uniq.append(s)
    return uniq

class JsonShapeError(Exception):
    """Raised when the model's reply lacks the expected JSON structure."""

@retry(reraise=True, stop=stop_after_attempt(MAX_RETRIES),
       wait=wait_exponential(min=2, max=30),
       # Retry only failures another attempt can plausibly fix: a malformed
       # model reply or a transient API condition (connection/timeout, rate
       # limit, 5xx). Permanent errors such as a bad API key, an invalid
       # request or a programming mistake propagate immediately instead of
       # being retried with back-off.
       retry=retry_if_exception_type((
           JsonShapeError, json.JSONDecodeError,
           APIConnectionError, RateLimitError, InternalServerError,
       )))
def _classify_batch(client: OpenAI, batch: List[str]) -> List[Dict[str, Any]]:
    """Classify one batch of sentences with a single chat-completion call.

    The sentences are sent as ``{"sentences": batch}`` in the user message and
    the model is asked (via ``response_format``) to answer with a JSON object.

    Args:
        client: OpenAI client used to issue the request.
        batch: Sentences to classify.

    Returns:
        One dict per usable item in the reply, with keys ``"text"`` (stripped
        string), ``"label"`` (one of ``"Premise"``, ``"Conclusion"``,
        ``"Non-argumentative"``) and ``"confidence"`` (float clamped to
        [0, 1], or ``None`` when missing or not a number). The list is not
        guaranteed to contain every sentence of ``batch``.

    Raises:
        JsonShapeError: if the reply is empty, has no ``"items"`` list, or
            yields no usable item (after retries are exhausted).
        json.JSONDecodeError: if the reply is not valid JSON (after retries).
        Exception: any API error, re-raised as-is (``reraise=True``).
    """
    resp = client.chat.completions.create(
        model=MODEL, temperature=TEMPERATURE,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps({"sentences": batch}, ensure_ascii=False)}
        ],
    )
    choices = resp.choices
    content = choices[0].message.content if choices else None
    if not content:
        # Covers a missing choice and a None/empty message body; without this
        # json.loads would fail with an unrelated TypeError.
        raise JsonShapeError("Empty response content")

    data = json.loads(content)
    if not isinstance(data, dict) or not isinstance(data.get("items"), list):
        raise JsonShapeError("Missing 'items' list in response")

    canonical = {"premise": "Premise", "conclusion": "Conclusion",
                 "non-argumentative": "Non-argumentative"}
    out = []
    for it in data["items"]:
        if not isinstance(it, dict):
            # Ignore malformed entries rather than failing the whole batch.
            continue
        raw_text = it.get("text")
        # A JSON null must not turn into the literal string "None".
        text = "" if raw_text is None else str(raw_text).strip()
        if not text:
            continue
        # Labels are matched case-insensitively; anything unrecognised falls
        # back to "Non-argumentative".
        label = canonical.get(str(it.get("label", "")).strip().lower(), "Non-argumentative")
        try:
            conf = float(it.get("confidence"))
        except (TypeError, ValueError, OverflowError):
            conf = None
        else:
            # NaN would otherwise be clamped to 1.0 by min()/max().
            conf = None if math.isnan(conf) else max(0.0, min(1.0, conf))
        out.append({"text": text, "label": label, "confidence": conf})
    if not out:
        raise JsonShapeError("Empty items")
    return out

def _align_items_to_batch(batch: List[str], items: List[Dict[str, Any]]) -> List[Any]:
    """Return, for each sentence in ``batch``, its model item or ``None``.

    Matching is attempted in this order: exact text, text compared with runs
    of whitespace collapsed, and finally -- only when the model returned
    exactly as many items as there are sentences -- the item at the same
    position, provided that item was not already matched by text.
    """
    def squash(text: str) -> str:
        return " ".join(text.split())

    exact: Dict[str, int] = {}
    loose: Dict[str, int] = {}
    for pos, item in enumerate(items):
        exact[item["text"]] = pos
        loose[squash(item["text"])] = pos

    positions: List[Any] = []
    claimed = set()
    for sentence in batch:
        pos = exact.get(sentence)
        if pos is None:
            pos = loose.get(squash(sentence))
        positions.append(pos)
        if pos is not None:
            claimed.add(pos)

    if len(items) == len(batch):
        for idx, pos in enumerate(positions):
            if pos is None and idx not in claimed:
                positions[idx] = idx
                claimed.add(idx)

    return [None if pos is None else items[pos] for pos in positions]


def classify_sentences(
    files: List[str],
    out_csv: str = "sentence_classifications.csv",
    batch_size: int = 25,
):
    """
    Notebook-friendly driver: load sentences, classify them in batches and
    write the result to a CSV file.

    Example:
      classify_sentences(
          files=["/kaggle/input/your-dataset/sentencePair.txt",
                 "/kaggle/input/your-dataset/sentencePair_neg.txt"],
          out_csv="/kaggle/working/sentence_classifications.csv",
          batch_size=25
      )

    Args:
        files: Input files handed to ``_load_sentences_from_pairs``.
        out_csv: Destination path of the CSV output.
        batch_size: Number of sentences sent per request; must be >= 1.

    Returns:
        DataFrame with columns ``text``, ``label`` and ``confidence``, one row
        per loaded sentence, in load order. ``text`` is always the original
        input sentence. Sentences the model output could not be matched to
        get label ``"Non-argumentative"`` and confidence ``None``; their count
        is printed as a warning.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would reject 0 with an opaque message and silently produce
        # nothing for negative values.
        raise ValueError("batch_size must be a positive integer")

    client = OpenAI()  # requires OPENAI_API_KEY in env
    sentences = _load_sentences_from_pairs(files)
    print(f"Loaded {len(sentences)} unique sentences from {len(files)} file(s).")

    total_batches = math.ceil(len(sentences) / batch_size)
    results = []
    unmatched = 0
    # Simple batching (no cache for brevity here; add if you like)
    for batch_no, start in enumerate(range(0, len(sentences), batch_size), start=1):
        batch = sentences[start:start + batch_size]
        print(f"Batch {batch_no} / {total_batches} ({len(batch)} sentences)")
        items = _classify_batch(client, batch)
        # The model may reorder items or echo the text slightly altered, so
        # align by text first and by position as a last resort.
        for sentence, item in zip(batch, _align_items_to_batch(batch, items)):
            if item is None:
                unmatched += 1
                results.append({"text": sentence, "label": "Non-argumentative", "confidence": None})
            else:
                results.append({"text": sentence, "label": item["label"],
                                "confidence": item["confidence"]})

    if unmatched:
        print(f"Warning: {unmatched} sentence(s) had no matching item in the model output "
              "and were defaulted to 'Non-argumentative'.")

    df = pd.DataFrame(results, columns=["text","label","confidence"])
    df.to_csv(out_csv, index=False)
    print(f"Saved {len(df)} rows to {out_csv}")
    return df

In [None]:
# Classify the sanity-check file in a single batch and preview the first rows.
# NOTE(review): the recorded output reports 51 unique sentences for a file
# named sanity_50_sentences.csv. The loader reads files as header-less,
# tab-separated text, so a header row may have been counted as a sentence --
# confirm against the file before trusting the row count.
df = classify_sentences(
    files=[
        # Single leading slash: the original "//kaggle/..." was a typo.
        "/kaggle/input/new-data/sanity_50_sentences.csv",
    ],
    out_csv="/kaggle/working/sentence_classifications.csv",
    batch_size=51
)
df.head()


Loaded 51 unique sentences from 1 file(s).
Batch 1 / 1 (51 sentences)
