In [2]:
import pandas as pd
import csv
import re
import json

In [3]:
# --- Function to clean " | " formatting ---
# A pipe together with any whitespace and sentence punctuation hugging it on either side.
_PIPE_WITH_PADDING = re.compile(r'\s*[\.,;:!?]*\s*\|\s*[\.,;:!?]*\s*')


def clean_pipe_spacing(text):
    """Normalise every pipe separator in ``text`` to exactly ``' | '``.

    Whitespace (including newlines) and the punctuation characters
    ``. , ; : ! ?`` directly adjacent to a ``|`` are dropped and replaced by a
    single space on each side of the pipe.

    Args:
        text: Value to normalise. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The normalised string, or the original object for non-string input.
    """
    if isinstance(text, str):
        return _PIPE_WITH_PADDING.sub(' | ', text)
    return text

In [4]:
# --- Function to load and clean CSV into a DataFrame ---
def load_and_clean_csv(filepath):
    """Read a ``;``-delimited CSV file and return its rows as a cleaned DataFrame.

    Every cell has embedded line breaks (CRLF, CR or LF) replaced by ``' | '``
    and tab characters removed. Completely blank lines are skipped.

    Args:
        filepath: Path of the UTF-8 encoded CSV file. The first row is used as
            the header.

    Returns:
        A DataFrame of string cells. The column names are the header truncated
        to the width of the first data row. A file that holds only a header
        yields an empty DataFrame carrying the full header.

    Raises:
        ValueError: If the file contains no header row at all.
    """
    cleaned_rows = []
    # newline='' is what the csv module expects: it keeps line breaks inside
    # quoted cells intact so they can be rewritten below.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=';', quotechar='"')
        headers = next(reader, None)
        if headers is None:
            raise ValueError(f"CSV file has no header row: {filepath}")

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to keep.
                continue
            cleaned_rows.append([
                # A CRLF pair counts as ONE break, so it becomes a single ' | '.
                re.sub(r'\r\n|\r|\n', ' | ', cell).replace('\t', '')
                for cell in row
            ])

    if not cleaned_rows:
        return pd.DataFrame(columns=headers)

    return pd.DataFrame(cleaned_rows, columns=headers[:len(cleaned_rows[0])])

In [5]:
# --- Load both IBM3 and IBM4 ---
df_ibm3 = load_and_clean_csv("IBM3_Only_Preprocessed.csv")
df_ibm4 = load_and_clean_csv("IBM4_Only_Preprocessed.csv")

# --- Combine both datasets ---
df_combined = pd.concat([df_ibm3, df_ibm4], ignore_index=True)


def _bullet_issues(values):
    """Render the non-null issues of one case as a '- ' bullet list, one per line."""
    return "\n- " + "\n- ".join(
        clean_pipe_spacing(issue) for issue in values.dropna().astype(str)
    )


def _join_unique(values):
    """Join the distinct non-null values of one case into a comma-separated string.

    The values are sorted because plain set iteration order for strings can
    differ between interpreter runs, which would make the exported text
    non-reproducible.
    """
    return ", ".join(sorted(set(values.dropna().astype(str))))


# --- Group by Case-ID and aggregate fields ---
grouped = df_combined.groupby("Case-ID").agg({
    "Issues": _bullet_issues,
    "Source": _join_unique,
    "Resolution_status": _join_unique,
}).reset_index()

# --- Format final text for Doccano ---
# clean_pipe_spacing runs once more over the assembled text; note that its
# pattern also swallows newlines that sit directly next to a '|'.
grouped["text"] = grouped.apply(
    lambda row: clean_pipe_spacing(
        f"Case-ID: {row['Case-ID']}\nSources: {row['Source']}\nIssues:{row['Issues']}\nResolution Status: {row['Resolution_status']}"
    ),
    axis=1
)

In [28]:
# --- Prepare for Doccano import ---
doccano_ready = grouped[["text"]].copy()
# Give every row its OWN empty label list; `[[]] * n` would put the same list
# object in every row, so mutating one label would silently change all of them.
doccano_ready["label"] = [[] for _ in range(len(doccano_ready))]

# --- Export to JSONL ---
# One JSON object per line: {"text": ..., "label": []}
doccano_ready.to_json("doccano_ibm3_ibm4_grouped_ready.jsonl", orient="records", lines=True)

In [None]:
# --- Stats ---
# Sanity check: the number of grouped rows should equal the number of distinct
# Case-IDs in the combined input.
print(" Combined Doccano file saved as: doccano_ibm3_ibm4_grouped_ready.jsonl")
grouped_case_count = len(doccano_ready)
print(f"Total grouped cases: {grouped_case_count}")
unique_case_id_count = df_combined["Case-ID"].nunique()
print(f"Unique Case-IDs in combined data: {unique_case_id_count}")

✅ Combined Doccano file saved as: doccano_ibm3_ibm4_grouped_ready.jsonl
Total grouped cases: 186
Unique Case-IDs in combined data: 186


In [None]:
# --- List of selected 20 case IDs for few-shot prompting ---

selected_case_ids = [
    "IBM3_C4_16-Oct-13_16-Oct-13",
    "IBM3_C26_21-Sep-16_22-Sep-16",
    "IBM3_C29_05-Jul-17_12-Jul-17",
    "IBM3_C38_18-Nov-19_18-Nov-19",
    "IBM3_C45_26-Oct-20_26-Oct-20",
    "IBM3_C46_26-Oct-20_26-Oct-20",
    "IBM3_C53_31-Mar-21_31-Mar-21",
    "IBM3_C55_10-Aug-21_10-Aug-21",
    "IBM3_C60_29_Mar_22-15-Apr-22",
    "IBM3_C74_12-Sep-23_12-Sep-23",
    "IBM3_C9_28-Jul-14_28-Jul-14",
    "IBM4_C37_25-Jun-18_25-Jun-18",
    "IBM4_C59_05-Aug-21_05-Aug-21",
    "IBM4_C62_23-Dec-21_23-Dec-21",
    "IBM4_C74_01-Dec-22_01-Dec-22",
    "IBM4_C77_18-Mar-23_19-Apr-23",
    "IBM4_C79_13-Jun-23_13-Jun-23",
    "IBM4_C91_23-Sept-24_01-Oct-24",
    "IBM3_C15_23-Feb-15_23-Feb-15",
    "IBM4_C31_27-Mar-17_27-Mar-17"
]

# Filter the grouped DataFrame (rows keep the order of `grouped`, not of the list above)
selected_for_prompting = grouped[grouped["Case-ID"].isin(selected_case_ids)].copy()

# Print which case IDs were not found
missing = set(selected_case_ids) - set(selected_for_prompting["Case-ID"].tolist())
print(" Missing case(s):", missing)

# Save directly as plain string objects in JSONL (not as dict with keys)
selected_output_filename = "20_cases_for_baml_fewshot.jsonl"
with open(selected_output_filename, "w", encoding="utf-8") as f:
    for case_text in selected_for_prompting["text"]:
        json.dump(case_text, f, ensure_ascii=False)
        f.write("\n")

# Report the number of rows actually written rather than assuming all 20 were found.
print(f" Exported {len(selected_for_prompting)} cases for few-shot prompting to: {selected_output_filename}")



❗ Missing case(s): set()
✅ Exported 20 cases for few-shot prompting to: 20_cases_for_baml_fewshot.jsonl


In [9]:
# Preparing the remaining cases for few-shot prompting

# --- IDs explicitly excluded (20 evaluation cases) ---
excluded_evaluation_ids = {
    "IBM3_C4_16-Oct-13_16-Oct-13",
    "IBM3_C26_21-Sep-16_22-Sep-16",
    "IBM3_C29_05-Jul-17_12-Jul-17",
    "IBM3_C38_18-Nov-19_18-Nov-19",
    "IBM3_C45_26-Oct-20_26-Oct-20",
    "IBM3_C46_26-Oct-20_26-Oct-20",
    "IBM3_C53_31-Mar-21_31-Mar-21",
    "IBM3_C55_10-Aug-21_10-Aug-21",
    "IBM3_C60_29_Mar_22-15-Apr-22",
    "IBM3_C74_12-Sep-23_12-Sep-23",
    "IBM3_C9_28-Jul-14_28-Jul-14",
    "IBM4_C37_25-Jun-18_25-Jun-18",
    "IBM4_C59_05-Aug-21_05-Aug-21",
    "IBM4_C62_23-Dec-21_23-Dec-21",
    "IBM4_C74_01-Dec-22_01-Dec-22",
    "IBM4_C77_18-Mar-23_19-Apr-23",
    "IBM4_C79_13-Jun-23_13-Jun-23",
    "IBM4_C91_23-Sept-24_01-Oct-24",
    "IBM3_C15_23-Feb-15_23-Feb-15",
    "IBM4_C31_27-Mar-17_27-Mar-17"
}

# --- ALSO exclude the 5 few-shot examples used (recommended) ---
excluded_fewshot_example_ids = {
    "IBM3_C21_19-Jul-16_01-Aug-16",
    "IBM3_C22_03-Aug-16_04-Aug-16",
    "IBM4_C29_06-Dec-16_06-Dec-16",
    "IBM4_C41_04-Jul-18_04-Jul-18",
    "IBM4_C51_03-Jul-20_03-Jul-20"
}

# Combine both sets
all_excluded_ids = excluded_evaluation_ids | excluded_fewshot_example_ids

# A mistyped ID would exclude nothing and let that case slip into the remaining
# pool, so flag any excluded ID that does not exist in the grouped data.
unknown_excluded_ids = all_excluded_ids - set(grouped["Case-ID"])
if unknown_excluded_ids:
    print("Excluded IDs not found in grouped data:", sorted(unknown_excluded_ids))

# --- Keep cases that are not excluded AND whose resolution status is 'resolved' or 'unknown' ---
# The status column holds the comma-joined statuses of a case, so only cases
# whose joined value is exactly one of the two allowed words pass this filter.
is_not_excluded = ~grouped["Case-ID"].isin(all_excluded_ids)
has_allowed_status = (
    grouped["Resolution_status"].str.strip().str.lower().isin(["resolved", "unknown"])
)
remaining_cases = grouped[is_not_excluded & has_allowed_status].copy()

# --- Export remaining cases to a new file ---
# Each line is the case text as a bare JSON string (no enclosing object).
output_filename = "remaining_cases_for_baml_fewshot.jsonl"
with open(output_filename, "w", encoding="utf-8") as f:
    for case_text in remaining_cases["text"]:
        json.dump(case_text, f, ensure_ascii=False)
        f.write("\n")

print(f"Exported remaining cases to: {output_filename}")
print(f"Total remaining cases: {len(remaining_cases)}")
print(f"Excluded evaluation cases: {len(excluded_evaluation_ids)}")
print(f"Excluded few-shot example cases: {len(excluded_fewshot_example_ids)}")

Exported remaining cases to: remaining_cases_for_baml_fewshot.jsonl
Total remaining cases: 87
Excluded evaluation cases: 20
Excluded few-shot example cases: 5
