In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!df -h


Filesystem      Size  Used Avail Use% Mounted on
overlay         113G   39G   74G  35% /
tmpfs            64M     0   64M   0% /dev
shm             5.7G     0  5.7G   0% /dev/shm
/dev/root       2.0G  1.2G  750M  62% /usr/sbin/docker-init
/dev/sda1        74G   41G   33G  56% /kaggle/input
tmpfs           6.4G   64K  6.4G   1% /var/colab
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware
drive            15G   11G  5.0G  67% /content/drive


In [3]:
!rm -rf /content/sample_data
!rm -rf /content/*.zip
!rm -rf /content/*.tar*
!rm -rf /content/outputs
!rm -rf /content/acd_model
!rm -rf /content/wandb


In [4]:
# ============================
# Cell 1 — Imports
# ============================
import os
import re
import json
import ast
import glob
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

In [5]:
from dataclasses import dataclass
from typing import Tuple, Dict
import os

@dataclass
class CFG:
    # Root folder containing ALL datasets (train/test/tasks/etc.)
    raw_data_root: str = (
        r"/content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets"
    )

    # Output folder
    out_dir: str = r"./processed"

    # Output filenames
    out_train_csv: str = "trainmaml.csv"
    out_test_csv: str  = "testmaml.csv"

    # Column candidates (auto-detect)
    sentence_cols: Tuple[str, ...] = (
        "sentence", "text", "review", "review_text", "content"
    )

    aspect_container_cols: Tuple[str, ...] = (
        "aspects", "aspect", "aspect_terms",
        "aspect_categories", "labels",
        "annotations", "opinions"
    )

    # Sentiment mapping
    sentiment_norm: Dict[str, str] = None

    # Allowed sentiments
    allowed_sentiments: Tuple[str, ...] = ("negative", "neutral", "positive")

    # Cleaning options
    drop_duplicates: bool = True
    strip_whitespace: bool = True


cfg = CFG(
    sentiment_norm={
        "neg": "negative",
        "negative": "negative",
        "-1": "negative",
        "neu": "neutral",
        "neutral": "neutral",
        "0": "neutral",
        "pos": "positive",
        "positive": "positive",
        "+1": "positive",
    }
)

# Create output directory
os.makedirs(cfg.out_dir, exist_ok=True)

print("Dataset root:", os.path.abspath(cfg.raw_data_root))
print("Output dir:", os.path.abspath(cfg.out_dir))


Dataset root: /content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets
Output dir: /content/processed


In [6]:
import os
print(os.listdir("/content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets"))


['testmaml.csv', 'trainmaml.csv']


In [7]:
# ============================
# Cell 3 — Helpers
# ============================
from typing import List, Optional, Any, Tuple


def _collect_files(path: str, exts: Tuple[str, ...] = (".csv", ".tsv", ".json", ".xml")) -> List[str]:
    path = os.path.expanduser(path)
    if os.path.isfile(path):
        return [path]
    if os.path.isdir(path):
        files = []
        for ext in exts:
            files.extend(glob.glob(os.path.join(path, f"**/*{ext}"), recursive=True))
        return sorted(list(set(files)))
    raise FileNotFoundError(f"Not found: {path}")


def _pick_first_existing(cols: Tuple[str, ...], df_cols: List[str]) -> Optional[str]:
    df_cols_l = {c.lower(): c for c in df_cols}
    for c in cols:
        if c.lower() in df_cols_l:
            return df_cols_l[c.lower()]
    return None


def _norm_sentiment(x: Any) -> Optional[str]:
    if x is None:
        return None
    s = str(x).strip().lower()
    s = re.sub(r"\s+", "", s)
    if s in cfg.sentiment_norm:
        s = cfg.sentiment_norm[s]
    return s if s in cfg.allowed_sentiments else None


In [8]:
# ============================
# Cell 4 — Parse aspect container
# ============================

def _safe_load_obj(s: str) -> Any:
    """Try JSON, then Python literal."""
    if s is None:
        return None
    if not isinstance(s, str):
        return s
    ss = s.strip()
    if not ss:
        return None
    # Try JSON first
    try:
        return json.loads(ss)
    except Exception:
        pass
    # Try Python literal next
    try:
        return ast.literal_eval(ss)
    except Exception:
        return None


def _extract_pairs_from_obj(obj: Any) -> List[Tuple[str, str]]:
    """Return list of (aspect, sentiment) pairs from common structures."""
    out: List[Tuple[str, str]] = []

    if obj is None:
        return out

    # Case A: dict with keys
    if isinstance(obj, dict):
        # Common: {"aspect": "Battery", "sentiment": "positive"}
        if "aspect" in obj and ("sentiment" in obj or "polarity" in obj):
            asp = str(obj.get("aspect")).strip()
            pol = obj.get("sentiment", obj.get("polarity"))
            poln = _norm_sentiment(pol)
            if asp and poln:
                out.append((asp, poln))
            return out

        # Common: {"@category": "FOOD#QUALITY", "@polarity": "positive"}
        if "@category" in obj and "@polarity" in obj:
            asp = str(obj.get("@category")).strip()
            poln = _norm_sentiment(obj.get("@polarity"))
            if asp and poln:
                out.append((asp, poln))
            return out

        # Common: {"Battery": "positive", "Delivery": "negative"}
        # If values look like sentiments
        all_vals = list(obj.values())
        vals_norm = [_norm_sentiment(v) for v in all_vals]
        if any(v is not None for v in vals_norm):
            for k, v in obj.items():
                poln = _norm_sentiment(v)
                if poln:
                    asp = str(k).strip()
                    if asp:
                        out.append((asp, poln))
            return out

        return out

    # Case B: list of dicts
    if isinstance(obj, list):
        for it in obj:
            out.extend(_extract_pairs_from_obj(it))
        return out

    # Case C: string fallback (try to find patterns like aspect:..., polarity:...)
    if isinstance(obj, str):
        # Very loose patterns
        # Example: "{'@category': 'FOOD#QUALITY', '@polarity': 'positive'}"
        loaded = _safe_load_obj(obj)
        if loaded is not None and loaded is not obj:
            return _extract_pairs_from_obj(loaded)

        # Regex fallback
        # Try capture (category/aspect) and (polarity/sentiment)
        m = re.findall(r"(?:category|aspect)\s*[:=]\s*['\"]?([^,'\"\}]+)['\"]?.*?(?:polarity|sentiment)\s*[:=]\s*['\"]?([^,'\"\}]+)['\"]?", obj, flags=re.I)
        for a, p in m:
            poln = _norm_sentiment(p)
            if poln:
                asp = a.strip()
                if asp:
                    out.append((asp, poln))
        return out

    return out

In [9]:
# ============================
# Cell 5 — Raw CSV loader
# ============================

def load_raw_table(path: str) -> pd.DataFrame:
    ext = os.path.splitext(path)[1].lower()
    if ext in (".csv", ".tsv"):
        sep = "\t" if ext == ".tsv" else ","
        return pd.read_csv(path, sep=sep, encoding="utf-8", engine="python")
    if ext == ".json":
        # Supports json lines or normal json
        try:
            return pd.read_json(path, lines=True)
        except Exception:
            return pd.read_json(path)
    raise ValueError(f"Unsupported table file: {path}")

In [10]:
# ============================
# Cell 6 — XML loader
# ============================

def load_semeval_xml(path: str) -> pd.DataFrame:
    import xml.etree.ElementTree as ET

    tree = ET.parse(path)
    root = tree.getroot()

    rows = []

    # Common structures seen in SemEval ABSA exports
    # <sentence> <text>...</text> <aspectCategories> <aspectCategory category="..." polarity="..."/> ...
    # Or: <Opinions> <Opinion category="..." polarity="..."/> ...

    for sent in root.iterfind(".//sentence"):
        text_el = sent.find("text")
        if text_el is None:
            continue
        sentence = (text_el.text or "").strip()
        if not sentence:
            continue

        # aspect categories
        for ac in sent.iterfind(".//aspectCategory"):
            asp = ac.attrib.get("category") or ac.attrib.get("@category")
            pol = ac.attrib.get("polarity") or ac.attrib.get("@polarity")
            poln = _norm_sentiment(pol)
            if asp and poln:
                rows.append({"sentence": sentence, "aspect": asp.strip(), "sentiment": poln})

        # opinion tags (alternative)
        for op in sent.iterfind(".//Opinion"):
            asp = op.attrib.get("category") or op.attrib.get("@category")
            pol = op.attrib.get("polarity") or op.attrib.get("@polarity")
            poln = _norm_sentiment(pol)
            if asp and poln:
                rows.append({"sentence": sentence, "aspect": asp.strip(), "sentiment": poln})

    return pd.DataFrame(rows)

In [11]:
# ============================
# Cell 7 — Table to flat converter
# ============================

def table_to_flat(df: pd.DataFrame, source_name: str = "") -> pd.DataFrame:
    df = df.copy()

    sent_col = _pick_first_existing(cfg.sentence_cols, list(df.columns))
    if not sent_col:
        raise ValueError(f"Could not detect sentence/text column in {source_name}. Columns: {list(df.columns)}")

    # Case 1: already flat
    has_aspect = "aspect" in {c.lower() for c in df.columns}
    has_sentiment = "sentiment" in {c.lower() for c in df.columns} or "polarity" in {c.lower() for c in df.columns}

    if has_aspect and has_sentiment:
        # normalize column names
        aspect_col = [c for c in df.columns if c.lower() == "aspect"][0]
        sent_like_col = [c for c in df.columns if c.lower() in ("sentiment", "polarity")][0]

        out = pd.DataFrame({
            "sentence": df[sent_col].astype(str),
            "aspect": df[aspect_col].astype(str),
            "sentiment": df[sent_like_col].apply(_norm_sentiment),
        })
        out = out.dropna(subset=["sentence", "aspect", "sentiment"]).reset_index(drop=True)
        return out

    # Case 2: aspect container column
    container_col = _pick_first_existing(cfg.aspect_container_cols, list(df.columns))
    if not container_col:
        raise ValueError(
            f"Could not detect an aspect container column in {source_name}. Columns: {list(df.columns)}\n"
            "Expected either already-flat columns (aspect + sentiment/polarity) OR a container column like 'aspects/labels/annotations'."
        )

    rows = []
    for _, r in df.iterrows():
        sentence = str(r[sent_col])
        if cfg.strip_whitespace:
            sentence = sentence.strip()
        if not sentence:
            continue

        obj = _safe_load_obj(r[container_col])
        pairs = _extract_pairs_from_obj(obj)
        for asp, pol in pairs:
            asp2 = asp.strip() if cfg.strip_whitespace else asp
            pol2 = _norm_sentiment(pol)
            if asp2 and pol2:
                rows.append({"sentence": sentence, "aspect": asp2, "sentiment": pol2})

    return pd.DataFrame(rows)


In [12]:
# ============================
# Cell 8 — Convert train
# ============================
train_files = _collect_files(cfg.raw_data_root)

print("Train files:")
for f in train_files:
    print(" -", f)

train_parts = []
for path in train_files:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".xml":
        flat = load_semeval_xml(path)
    else:
        raw = load_raw_table(path)
        flat = table_to_flat(raw, source_name=path)
    train_parts.append(flat)

train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame(
    columns=["sentence","aspect","sentiment"]
)
print("Train rows (pre-clean):", len(train_df))
train_df.head(5)


Train files:
 - /content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets/testmaml.csv
 - /content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets/trainmaml.csv
Train rows (pre-clean): 7991


Unnamed: 0,sentence,aspect,sentiment
0,"We went again and sat at the bar this time, I ...",place,neutral
1,"We went again and sat at the bar this time, I ...",food,negative
2,"The food was good, but it's not worth the wait...",food,positive
3,"The food was good, but it's not worth the wait...",service,negative
4,Waiter took our drink order and then we didn't...,staff,negative


In [13]:
# ============================
# Cell 9 — Convert test
# ============================
test_files = _collect_files(cfg.raw_data_root)

print("Test files:")
for f in test_files:
    print(" -", f)

test_parts = []
for path in test_files:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".xml":
        flat = load_semeval_xml(path)
    else:
        raw = load_raw_table(path)
        flat = table_to_flat(raw, source_name=path)
    test_parts.append(flat)

test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame(columns=["sentence","aspect","sentiment"])
print("Test rows (pre-clean):", len(test_df))
test_df.head(5)

Test files:
 - /content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets/testmaml.csv
 - /content/drive/MyDrive/Capstone Project/Datasets/Main Dataset/MAML (ASC) Datasets/trainmaml.csv
Test rows (pre-clean): 7991


Unnamed: 0,sentence,aspect,sentiment
0,"We went again and sat at the bar this time, I ...",place,neutral
1,"We went again and sat at the bar this time, I ...",food,negative
2,"The food was good, but it's not worth the wait...",food,positive
3,"The food was good, but it's not worth the wait...",service,negative
4,Waiter took our drink order and then we didn't...,staff,negative


In [14]:
# ============================
# Cell 10 — Clean
# ============================

def clean_flat(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["sentence"] = df["sentence"].astype(str)
    df["aspect"] = df["aspect"].astype(str)
    df["sentiment"] = df["sentiment"].apply(_norm_sentiment)

    if cfg.strip_whitespace:
        df["sentence"] = df["sentence"].str.strip()
        df["aspect"] = df["aspect"].str.strip()

    df = df.dropna(subset=["sentence", "aspect", "sentiment"]).reset_index(drop=True)

    # remove empty strings
    df = df[(df["sentence"].str.len() > 0) & (df["aspect"].str.len() > 0)]

    if cfg.drop_duplicates:
        df = df.drop_duplicates(subset=["sentence", "aspect", "sentiment"]).reset_index(drop=True)

    return df


train_df = clean_flat(train_df)
test_df = clean_flat(test_df)

print("Train rows (clean):", len(train_df))
print("Test rows  (clean):", len(test_df))

Train rows (clean): 7991
Test rows  (clean): 7991


In [15]:
# ============================
# Cell 11 — Stats
# ============================

def show_stats(name: str, df: pd.DataFrame) -> None:
    print(f"\n=== {name} ===")
    print("Sentiment distribution:")
    print(df["sentiment"].value_counts(dropna=False))
    print("\nTop aspects:")
    print(df["aspect"].value_counts().head(15))
    print("\nAspect × Sentiment (head):")
    print(pd.crosstab(df["aspect"], df["sentiment"]).head(10))

show_stats("TRAIN", train_df)
show_stats("TEST", test_df)


=== TRAIN ===
Sentiment distribution:
sentiment
neutral     3470
negative    2347
positive    2174
Name: count, dtype: int64

Top aspects:
aspect
food             2598
staff            1552
miscellaneous    1090
place             775
service           709
menu              551
price             360
ambience          356
Name: count, dtype: int64

Aspect × Sentiment (head):
sentiment      negative  neutral  positive
aspect                                    
ambience            109       53       194
food                292     1456       850
menu                 43      432        76
miscellaneous       218      619       253
place               161      477       137
price               126      152        82
service             363      142       204
staff              1035      139       378

=== TEST ===
Sentiment distribution:
sentiment
neutral     3470
negative    2347
positive    2174
Name: count, dtype: int64

Top aspects:
aspect
food             2598
staff            1552
mis

In [16]:
# ============================
# Cell 12 — Save
# ============================
train_out_path = os.path.join(cfg.out_dir, cfg.out_train_csv)
test_out_path  = os.path.join(cfg.out_dir, cfg.out_test_csv)

train_df.to_csv(train_out_path, index=False, encoding="utf-8")
test_df.to_csv(test_out_path, index=False, encoding="utf-8")

print("Saved:")
print(" -", os.path.abspath(train_out_path))
print(" -", os.path.abspath(test_out_path))

Saved:
 - /content/processed/trainmaml.csv
 - /content/processed/testmaml.csv


In [17]:
# ============================
# Cell 13 — Preview
# ============================
print(pd.read_csv(train_out_path).head(3))
print(pd.read_csv(test_out_path).head(3))


                                            sentence aspect sentiment
0  We went again and sat at the bar this time, I ...  place   neutral
1  We went again and sat at the bar this time, I ...   food  negative
2  The food was good, but it's not worth the wait...   food  positive
                                            sentence aspect sentiment
0  We went again and sat at the bar this time, I ...  place   neutral
1  We went again and sat at the bar this time, I ...   food  negative
2  The food was good, but it's not worth the wait...   food  positive
