In [3]:
import pandas as pd
import ast
import re

src = "C:\\Users\\HP\BLP Task 2\\test_v1.xlsx"
out = "tes_translated_generated_v2.xlsx"

df = pd.read_excel(src)

# ---------- Helpers ----------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    # Remove any leading/trailing runs of backslashes
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _simple_unescape(s: str) -> str:
    # Be conservative: only unescape the common sequences we expect to see
    s = s.replace(r'\n', '\n').replace(r'\t', '\t').replace(r'\"', '"').replace(r"\'", "'")
    # Remove spurious backslashes before brackets/commas/quotes that often appear in dirty CSV/Excel exports
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)
    return s

def clean_test_list_text(raw) -> str:
    """Normalize the raw test_list cell into something parseable or at least searchable."""
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    s = _simple_unescape(s).strip()
    # Sometimes cells come double-quoted twice, strip again
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try to literal_eval a Python list from string. Return list[str] or None."""
    try:
        obj = ast.literal_eval(s)
        # If it's a string that itself looks like a list, try once more
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        return None
    return None

def regex_extract_first_assert(s: str) -> str:
    """
    Fallback: extract the first 'assert ...' using regex.
    We grab up to a reasonable delimiter: newline, comma closing a quoted item, or list/tuple close.
    """
    if not s:
        return ""
    # Make sure quotes are balanced-ish so regex works reasonably
    # Capture lazily until a likely boundary
    pat = r'assert\s.*?(?=(?<!\\)\n|(?<!\\)\r|(?<!\\)\'\s*,|(?<!\\)\"\s*,|(?<!\\)\],|(?<!\\)\)|(?<!\\)\]|$)'
    m = re.search(pat, s, flags=re.S)
    if m:
        candidate = m.group(0).strip()
        # Trim dangling delimiters/quotes/brackets
        candidate = re.sub(r'[\s,\'\"\]\)]*$', '', candidate).strip()
        return candidate
    return ""

def extract_first_assert(raw) -> str:
    """
    Robust extractor:
    1) Clean text
    2) Try literal_eval as a list and take the first string
    3) If that fails/empty, regex fallback to find first assert
    4) Final cleanup of leading/trailing quotes/backslashes
    """
    s = clean_test_list_text(raw)
    # First attempt: parse list
    lst = literal_eval_list(s)
    first = ""
    if lst:
        # Find the first non-empty string item that contains 'assert'
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                first = item.strip()
                break
        # If first item is empty string due to stray slashes, try to clean it and keep if becomes non-empty
        if not first and isinstance(lst[0], str):
            cleaned0 = _strip_wrapping_quotes(_strip_wrapping_backslashes(_simple_unescape(lst[0]))).strip()
            if 'assert' in cleaned0:
                first = cleaned0

    # Fallback to regex over the raw cleaned string if still empty
    if not first:
        first = regex_extract_first_assert(s)

    # Final touch-ups
    first = _strip_wrapping_quotes(first)
    first = _strip_wrapping_backslashes(first).strip()

    # Some rows may still carry trailing ellipses due to truncation; accept as-is if it starts with assert
    if first and not first.startswith("assert"):
        # Occasionally the word assert is missing at the beginning due to trimming; try to recover
        # by finding 'assert' inside and slicing from there.
        pos = first.find("assert")
        if pos != -1:
            first = first[pos:].strip()

    # Ensure we only keep a single line if multiple lines leaked in
    first = first.splitlines()[0].strip() if first else first
    return first

def strip_and_get_original_example(text):
    """Return text without Example line and also capture the old Example line (if any)."""
    if pd.isna(text):
        return "", None
    lines = str(text).splitlines()
    kept, old_example = [], None
    for line in lines:
        check = line.strip().strip('"').strip("'").strip()
        if re.match(r'(?i)^example\s*:', check):
            old_example = check  # store original Example
            continue
        kept.append(line)
    cleaned = "\n".join(kept).strip()
    cleaned = cleaned.strip('"').strip("'")
    return cleaned, old_example

# ---------- Transformation ----------
new_texts = []
first_asserts = []

for _, row in df.iterrows():
    base, old_ex = strip_and_get_original_example(row.get('instruction_en', ''))
    first = extract_first_assert(row.get('test_list', ''))
    first_asserts.append(first)

    if first:  # Use extracted assert
        new_texts.append(f"{base}\nExample: {first}")
    else:      # Fallback to old example if exists
        if old_ex:
            new_texts.append(f"{base}\n{old_ex}")
        else:
            new_texts.append(base)

df['first_assert'] = first_asserts
df['instruction_en_new'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


  src = "C:\\Users\\HP\BLP Task 2\\test_v1.xlsx"


File saved to: tes_translated_generated_v2.xlsx


In [2]:
import pandas as pd
import ast
import re

# --- Paths ---
src = "C:\\Users\\HP\BLP Task 2\\test_v1.xlsx"
out = "test_translated_generated_v1.xlsx"

df = pd.read_excel(src)

# ---------- Helpers (kept aligned with v3 behavior) ----------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    # Remove any leading/trailing runs of backslashes
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _simple_unescape(s: str) -> str:
    # Be conservative: only unescape the common sequences we expect to see
    s = s.replace(r'\n', '\n').replace(r'\t', '\t').replace(r'\"', '"').replace(r"\'", "'")
    # Remove spurious backslashes before brackets/commas/quotes that often appear in dirty CSV/Excel exports
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)
    return s

def clean_test_list_text(raw) -> str:
    """Normalize the raw test_list cell into something parseable or at least searchable (same spirit as v3)."""
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    s = _simple_unescape(s).strip()
    # Sometimes cells come double-quoted twice, strip again
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try to literal_eval a Python list from string. Return list[str] or None."""
    try:
        obj = ast.literal_eval(s)
        # If it's a string that itself looks like a list, try once more
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        return None
    return None

def regex_extract_all_asserts(s: str):
    """
    Fallback: extract ALL 'assert ...' using the v3-style boundary pattern.
    Capture lazily until a likely boundary: newline, comma closing a quoted item, or list/tuple close.
    """
    if not s:
        return []
    pat = r'assert\s.*?(?=(?<!\\)\n|(?<!\\)\r|(?<!\\)\'\s*,|(?<!\\)\"\s*,|(?<!\\)\],|(?<!\\)\)|(?<!\\)\]|$)'
    matches = re.findall(pat, s, flags=re.S)
    out = []
    for m in matches:
        candidate = m.strip()
        # Trim dangling delimiters/quotes/brackets
        candidate = re.sub(r'[\s,\'\"\]\)]*$', '', candidate).strip()
        # Final touch-ups exactly like v3 spirit
        candidate = _strip_wrapping_quotes(candidate)
        candidate = _strip_wrapping_backslashes(candidate).strip()
        if candidate and not candidate.startswith("assert"):
            pos = candidate.find("assert")
            if pos != -1:
                candidate = candidate[pos:].strip()
        if candidate:
            out.append(candidate.splitlines()[0].strip())
    return out

def extract_all_asserts(raw):
    """
    Robust extractor (v3-style, but for ALL asserts):
      1) Clean text
      2) Try literal_eval as a list -> collect strings containing 'assert'
      3) If that fails/empty, regex fallback to find all asserts
      4) Final cleanup of leading/trailing quotes/backslashes (already done)
    """
    s = clean_test_list_text(raw)

    # Attempt 1: parse list
    found = []
    lst = literal_eval_list(s)
    if lst:
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                it = item.strip()
                # v3-style cleanup
                it = _strip_wrapping_quotes(it)
                it = _strip_wrapping_backslashes(_simple_unescape(it)).strip()
                if it and not it.startswith("assert"):
                    pos = it.find("assert")
                    if pos != -1:
                        it = it[pos:].strip()
                if it:
                    it = it.splitlines()[0].strip()
                    if it.startswith('assert'):
                        found.append(it)

        # If the first element produced an empty string due to stray slashes, try cleaning it once more
        if not found and len(lst) > 0 and isinstance(lst[0], str):
            cleaned0 = _strip_wrapping_quotes(_strip_wrapping_backslashes(_simple_unescape(lst[0]))).strip()
            if 'assert' in cleaned0:
                pos = cleaned0.find("assert")
                if pos != -1:
                    cleaned0 = cleaned0[pos:].strip()
                cleaned0 = cleaned0.splitlines()[0].strip()
                if cleaned0.startswith('assert'):
                    found.append(cleaned0)

    # Attempt 2: regex over cleaned string
    if not found:
        found = regex_extract_all_asserts(s)

    # Deduplicate while preserving order
    seen = set()
    uniq = []
    for a in found:
        if a not in seen:
            seen.add(a)
            uniq.append(a)
    return uniq

# ---------- Transformation: keep Example as-is, append up to 3 asserts ----------
new_texts = []
all_asserts_col = []

for _, row in df.iterrows():
    base = str(row.get('instruction_en', '')).strip()  # KEEP as-is (including the Example line)
    asserts = extract_all_asserts(row.get('test_list', ''))
    all_asserts_col.append("\n".join(asserts))

    if asserts:
        appended = base + "\n" + "\n".join(asserts[:3])
    else:
        appended = base  # if no asserts, keep original unchanged

    new_texts.append(appended)

df['all_asserts_extracted'] = all_asserts_col
df['instruction_en_appended'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


  src = "C:\\Users\\HP\BLP Task 2\\test_v1.xlsx"


File saved to: test_translated_generated_v1.xlsx


In [4]:
import pandas as pd
import ast
import re
import json

# --- Paths ---
src = "C:\\Users\\HP\\BLP Task 2\\test_v1.xlsx"
out = "test_translated_generated_v4.xlsx"

df = pd.read_excel(src)

# ---------- Helpers (kept aligned with v3 behavior, but less destructive) ----------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    # Remove any leading/trailing runs of backslashes (common in messy exports)
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _simple_unescape(s: str) -> str:
    # Be conservative: only unescape common sequences we expect to see
    s = s.replace(r'\n', '\n').replace(r'\t', '\t').replace(r'\"', '"').replace(r"\'", "'")
    # Remove spurious backslashes before brackets/commas/quotes that often appear in dirty CSV/Excel exports
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)
    return s

def clean_test_list_text(raw) -> str:
    """Normalize the raw test_list cell into something parseable or at least searchable (same spirit as v3)."""
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    s = _simple_unescape(s).strip()
    # Sometimes cells come double-quoted twice, strip again
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try to literal_eval a Python list/tuple from string. Return list[str] or None."""
    try:
        obj = ast.literal_eval(s)
        # If it's a string that itself looks like a list, try once more
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        pass
    # Try JSON if literal_eval failed but it looks JSON-ish
    try:
        if s.strip().startswith('['):
            obj = json.loads(s)
            if isinstance(obj, list):
                return obj
    except Exception:
        pass
    return None

def _clean_assert_text(it: str) -> str:
    """
    Standardize one assert string:
      - ensure it starts with 'assert'
      - strip only an outer closing quote if present (e.g., "...']" from list wrapping)
    """
    it = it.strip()
    # If not starting with assert but contains it, cut from first occurrence
    if not it.startswith("assert"):
        pos = it.find("assert")
        if pos != -1:
            it = it[pos:].strip()

    # Remove a single trailing outer quote that belongs to a surrounding list item,
    # but DO NOT remove internal brackets/parentheses from the assert itself.
    # Examples to clean:  "assert ...']"  "assert ...',"  'assert ...")'
    it = re.sub(r'(?<!\\)(["\'])\s*(?=[\]\)\}],?|$)$', '', it)

    return it.strip()

def regex_extract_all_asserts(s: str):
    """
    Safer fallback: grab 'assert ...' until newline, semicolon, or EOS.
    Then, only strip an outer closing quote if present.
    """
    if not s:
        return []

    # Match 'assert' followed by anything up to a newline, semicolon, or end of string.
    # This avoids stopping at internal ] or ) which are part of the expression.
    matches = re.findall(r'assert[^\n\r;]*', s, flags=re.S)
    out = []
    for m in matches:
        candidate = _clean_assert_text(m)
        if candidate.startswith("assert"):
            out.append(candidate)
    return out

def extract_all_asserts(raw):
    """
    Robust extractor (v3-style, but for ALL asserts), fixed:
      1) Clean text
      2) Try literal_eval / JSON as a list -> collect strings containing 'assert'
      3) If that fails/empty, regex fallback to find all asserts (line-based)
      4) Deduplicate while preserving order
    """
    s = clean_test_list_text(raw)

    found = []

    # Attempt 1: parse list
    lst = literal_eval_list(s)
    if lst:
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                it = _simple_unescape(item)
                it = _strip_wrapping_quotes(it)
                it = _strip_wrapping_backslashes(it)
                it = _clean_assert_text(it)
                if it.startswith('assert'):
                    found.append(it)

        # If still nothing and first element is a string, try one more aggressive clean
        if not found and len(lst) > 0 and isinstance(lst[0], str):
            cleaned0 = _clean_assert_text(_simple_unescape(lst[0]))
            if cleaned0.startswith('assert'):
                found.append(cleaned0)

    # Attempt 2: regex over cleaned string
    if not found:
        found = regex_extract_all_asserts(s)

    # Deduplicate while preserving order
    seen = set()
    uniq = []
    for a in found:
        if a not in seen:
            seen.add(a)
            uniq.append(a)
    return uniq

# ---------- Transformation: keep Example as-is, append up to 3 asserts ----------
new_texts = []
all_asserts_col = []

for _, row in df.iterrows():
    base = str(row.get('instruction_en', '')).strip()  # KEEP as-is (including the Example line)
    asserts = extract_all_asserts(row.get('test_list', ''))
    all_asserts_col.append("\n".join(asserts))

    if asserts:
        appended = base + "\n" + "\n".join(asserts[:3])
    else:
        appended = base  # if no asserts, keep original unchanged

    new_texts.append(appended)

df['all_asserts_extracted'] = all_asserts_col
df['instruction_en_appended'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


File saved to: test_translated_generated_v4.xlsx


In [5]:
import pandas as pd
import ast
import json
import re

# --- Paths (edit src as needed) ---
# Example (Windows):
# src = r"C:\Users\HP\BLP Task 2\test_v1.xlsx"
# Example (this session / Colab):
# src = "/mnt/data/test_v1.xlsx"
src = "C:\\Users\\HP\\BLP Task 2\\test_v1.xlsx"
out = "test_translated_generated_v6.xlsx"

df = pd.read_excel(src)

# ---------------- Helpers ----------------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _simple_unescape(s: str) -> str:
    s = s.replace(r'\n', '\n').replace(r'\t', '\t').replace(r'\"', '"').replace(r"\'", "'")
    # remove spurious escapes before common punctuation/brackets
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)
    return s

def clean_test_list_text(raw) -> str:
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    s = _simple_unescape(s).strip()
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try Python literal, then JSON; return list or None."""
    # Python literal (e.g., "['assert ...', 'assert ...']")
    try:
        obj = ast.literal_eval(s)
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        pass
    # JSON (e.g., ["assert ...", "assert ..."])
    try:
        if s.strip().startswith('['):
            obj = json.loads(s)
            if isinstance(obj, list):
                return obj
    except Exception:
        pass
    return None

def _clean_assert_text(it: str) -> str:
    """Normalize one assert string without deleting internal brackets/quotes."""
    it = _simple_unescape(_strip_wrapping_backslashes(_strip_wrapping_quotes(str(it)))).strip()

    # If 'assert' is inside, keep from first 'assert'
    if not it.startswith("assert"):
        pos = it.find("assert")
        if pos != -1:
            it = it[pos:].strip()

    # Remove exactly one outer trailing quote that belongs to surrounding list, not inner code
    it = re.sub(r'(?<!\\)(["\'])\s*(?=[\]\)\}],?|$)$', '', it)

    # Drop trailing comma that belongs to list, not the code
    it = re.sub(r',$', '', it).strip()

    # One-line normalize (common for test cells)
    return it.splitlines()[0].strip()

def regex_extract_all_asserts(s: str):
    """
    Fallback extractor: capture each 'assert' statement up to newline/semicolon/EOF.
    Avoids stopping at internal ] or ) which are legitimate inside the expression.
    """
    if not s:
        return []
    # Allow preceding punctuation/quotes; we start our capture from 'assert'
    candidates = re.findall(r'assert[^\n\r;]*', s, flags=re.S)
    out = []
    for m in candidates:
        a = _clean_assert_text(m)
        if a.startswith("assert"):
            out.append(a)
    return out

def extract_all_asserts(raw):
    """
    Robust extractor for ALL asserts in a cell:
      1) Clean text
      2) Try parsing list/tuple/JSON and collect strings with 'assert'
      3) If none found, regex fallback on the cleaned string
      4) Deduplicate while preserving order
    """
    s = clean_test_list_text(raw)

    found = []
    lst = literal_eval_list(s)
    if lst:
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                a = _clean_assert_text(item)
                if a.startswith("assert"):
                    found.append(a)

        # If still empty, try a second pass on the joined string (some lists contain messy quoting)
        if not found:
            joined = " ".join(map(str, lst))
            found = regex_extract_all_asserts(clean_test_list_text(joined))
    else:
        # Not a clean list — use regex fallback
        found = regex_extract_all_asserts(s)

    # Deduplicate, keep order
    seen = set()
    uniq = []
    for a in found:
        if a not in seen:
            seen.add(a)
            uniq.append(a)
    return uniq

# -------------- Transformation --------------
new_texts = []
all_asserts_col = []

for _, row in df.iterrows():
    base = str(row.get('instruction_en', '')).strip()
    asserts = extract_all_asserts(row.get('test_list', ''))
    all_asserts_col.append("\n".join(asserts))
    appended = base + ("\n" + "\n".join(asserts[:3]) if asserts else "")
    new_texts.append(appended)

df['all_asserts_extracted'] = all_asserts_col
df['instruction_en_appended'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


File saved to: test_translated_generated_v5.xlsx


In [7]:
import pandas as pd
import ast
import json
import re

# --- Paths (edit src as needed) ---
# Example (Windows):
# src = r"C:\Users\HP\BLP Task 2\test_v1.xlsx"
# Example (this session):
# src = "/mnt/data/test_v1.xlsx"
src = r"C:\Users\HP\BLP Task 2\test_v1.xlsx"
out = "test_translated_generated_v6.xlsx"

df = pd.read_excel(src)

# ---------------- Helpers ----------------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _simple_unescape(s: str) -> str:
    s = s.replace(r'\n', '\n').replace(r'\t', '\t').replace(r'\"', '"').replace(r"\'", "'")
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)  # spurious escapes before brackets/commas
    return s

def clean_test_list_text(raw) -> str:
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    s = _simple_unescape(s).strip()
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try Python literal, then JSON; return list or None."""
    try:
        obj = ast.literal_eval(s)
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        pass
    try:
        if s.strip().startswith('['):
            obj = json.loads(s)
            if isinstance(obj, list):
                return obj
    except Exception:
        pass
    return None

# ---------- Repair utilities ----------
_OPENERS = {'(': ')', '[': ']', '{': '}'}
_CLOSERS = {')': '(', ']': '[', '}': '{'}

def _scan_balance(s: str):
    """
    Returns (stack, in_single, in_double) after scanning s, ignoring brackets inside quotes.
    'stack' contains unmatched openers in order.
    """
    stack = []
    in_single = False
    in_double = False
    esc = False
    for ch in s:
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue
        if not in_double and ch == "'" and not in_single:
            in_single = True
            continue
        elif in_single and ch == "'":
            in_single = False
            continue
        if not in_single and ch == '"' and not in_double:
            in_double = True
            continue
        elif in_double and ch == '"':
            in_double = False
            continue
        if in_single or in_double:
            continue
        if ch in _OPENERS:
            stack.append(ch)
        elif ch in _CLOSERS:
            if stack and stack[-1] == _CLOSERS[ch]:
                stack.pop()
            else:
                # unmatched closer; mark by pushing a sentinel of the closer with '!' prefix
                stack.append('!' + ch)
    return stack, in_single, in_double

def _strip_trailing_unmatched_closers(s: str) -> str:
    """
    Remove only trailing unmatched ) ] } outside quotes (fixes extra 3rd bracket cases).
    """
    # Work from the end, skipping whitespace and commas
    i = len(s) - 1
    esc = False
    in_single = False
    in_double = False
    # First quick pass to know where quotes end
    # We'll do a backwards walk, but simple approach:
    # repeatedly remove trailing unmatched closers while scan indicates imbalance.
    while True:
        stack, in_s, in_d = _scan_balance(s)
        if any(tok.startswith('!') for tok in stack):
            # there is at least one unmatched closer somewhere; if it's trailing, drop it
            m = re.search(r'[\)\]\}]+\s*$', s)
            if m:
                # remove one trailing closer
                tail = m.group(0)
                # remove a single char closer from the tail
                for idx in range(len(tail)-1, -1, -1):
                    if tail[idx] in _CLOSERS:
                        s = s[:m.start() + idx] + tail[idx+1:]
                        break
            else:
                break
        else:
            break
    return s

def _append_needed_closers(s: str) -> str:
    """
    Append the minimal set of missing quote/bracket closers to complete the line.
    """
    # Close open quotes
    stack, in_single, in_double = _scan_balance(s)
    if in_single:
        s += "'"
    if in_double:
        s += '"'

    # Re-scan after closing quotes (so brackets can be seen correctly)
    stack, in_single, in_double = _scan_balance(s)
    # If any unmatched closers ('!') remain, try stripping trailing ones
    if any(tok.startswith('!') for tok in stack):
        s = _strip_trailing_unmatched_closers(s)
        stack, in_single, in_double = _scan_balance(s)

    # Now close remaining openers
    closers_to_add = ''.join(_OPENERS[o] for o in reversed([x for x in stack if not x.startswith('!')]))
    if closers_to_add:
        s += closers_to_add
    return s

def _ast_parses(s: str) -> bool:
    try:
        ast.parse(s)
        return True
    except Exception:
        return False

def _finalize_assert_line(s: str) -> str:
    """
    Ensure the extracted line:
      - starts with 'assert'
      - has balanced quotes/brackets
      - no trailing garbage commas/quotes/extra final bracket
    """
    s = s.strip()

    # If not starting with assert but contains it, keep from first 'assert'
    if not s.startswith("assert"):
        pos = s.find("assert")
        if pos != -1:
            s = s[pos:].strip()

    # Remove trailing list punctuation from exports (outer quote/comma/bracket)
    s = re.sub(r'\s*(["\'])\s*$', '', s)      # trailing outer quote
    s = re.sub(r'\s*,\s*$', '', s)            # trailing comma
    # Remove a single trailing unmatched closer if present (we'll re-balance anyway)
    s = re.sub(r'\s*[\)\]\}]\s*$', lambda m: '' if not _ast_parses(s) else m.group(0), s)

    # Trim to one visual line (asserts are usually single-line in the sheet)
    s = s.splitlines()[0].strip()

    # Try parse; if fails, repair by balancing quotes/brackets
    if not _ast_parses(s):
        s = _strip_trailing_unmatched_closers(s)
        s = _append_needed_closers(s)
        # If still not parsable, last resort: stop at first semicolon
        if not _ast_parses(s):
            s = re.split(r'[;\r\n]', s)[0].strip()

    return s

# ---------- Extractors ----------
def regex_extract_all_asserts(s: str):
    """
    Fallback extractor: capture 'assert' statements up to newline/semicolon/EOF,
    then repair each (handles extra trailing brackets and missing quotes).
    """
    if not s:
        return []
    candidates = re.findall(r'assert[^\n\r;]*', s, flags=re.S)
    out = []
    for m in candidates:
        a = _finalize_assert_line(_simple_unescape(_strip_wrapping_backslashes(_strip_wrapping_quotes(m))))
        if a.startswith("assert"):
            out.append(a)
    return out

def extract_all_asserts(raw):
    """
    Robust extractor for ALL asserts in a cell:
      1) Clean text
      2) Try parsing as list/tuple/JSON and collect strings with 'assert'
      3) If none found, regex fallback
      4) Normalize/repair each assert
      5) Deduplicate while preserving order
    """
    s = clean_test_list_text(raw)
    found = []

    lst = literal_eval_list(s)
    if lst:
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                a = _finalize_assert_line(item)
                if a.startswith("assert"):
                    found.append(a)
        if not found:  # messy list elements: join and fallback
            joined = " ".join(map(str, lst))
            found = regex_extract_all_asserts(clean_test_list_text(joined))
    else:
        found = regex_extract_all_asserts(s)

    # Deduplicate, keep order
    seen = set()
    uniq = []
    for a in found:
        if a not in seen:
            seen.add(a)
            uniq.append(a)
    return uniq

# -------------- Transformation --------------
new_texts = []
all_asserts_col = []

for _, row in df.iterrows():
    base = str(row.get('instruction_en', '')).strip()
    asserts = extract_all_asserts(row.get('test_list', ''))
    all_asserts_col.append("\n".join(asserts))
    appended = base + ("\n" + "\n".join(asserts[:3]) if asserts else "")
    new_texts.append(appended)

df['all_asserts_extracted'] = all_asserts_col
df['instruction_en_appended'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


File saved to: test_translated_generated_v6.xlsx


In [1]:
import pandas as pd
import ast
import json
import re

# --- Paths (edit src as needed) ---
# Example (Windows):
# src = r"C:\Users\HP\BLP Task 2\test_v1.xlsx"
# Example (this session):
# src = "/mnt/data/test_v1.xlsx"
#src = r"C:\Users\HP\BLP Task 2\test_v1.xlsx"
src = "C:\\Users\\HP\\BLP Task 2\\test_250_v2.csv"
out = "test_translated_generated_v7.xlsx"

df = pd.read_csv(src)

# ---------------- Helpers ----------------
def _strip_wrapping_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1].strip()
    return s

def _strip_wrapping_backslashes(s: str) -> str:
    # Remove runs of backslashes that wrap the WHOLE cell or item, not the ones inside code
    s = re.sub(r'^\s*\\+', '', s)
    s = re.sub(r'\\+\s*$', '', s)
    return s

def _unescape_light_for_items(s: str) -> str:
    """
    Light unescape for ASSERT *items only*:
      - Don't turn '\n' or '\t' into actual newlines/tabs (keeps asserts single-line).
      - Do unescape \" and \'
      - Do drop spurious escapes before brackets/commas (common export artifact)
    """
    s = s.replace(r'\"', '"').replace(r"\'", "'")
    s = re.sub(r'\\(?=[\[\]\(\)\{\},])', '', s)
    return s

def clean_test_list_text(raw) -> str:
    """
    Normalize the raw test_list cell into something parseable.
    IMPORTANT: DO NOT unescape \n at the cell level; it breaks literal_eval and regex boundaries.
    """
    if pd.isna(raw):
        return ""
    s = str(raw)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    # No unescape here (especially not \n or \t)
    s = _strip_wrapping_quotes(s)
    s = _strip_wrapping_backslashes(s)
    return s.strip()

def literal_eval_list(s: str):
    """Try Python literal, then JSON; return list or None."""
    try:
        obj = ast.literal_eval(s)
        if isinstance(obj, str) and (obj.strip().startswith('[') or obj.strip().startswith('(')):
            obj = ast.literal_eval(obj)
        if isinstance(obj, (list, tuple)):
            return list(obj)
    except Exception:
        pass
    try:
        if s.strip().startswith('['):
            obj = json.loads(s)
            if isinstance(obj, list):
                return obj
    except Exception:
        pass
    return None

# ---------- Repair utilities ----------
_OPENERS = {'(': ')', '[': ']', '{': '}'}
_CLOSERS = {')': '(', ']': '[', '}': '{'}

def _scan_balance(s: str):
    """
    Returns (stack, in_single, in_double) after scanning s, ignoring brackets inside quotes.
    'stack' contains unmatched openers in order; '!' + closer denotes an unmatched closer encountered.
    """
    stack = []
    in_single = False
    in_double = False
    esc = False
    for ch in s:
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue
        if not in_double and ch == "'" and not in_single:
            in_single = True
            continue
        elif in_single and ch == "'":
            in_single = False
            continue
        if not in_single and ch == '"' and not in_double:
            in_double = True
            continue
        elif in_double and ch == '"':
            in_double = False
            continue
        if in_single or in_double:
            continue
        if ch in _OPENERS:
            stack.append(ch)
        elif ch in _CLOSERS:
            if stack and stack[-1] == _CLOSERS[ch]:
                stack.pop()
            else:
                stack.append('!' + ch)
    return stack, in_single, in_double

def _strip_trailing_unmatched_closers(s: str) -> str:
    """
    Remove only trailing unmatched ) ] } outside quotes (fixes extra closing bracket cases).
    """
    while True:
        stack, _, _ = _scan_balance(s)
        if any(tok.startswith('!') for tok in stack):
            m = re.search(r'[\)\]\}]+\s*$', s)
            if m:
                # remove just one trailing closer; loop will repeat if more exist
                tail = m.group(0)
                for idx in range(len(tail) - 1, -1, -1):
                    if tail[idx] in _CLOSERS:
                        s = s[:m.start() + idx] + tail[idx + 1:]
                        break
            else:
                break
        else:
            break
    return s

def _append_needed_closers(s: str) -> str:
    """
    Append the minimal set of missing quote/bracket closers to complete the line.
    """
    stack, in_single, in_double = _scan_balance(s)
    if in_single:
        s += "'"
    if in_double:
        s += '"'
    stack, _, _ = _scan_balance(s)
    if any(tok.startswith('!') for tok in stack):
        s = _strip_trailing_unmatched_closers(s)
        stack, _, _ = _scan_balance(s)
    closers_to_add = ''.join(_OPENERS[o] for o in reversed([x for x in stack if not x.startswith('!')]))
    if closers_to_add:
        s += closers_to_add
    return s

def _ast_parses(s: str) -> bool:
    try:
        ast.parse(s)
        return True
    except Exception:
        return False

def _finalize_assert_line(s: str) -> str:
    """
    Ensure the extracted line:
      - starts with 'assert'
      - retains '\n' as literal backslash-n (not real newline)
      - has balanced quotes/brackets
      - no trailing garbage commas/quotes/extra final bracket
    """
    s = str(s)
    s = _strip_wrapping_quotes(_strip_wrapping_backslashes(s))
    s = _unescape_light_for_items(s)  # DO NOT turn '\n' into actual newline
    s = s.strip()

    if not s.startswith("assert"):
        pos = s.find("assert")
        if pos != -1:
            s = s[pos:].strip()

    # Remove trailing outer quote/comma from list wrapping
    s = re.sub(r'\s*(["\'])\s*$', '', s)
    s = re.sub(r'\s*,\s*$', '', s)

    # If there is an unmatched trailing closer, drop it; we'll re-balance later if needed
    s = re.sub(r'\s*[\)\]\}]\s*$', lambda m: '' if not _ast_parses(s) else m.group(0), s)

    # Keep it single-line (asserts are meant to be one-liners in the sheet)
    # IMPORTANT: We *did not* convert '\n' to a real newline, so this won't split inside code.
    s = s.splitlines()[0].strip()

    if not _ast_parses(s):
        s = _strip_trailing_unmatched_closers(s)
        s = _append_needed_closers(s)
        if not _ast_parses(s):
            s = re.split(r'[;\r\n]', s)[0].strip()

    return s

# ---------- Extractors ----------
def regex_extract_all_asserts(s: str):
    """
    Fallback extractor: capture 'assert' statements up to newline/semicolon/EOF,
    then repair each (handles extra trailing brackets and missing quotes).
    NOTE: We purposely do NOT unescape '\n' before regex so matches stay on one visual line.
    """
    if not s:
        return []
    candidates = re.findall(r'assert[^\n\r;]*', s, flags=re.S)
    out = []
    for m in candidates:
        a = _finalize_assert_line(m)
        if a.startswith("assert"):
            out.append(a)
    return out

def extract_all_asserts(raw):
    """
    Robust extractor for ALL asserts in a cell:
      1) Clean text (no unescape of \n at cell level)
      2) Try parsing as list/tuple/JSON and collect strings with 'assert'
      3) If none found, regex fallback
      4) Normalize/repair each assert
      5) Deduplicate while preserving order
    """
    s = clean_test_list_text(raw)
    found = []

    lst = literal_eval_list(s)
    if lst:
        for item in lst:
            if isinstance(item, str) and 'assert' in item:
                a = _finalize_assert_line(item)
                if a.startswith("assert"):
                    found.append(a)
        if not found:
            # messy list elements: join and fallback
            joined = " ".join(map(str, lst))
            found = regex_extract_all_asserts(clean_test_list_text(joined))
    else:
        found = regex_extract_all_asserts(s)

    # Deduplicate, keep order
    seen = set()
    uniq = []
    for a in found:
        if a not in seen:
            seen.add(a)
            uniq.append(a)
    return uniq

# -------------- Transformation --------------
new_texts = []
all_asserts_col = []

for _, row in df.iterrows():
    base = str(row.get('instruction_en', '')).strip()
    asserts = extract_all_asserts(row.get('test_list', ''))
    all_asserts_col.append("\n".join(asserts))
    appended = base + ("\n" + "\n".join(asserts[:3]) if asserts else "")
    new_texts.append(appended)

df['all_asserts_extracted'] = all_asserts_col
df['instruction_en_appended'] = new_texts

df.to_excel(out, index=False)
print("File saved to:", out)


File saved to: test_translated_generated_v7.xlsx
