In [2]:
import os
import re
import pandas as pd
import difflib
import nltk
from collections import Counter

# === Configurațiile căilor ===
files = [
    "textFooler/textfooler_results_all.csv",
    "PWWS/pwws_results_all.csv",
    "textBugger/textbugger_results_all.csv",
]

# 1) Încarcă și concatenează
dfs = []
for path in files:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Nu am găsit fișierul: {path}")
    df = pd.read_csv(path)
    dfs.append(df)
results = pd.concat(dfs, ignore_index=True)

# 2) Păstrează doar succesele
results = results[results["result_type"] == "Successful"].copy()

# 3) Curăță marcajele [[…]]
def clean_markers(text):
    return re.sub(r"\[\[(.*?)\]\]", r"\1", text)

results["orig_clean"] = results["original_text"].apply(clean_markers)
results["pert_clean"] = results["perturbed_text"].apply(clean_markers)

# 4) Funcție care extrage perechi de cuvinte înlocuite
def diff_pairs(orig, pert):
    o, p = orig.split(), pert.split()
    diff = list(difflib.ndiff(o, p))
    pairs = []
    i = 0
    while i < len(diff):
        if diff[i].startswith("- "):
            o_word = diff[i][2:]
            # găsește următoarea adăugare
            j = i + 1
            while j < len(diff) and not diff[j].startswith("+ "):
                j += 1
            if j < len(diff):
                p_word = diff[j][2:]
                pairs.append((o_word, p_word))
                i = j
        i += 1
    return pairs

results["diff_pairs"] = results.apply(
    lambda row: diff_pairs(row["orig_clean"], row["pert_clean"]), axis=1
)
results["num_changes"] = results["diff_pairs"].apply(len)

# 5) Statistici
#   a) medie de schimbări per exemplu
avg_changes = results["num_changes"].mean()

#   b) top 10 perechi de înlocuire
all_pairs = [pr for sub in results["diff_pairs"] for pr in sub]
top10 = Counter(all_pairs).most_common(10)

#   c) distribuție POS pentru cuvintele originale
orig_words = [o for o,_ in all_pairs]
# Asigură-te că ai descărcat tagger-ul anterior:
# nltk.download("averaged_perceptron_tagger")
pos_tags = nltk.pos_tag(orig_words)
pos_counts = Counter(tag for _, tag in pos_tags).most_common()

# 6) Afișează rezultatele
print(f"\n→ Average number of word changes per example: {avg_changes:.2f}\n")

print("→ Top 10 replacement pairs (orig → new):")
print("Count | Original → New")
print("-"*30)
for (o,p),cnt in top10:
    print(f"{cnt:5d} | {o} → {p}")
print()

print("→ POS distribution of replaced (original) words:")
print("Count | POS tag")
print("-"*25)
for tag, cnt in pos_counts:
    print(f"{cnt:5d} | {tag}")



→ Average number of word changes per example: 33.91

→ Top 10 replacement pairs (orig → new):
Count | Original → New
------------------------------
  293 | said → aforesaid
  261 | said → stated
  239 | The → Both
  177 | I → me
  173 | know → recognise
  162 | new → novel
  162 | The → Per
  154 | said. → aforesaid.
  149 | The → he
  148 | said → enunciate

→ POS distribution of replaced (original) words:
Count | POS tag
-------------------------
57589 | NN
42459 | NNP
34030 | JJ
17049 | NNS
14758 | VBD
10966 | VBG
10542 | RB
 9604 | VBP
 6153 | VBN
 5391 | VB
 4606 | VBZ
 4222 | IN
 4156 | CD
 2025 | PRP
 1855 | DT
 1413 | MD
  779 | JJS
  569 | JJR
  490 | NNPS
  396 | POS
  327 | RBR
  316 | CC
  231 | PRP$
  154 | FW
  148 | RP
  123 | TO
  122 | WRB
  118 | WP
   50 | EX
   35 | WDT
   34 | WP$
   22 | RBS
   17 | ''
   13 | PDT
   11 | $
