In [7]:
import json
from collections import Counter
import pandas as pd

# Location of the JSON Lines dataset (one JSON document per line)
file_path = "data/train_v2.jsonl"  # <-- point this at your own file

# Decode the file line by line, keeping every record that parses cleanly
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # whitespace-only lines carry no record
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            # report the malformed line and keep going with the rest
            print(f"Skipping line due to JSON error: {e}")
        else:
            data.append(record)

# Tabular view of the parsed records for the pandas-based cells below
df = pd.DataFrame(data)

In [8]:
# --- Basic stats ---
# Work from a single handle on the acronym column.
acronym_col = df['acronym']

total_entries = len(df)                      # number of rows loaded
unique_acronyms = acronym_col.nunique()      # distinct acronym values
acronym_counts = acronym_col.value_counts()  # rows per acronym, most frequent first

# Headline numbers (the second line ends with an extra blank line).
print(f"📊 Total entries: {total_entries}")
print(f"🔠 Unique acronyms: {unique_acronyms}\n")

# Full frequency table, header first.
print("🧾 Frequency of each acronym:", acronym_counts.to_string(), sep="\n")

📊 Total entries: 492
🔠 Unique acronyms: 77

🧾 Frequency of each acronym:
acronym
EF      42
AGC     37
CLE     33
AC      29
EM      24
US      24
PN      21
RFF     21
GID     18
EAS     18
TIV     17
PL      15
LGV     13
DT      11
BHR     11
DE      10
BV       9
CSS      9
NG       9
SAL      8
TR       8
DAAT     7
SE       6
PAR      6
EP       6
SGC      5
RSS      4
CCT      3
PSE      3
PLM      3
CCL      3
MG       2
BAL      2
VO       2
CRC      2
IP       2
ET       2
DRI      2
VL       2
PCD      2
PSL      2
MA       2
TM       2
RC       2
RST      1
SUD      1
DC       1
MP       1
VT       1
OP       1
RGP      1
IN       1
RN       1
PK       1
SLD      1
ZI       1
BFC      1
OA       1
RD       1
TLC      1
BB       1
LM       1
IV       1
TT       1
ST       1
STD      1
GC       1
PRG      1
ATO      1
TIP      1
VU       1
CG       1
LV       1
PAI      1
STI      1
DU       1
CMT      1


In [9]:
# --- Extract how many true options each acronym has ---
# One record per entry in `data`: its acronym plus the number of truthy
# values found in that entry's "options" mapping.
true_option_counts = [
    {
        "acronym": entry["acronym"],
        "true_options": sum(1 for flag in entry["options"].values() if flag),
    }
    for entry in data
]

true_df = pd.DataFrame(true_option_counts)



In [10]:
# Sum the per-entry true-option counts for each acronym, then order the
# result from the largest total to the smallest.
summary_df = (
    true_df
    .groupby("acronym")["true_options"]
    .sum()
    .reset_index()
    .sort_values("true_options", ascending=False)
)

print("\n✅ Number of true options per acronym:")
print(summary_df.to_string(index=False))




✅ Number of true options per acronym:
acronym  true_options
     EF            42
    AGC            37
    CLE            33
     EM            24
     US            24
     PN            23
    RFF            21
    GID            18
    EAS            18
    TIV            16
     AC            16
    LGV            13
    BHR            11
     DT            10
     NG             9
    CSS             9
     BV             8
     PL             8
     DE             7
   DAAT             7
    PAR             6
     EP             5
    SAL             4
    RSS             4
    BAL             4
    SGC             3
    PLM             3
    CCT             3
    ATO             2
     MG             2
    PSL             2
     TM             2
     SE             2
     VO             2
     VL             2
     TR             2
    TLC             2
    CRC             2
     ET             2
    PCD             2
     VU             1
     CG             1
    SLD        

In [11]:
# --- Option-level analysis ---
# Tally, across every entry in `data`, how often each option key carries a
# truthy value. Keys are counted in first-seen order.
option_true_counts = Counter(
    option
    for entry in data
    for option, is_true in entry["options"].items()
    if is_true
)

# Two-column table of (option, count), highest counts first.
option_df = pd.DataFrame(
    option_true_counts.items(), columns=["Option", "True Count"]
).sort_values("True Count", ascending=False)

print("\n🏷️ Most frequently true options:")
print(option_df.to_string(index=False))



🏷️ Most frequently true options:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Option  True Count
                       