In [None]:
from urllib.request import urlopen
from nltk.corpus import wordnet as wn
import nltk
import spacy
import lemminflect
import inflect
import itertools

# Fetch the WordNet corpus read through `wn` in extract_wordnet_mwes().
# quiet=True suppresses the downloader's progress output.
nltk.download('wordnet', quiet=True)
# NOTE(review): omw-1.4 is presumably needed by the installed NLTK WordNet
# reader; confirm against the NLTK version in use.
nltk.download('omw-1.4', quiet=True)

# Shared spaCy pipeline; generate_variants_for_mwe() uses it to tag each MWE.
nlp = spacy.load("en_core_web_sm")
# Shared inflect engine used for noun pluralisation.
infl = inflect.engine()

# Path of the text file main() writes, one variant per line.
OUTPUT = "mwes.txt"

def extract_wordnet_mwes():
    """Collect every WordNet lemma name that contains an underscore.

    Each such name is converted by replacing underscores with spaces and
    lower-casing the result.

    Returns:
        A set of the converted lemma names.
    """
    return {
        name.replace("_", " ").lower()
        for synset in wn.all_synsets()
        for name in synset.lemma_names()
        if "_" in name
    }

def verb_inflections(lemma):
    """Build the inflected forms of a verb, keyed by tag string.

    Every form is the first result of ``lemminflect.getInflection`` for the
    tag; when that lookup yields nothing, a regular spelling rule is applied
    instead.

    Args:
        lemma: Base form of the verb.

    Returns:
        A dict with the keys 'VB', 'VBZ', 'VBD', 'VBN' and 'VBG', inserted in
        that order.
    """

    def first_form(tag):
        # First candidate returned for the tag, or None when there is none.
        candidates = lemminflect.getInflection(lemma, tag)
        return candidates[0] if candidates else None

    def ends_in_consonant_y():
        # True for "try"-like lemmas: final 'y' preceded by a non-vowel.
        return (
            lemma.endswith('y')
            and len(lemma) > 1
            and lemma[-2] not in "aeiou"
        )

    def regular_vbz():
        # Third-person singular: try -> tries, pass -> passes, walk -> walks.
        if ends_in_consonant_y():
            return lemma[:-1] + "ies"
        if lemma.endswith(("s", "x", "z", "ch", "sh")):
            return lemma + "es"
        return lemma + "s"

    def regular_vbd():
        # Past tense: bake -> baked, try -> tried, walk -> walked.
        if lemma.endswith('e'):
            return lemma + "d"
        if ends_in_consonant_y():
            return lemma[:-1] + "ied"
        return lemma + "ed"

    def regular_vbg():
        # Gerund: die -> dying, bake -> baking, see -> seeing, walk -> walking.
        if lemma.endswith('ie'):
            return lemma[:-2] + "ying"
        if lemma.endswith('e') and not lemma.endswith('ee'):
            return lemma[:-1] + "ing"
        return lemma + "ing"

    forms = {'VB': lemma}
    forms['VBZ'] = first_form('VBZ') or regular_vbz()
    forms['VBD'] = first_form('VBD') or regular_vbd()
    # With no participle available, reuse the past-tense form.
    forms['VBN'] = first_form('VBN') or forms['VBD']
    forms['VBG'] = first_form('VBG') or regular_vbg()
    return forms

def noun_plurals(noun):
    """Return the set holding *noun* and, when it differs, its plural.

    The plural comes from the module-level inflect engine; an empty result or
    one equal to *noun* leaves just ``{noun}``.
    """
    plural = infl.plural(noun)
    if not plural or plural == noun:
        return {noun}
    return {noun, plural}

def generate_variants_for_mwe(mwe, max_variants_per_mwe=64):
    """Generate inflected surface variants of a multi-word expression.

    The expression is parsed with the module-level spaCy pipeline. Tokens
    tagged ``VB`` are expanded to their verb inflections; when every token is
    a noun, adjective or determiner, the root noun is additionally offered in
    its plural. All other tokens are kept (lower-cased, except punctuation
    and whitespace tokens which are kept verbatim). The variants are the
    cartesian product of the per-token options.

    Args:
        mwe: The expression to expand.
        max_variants_per_mwe: Stop once this many distinct variants have been
            collected. At least one variant is always produced.

    Returns:
        A set of variant strings.
    """
    doc = nlp(mwe)

    # Pluralise only in purely nominal expressions, and only the root noun.
    noun_heads = set()
    if all(t.pos_ in ("NOUN", "ADJ", "DET") for t in doc):
        noun_heads = {
            t.i for t in doc if t.pos_ == "NOUN" and t.dep_ == "ROOT"
        }

    token_options = []
    for i, token in enumerate(doc):
        if token.is_punct or token.is_space:
            token_options.append([token.text])
            continue

        if token.tag_ == "VB":
            infls = verb_inflections(token.lemma_.lower())
            ordered = (
                infls['VB'], infls['VBZ'], infls['VBD'],
                infls['VBN'], infls['VBG'],
            )
            # dict.fromkeys de-duplicates while preserving first-seen order.
            token_options.append(list(dict.fromkeys(o for o in ordered if o)))
        elif i in noun_heads:
            singular = token.text.lower()
            plural = infl.plural(singular)
            if plural and plural != singular:
                token_options.append([singular, plural])
            else:
                token_options.append([singular])
        else:
            token_options.append([token.text.lower()])

    # Re-attach each token's own trailing whitespace rather than joining on
    # a single space: the tokenizer may split one word into several tokens
    # (e.g. around apostrophes or hyphens), and a blanket " ".join would
    # insert spaces that were never in the expression.
    separators = [token.whitespace_ for token in doc]

    variants = set()
    for combo in itertools.product(*token_options):
        text = "".join(
            form + sep for form, sep in zip(combo, separators)
        ).strip()
        variants.add(text)
        if len(variants) >= max_variants_per_mwe:
            break
    return variants

def main():
    """Extract WordNet MWEs, expand them into variants and write them out.

    Progress is printed to stdout. The sorted, de-duplicated variants are
    written to ``OUTPUT``, one per line, encoded as UTF-8. An expression
    whose variant generation raises is still kept in its raw form, and the
    failure is reported on stderr instead of being silently discarded.
    """
    import sys  # local import: only needed for failure reporting

    max_reported_failures = 10  # keep stderr readable on systematic errors

    print("Extracting WordNet MWEs...")
    wn_mwes = extract_wordnet_mwes()
    print(f"WordNet MWEs: {len(wn_mwes)}")

    all_mwes = {m.strip().lower() for m in wn_mwes if m.strip()}
    print(f"Combined raw MWE count: {len(all_mwes)}")

    final = set()
    failures = 0
    for count, m in enumerate(sorted(all_mwes), start=1):
        if count % 5000 == 0:
            print("Processed:", count)
        try:
            vars_for_m = generate_variants_for_mwe(m)
        except Exception as exc:
            # Best effort: keep the raw expression, but make the failure
            # visible so a systematic problem does not go unnoticed.
            failures += 1
            if failures <= max_reported_failures:
                print(
                    f"Warning: variant generation failed for {m!r}: {exc!r}",
                    file=sys.stderr,
                )
            final.add(m)
        else:
            final.update(vars_for_m)

    if failures:
        print(
            f"Warning: {failures} MWEs kept without variants due to errors",
            file=sys.stderr,
        )

    print(f"Total output variants (before sorting): {len(final)}")
    final_list = sorted(final)

    print(f"Writing to {OUTPUT} ...")
    with open(OUTPUT, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in final_list)
    print("Done.")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Extracting WordNet MWEs...
WordNet MWEs: 64188
Combined raw MWE count: 64188
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Processed: 25000
Processed: 30000
Processed: 35000
Processed: 40000
Processed: 45000
Processed: 50000
Processed: 55000
Processed: 60000
Total output variants (before sorting): 100269
Writing to english_mwes_complete.txt ...
Done.
