In [1]:
import polars as pl

# The TSV has no header row (has_header=False), so column names are supplied
# through the schema; every column is read as a string.
raw_columns = ["source", "target", "source POS", "target POS", "affix", "type"]
df = pl.read_csv(
    'eng.derivational.v1.tsv',
    separator='\t',
    has_header=False,
    schema={column: pl.String for column in raw_columns},
)

In [2]:
# Keep only the four columns used below, lower-casing the three text columns.
# str.replace removes the first '#etymology_<digit>' marker from the affix.
# Afterwards drop rows where target equals source, and rows where either word
# contains any character outside a-z.
df = (
    df.select(
        pl.col('target').str.to_lowercase(),
        pl.col('source').str.to_lowercase(),
        pl.col('affix').str.to_lowercase().str.replace('#etymology_\\d', ''),
        'type',
    )
    .filter(
        (pl.col('target') != pl.col('source'))
        & ~pl.col('target').str.contains('[^a-z]')
        & ~pl.col('source').str.contains('[^a-z]')
    )
)

In [None]:
# Minimum number of rows an affix needs in order to be kept as a prefix.
min_prefix_count = 5

# A candidate prefix row is typed 'prefix', has an affix longer than one
# character, and that affix consists of a-z only.
is_candidate_prefix = (
    (pl.col('type') == 'prefix')
    & (pl.col('affix').str.len_chars() > 1)
    & ~pl.col('affix').str.contains('[^a-z]')
)

# One row per distinct prefix with its occurrence count in column 'len'.
prefix_counts = (
    df.filter(is_candidate_prefix)
    .select(pl.col('affix').alias('prefix'))
    .group_by('prefix')
    .agg(pl.len())
)

# Keep the sufficiently frequent prefixes, alphabetically sorted.
prefixes = (
    prefix_counts
    .filter(pl.col('len') >= min_prefix_count)
    .select('prefix')
    .sort(by='prefix')
)
prefixes.write_csv('prefixes.csv')

In [None]:
# Minimum number of rows an affix needs in order to be kept as a suffix.
min_suffix_count = 5

# Suffixes made of a-z only that occur at least min_suffix_count times.
base_suffixes = (
    df.filter((pl.col('type') == 'suffix') & ~pl.col('affix').str.contains('[^a-z]'))
    .select(pl.col('affix').alias('suffix'))
    .group_by('suffix')
    .agg(pl.len())
    .filter(pl.col('len') >= min_suffix_count)
    .select('suffix')
)

last_char = pl.col('suffix').str.tail(1)
all_but_last_char = pl.col('suffix').str.head(-1)

# Extra spelling variants derived from the base list
# (presumably to cover spelling changes where suffixes attach -- TODO confirm):
#   * suffixes ending in e, y or i with that final letter removed
#   * suffixes ending in y with the final y replaced by i
final_letter_dropped = base_suffixes.filter(last_char.is_in(('e', 'y', 'i'))).select(all_but_last_char)
final_y_as_i = base_suffixes.filter(last_char == 'y').select(all_but_last_char + 'i')

# Merge, keep entries longer than one character plus the single letters
# y, s and i, de-duplicate and sort alphabetically.
suffixes = (
    pl.concat([base_suffixes, final_letter_dropped, final_y_as_i])
    .filter((pl.col('suffix').str.len_chars() > 1) | (pl.col('suffix').is_in(('y', 's', 'i'))))
    .unique()
    .sort(by='suffix')
)
suffixes.write_csv('suffixes.csv')

In [6]:
# Keep only derivations whose affix is on the corresponding whitelist:
# rows typed 'prefix' are checked against the prefix list, every other row
# against the suffix list.
# is_in is given the whitelist column as a Series rather than the whole
# one-column DataFrame, so the membership test does not rely on an implicit
# DataFrame-to-Series coercion.
df = df.filter(
    pl.when(pl.col('type') == 'prefix')
    .then(pl.col('affix').is_in(prefixes['prefix']))
    .otherwise(pl.col('affix').is_in(suffixes['suffix']))
)

In [7]:
# Derived words (targets) that are removed from the table outright.
forbidden_words = ['is']
is_forbidden_target = pl.col('target').is_in(forbidden_words)
df = df.filter(is_forbidden_target.not_())

In [8]:
# Seed table: one row per derivation, with the source word as the current stem
# and the affix placed in the 'prefix' or 'suffix' column according to its type.
cur = df.select(
    'target',
    pl.col('source').alias('stem'),
    pl.when(pl.col('type') == 'prefix').then(pl.col('affix')).otherwise(pl.lit(None)).alias('prefix'),
    pl.when(pl.col('type') == 'suffix').then(pl.col('affix')).otherwise(pl.lit(None)).alias('suffix'),
)

# Derivation rows keyed by the derived word (renamed to 'stem' so they can be
# joined onto the current stem). df is not modified inside the loop, so these
# are built once.
suffix_steps = df.filter(pl.col('type') == 'suffix').rename({'target': 'stem'})
prefix_steps = df.filter(pl.col('type') == 'prefix').rename({'target': 'stem'})

# If the join found no derivation for the current stem (affix is null) the stem
# is kept; otherwise it is replaced by the source of the matched derivation.
reduced_stem = pl.when(pl.col('affix').is_null()).then(pl.col('stem')).otherwise(pl.col('source')).alias('stem')

# Fixed number of reduction passes; each pass peels at most one suffix and then
# at most one prefix off every stem.
for _ in range(10):
    # Suffix pass. A stem matching several derivation rows fans out in the
    # join; group_by + last() collapses back to one row per target. The newly
    # found affix is put in front of the suffixes collected so far.
    cur = (
        cur.join(suffix_steps, on='stem', how='left')
        .group_by('target')
        .agg(pl.all().last())
        .select(
            'target',
            reduced_stem,
            'prefix',
            pl.concat_str(pl.col('affix'), pl.col('suffix'), separator='|', ignore_nulls=True).alias('suffix'),
        )
        # concat_str yields '' when both parts are null; turn that back into null.
        .with_columns(suffix=pl.when(pl.col('suffix').str.len_chars() == 0).then(None).otherwise(pl.col('suffix')))
    )
    # Prefix pass. Same shape, but the newly found affix is appended after the
    # prefixes collected so far; the concatenation keeps the name 'prefix'.
    cur = (
        cur.join(prefix_steps, on='stem', how='left')
        .group_by('target')
        .agg(pl.all().last())
        .select(
            'target',
            reduced_stem,
            pl.concat_str(pl.col('prefix'), pl.col('affix'), separator='|', ignore_nulls=True),
            'suffix',
        )
        .with_columns(prefix=pl.when(pl.col('prefix').str.len_chars() == 0).then(None).otherwise(pl.col('prefix')))
    )

# Drop rows whose stem still occurs as a derived word in df, i.e. rows that
# were not reduced to an underived stem.
cur = cur.filter(~pl.col('stem').is_in(df['target']))
cur

target,stem,prefix,suffix
str,str,str,str
"""antichlorotic""","""chlorosis""","""anti""","""ic"""
"""stalkiness""","""stalk""",,"""y|ness"""
"""worldlike""","""world""",,"""like"""
"""ketoglutaric""","""glutaric""","""keto""",
"""gamification""","""game""",,"""ification"""
…,…,…,…
"""brazenness""","""brass""",,"""en|ness"""
"""thermoclinic""","""thermocline""",,"""ic"""
"""polyornithine""","""ornithine""","""poly""",
"""sublegal""","""legal""","""sub""",


In [None]:
# Export disabled: uncomment to write the table, sorted by target, to dictionary.csv.
#cur.sort(by='target').write_csv('dictionary.csv')

In [10]:
# Ascending sort on the character length of the '|'-joined prefix string;
# the last ten rows are therefore the ones with the longest prefix strings.
prefix_string_length = pl.col('prefix').str.len_chars()
cur.sort(by=prefix_string_length).tail(10)

target,stem,prefix,suffix
str,str,str,str
"""pentahydroxyanthraquinone""","""quinone""","""penta|hydroxy|anthra""",
"""quasihemidemisemiquaver""","""quave""","""quasi|hemi|demi|semi""","""er"""
"""radioimmunoelectrophoresis""","""phoresis""","""radio|immuno|electro""",
"""tetrahydroxyanthraquinone""","""quinone""","""tetra|hydroxy|anthra""",
"""lysoglycerophospholipid""","""lipid""","""lyso|glycero|phospho""",
"""heptahydroxyanthraquinone""","""quinone""","""hepta|hydroxy|anthra""",
"""electromagnetohydrodynamics""","""dynamic""","""electro|magneto|hydro""","""s"""
"""electromagnetohydrodynamic""","""dynamic""","""electro|magneto|hydro""",
"""hydroxycyclophosphamide""","""ammonia""","""hydroxy|cyclo|phospho""","""ide"""
"""counterimmunoelectrophoresis""","""phoresis""","""counter|immuno|electro""",


In [15]:
# Ascending sort on the character length of the '|'-joined suffix string;
# the last ten rows are therefore the ones with the longest suffix strings.
suffix_string_length = pl.col('suffix').str.len_chars()
cur.sort(by=suffix_string_length).tail(10)

target,stem,prefix,suffix
str,str,str,str
"""differentiatedly""","""differ""",,"""ent|ence|ial|ate|ed|ly"""
"""scintigraphically""","""scintilla""",,"""ate|ion|graphy|ic|ally"""
"""existentialistically""","""exist""",,"""ent|ence|ial|istic|ally"""
"""differentiational""","""differ""",,"""ent|ence|ial|ate|ion|al"""
"""differentiatedness""","""differ""",,"""ent|ence|ial|ate|ed|ness"""
"""differentiably""","""differ""",,"""ent|ence|ial|ate|able|ly"""
"""ultradifferentiability""","""differ""","""ultra""","""ent|ence|ial|ate|able|ity"""
"""differentiability""","""differ""",,"""ent|ence|ial|ate|able|ity"""
"""undifferentiability""","""differ""","""un""","""ent|ence|ial|ate|able|ity"""
"""nondifferentiability""","""differ""","""non""","""ent|ence|ial|ate|able|ity"""


In [None]:
# Occurrence count of every affix remaining in df, most frequent first.
affix_counts = df['affix'].value_counts()
affixes = affix_counts.sort(by='count', descending=True)
affixes

affix,count
str,u32
"""ly""",13565
"""un""",10677
"""ness""",9505
"""non""",9354
"""er""",7911
…,…
"""ense""",1
"""pupillo""",1
"""uran""",1
"""dromo""",1
