Download DataFrame with polish bills

In [14]:
!gdown 1lDlTP5D_t3WCLE9udm9TuFt28zcOCL5V

Downloading...
From: https://drive.google.com/uc?id=1lDlTP5D_t3WCLE9udm9TuFt28zcOCL5V
To: /content/df_bills.parquet
  0% 0.00/620k [00:00<?, ?B/s]100% 620k/620k [00:00<00:00, 122MB/s]


In [15]:
import pandas as pd

In [16]:
df = pd.read_parquet('df_bills.parquet')

In [17]:
df.head()

Unnamed: 0,token,count,rank
0,art,83804,1
1,ust,53636,2
2,się,45886,3
3,lub,45800,4
4,poz,45224,5


In [18]:
df.tail()

Unnamed: 0,token,count,rank
54939,alkoholomierza,1,46894
54940,użytą,1,46894
54941,odwadnia,1,46894
54942,odkaża,1,46894
54943,tytoniowej,1,46894


### 8. Install Morfeusz (Binding dla Pythona) and use it to find all words that do not appear in that dictionary

In [8]:
# !pip install morfeusz2
from morfeusz2 import Morfeusz

In [9]:
morfeusz =  Morfeusz(aggl="isolated")

In [10]:
morfeusz.analyse("nosuchword")

[(0, 1, ('nosuchword', 'nosuchword', 'ign', [], []))]

In [19]:
def in_morfeusz(token: str) -> bool:
    return morfeusz.analyse(token)[0][2][2] != 'ign'

In [20]:
df_ign = df[~df['token'].apply(lambda token: in_morfeusz(token))]

### 9. Find 30 words with the highest ranks that do not belong to the dictionary.

In [21]:
df_ign[:30]

Unnamed: 0,token,count,rank
355,późn,1065,355
1409,gmo,298,1408
1953,sww,216,1948
2153,skw,196,2149
2541,ike,162,2538
3327,remediacji,120,3316
3719,ure,103,3709
3899,uke,97,3885
4501,pkwiu,81,4487
4671,udt,77,4648


#### 10. Find 30 random words (i.e. shuffle the words) with 5 occurrences that do not belong to the dictionary.


In [25]:
task_10 = df_ign[df_ign['count'] == 5].sample(30)
task_10

Unnamed: 0,token,count,rank
24280,heptanol,5,23246
23777,sposb,5,23246
24325,winopochodne,5,23246
24144,ami,5,23246
23258,schetyna,5,23246
23862,wlkp,5,23246
23068,kpwig,5,23246
23743,vista,5,23246
24096,agave,5,23246
24307,rialnego,5,23246


#### 11. Use Levenshtein distance and the frequency list, to determine the most probable correction of the words from lists defined in points 8 and 9.

In [45]:
df_unknown = df[~df['token'].apply(lambda token: in_morfeusz(token))]
df_known = df[df['token'].apply(lambda token: in_morfeusz(token))]

assert len(df_unknown) + len(df_known) == len(df)

In [94]:
df_unknown[df['token'] == 'poźn']

  """Entry point for launching an IPython kernel.


Unnamed: 0,token,count,rank
35164,poźn,2,35136


In [102]:
df_known[df['token'] == 'plan']

  """Entry point for launching an IPython kernel.


Unnamed: 0,token,count,rank
1052,plan,400,1054


In [71]:
unknown_tkns = df_unknown.set_index('token')['count'].to_dict()
known_tkns = df_known.set_index('token')['count'].to_dict()
total_tkns = df_known['count'].sum()

In [104]:
assert 'późn' in unknown_tkns
assert 'plan' in known_tkns

In [72]:
def get_token_occurence_probability(token: str):
    return known_tkns[token] / total_tkns if token in known_tkns else 0

get_token_occurence_probability("art")

0.029835697883475443

In [119]:
def one_edit_away(token: str):
    letters = 'aąbcćdeęfghijklmnoópqrstuvwxyzźż'
    splits = [(token[:i], token[i:]) for i in range(len(token) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R  for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def two_edits_away(token: str):
    return (levD2 for levD1 in one_edit_away(token) for levD2 in one_edit_away(levD1))

In [120]:
print(one_edit_away('późn'))
print('późno' in one_edit_away('późn'))

{'póąn', 'pzóźn', 'późen', 'pźón', 'póen', 'róźn', 'vpóźn', 'późgn', 'pówn', 'ppóźn', 'pźźn', 'poźn', 'wóźn', 'późx', 'póćźn', 'póhn', 'póąźn', 'póyźn', 'pnźn', 'pęźn', 'późnó', 'późnr', 'późnv', 'pózn', 'sóźn', 'pyźn', 'późę', 'pólźn', 'pókn', 'późż', 'fpóźn', 'kpóźn', 'póvn', 'pcóźn', 'późz', 'pgóźn', 'późln', 'lpóźn', 'późna', 'późnp', 'późó', 'pmóźn', 'póin', 'póęźn', 'póóźn', 'póź', 'póaźn', 'póqn', 'późąn', 'późnf', 'pórn', 'bpóźn', 'pęóźn', 'póuźn', 'lóźn', 'póxn', 'ópóźn', 'rpóźn', 'tpóźn', 'próźn', 'późkn', 'późnż', 'późw', 'pókźn', 'źpóźn', 'pónźn', 'późj', 'późq', 'późrn', 'późnk', 'qóźn', 'pójn', 'późk', 'żpóźn', 'późą', 'pjóźn', 'póun', 'późnć', 'póon', 'późg', 'paźn', 'aóźn', 'piźn', 'późnę', 'późl', 'pózźn', 'ćpóźn', 'pógźn', 'późon', 'późin', 'uóźn', 'pqóźn', 'późy', 'opóźn', 'późd', 'ópźn', 'mpóźn', 'oóźn', 'pdźn', 'późv', 'pówźn', 'paóźn', 'późfn', 'ipóźn', 'pójźn', 'późp', 'dpóźn', 'pwźn', 'późxn', 'późć', 'pógn', 'prźn', 'pótźn', 'późi', 'pxóźn', 'ęóźn', 'pdóźn', 'p

In [121]:
def get_only_known_tokens(tokens):
    return set(token for token in tokens if token in known_tkns)

In [122]:
get_only_known_tokens(one_edit_away('późn'))

set()

In [124]:
get_only_known_tokens(two_edits_away('późn'))

{'pan', 'paź', 'plan', 'pln', 'pól'}

In [130]:
def get_all_token_corrections(token: str):
    tokens_one_edit_away = one_edit_away(token)
    tokens_two_edits_away = two_edits_away(token)

    return (
        # Firstly, check if token is in known tokens
        get_only_known_tokens(token)
        # Secondly, check tokens with one and two edit distance
        or get_only_known_tokens(tokens_one_edit_away)
        or get_only_known_tokens(tokens_two_edits_away)
        # If cannot find, just return token
        or [token]
    )

In [131]:
def correct_using_most_probable_correction(token: str):
    return max(get_all_token_corrections(token), key=get_token_occurence_probability)

In [132]:
df_corrections = df_unknown.iloc[:30]
df_corrections['correction'] = df_corrections['token'].apply(correct_using_most_probable_correction)
df_corrections

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,token,count,rank,correction
355,późn,1065,355,plan
1409,gmo,298,1408,imo
1953,sww,216,1948,swe
2153,skw,196,2149,sów
2541,ike,162,2538,ile
3327,remediacji,120,3316,mediacji
3719,ure,103,3709,urz
3899,uke,97,3885,ust
4501,pkwiu,81,4487,kwitu
4671,udt,77,4648,ust


In [133]:
# Save df_unknown to parquet and return to VSCode local environment

In [134]:
df_unknown.to_parquet('df_unknown.parquet')