In [66]:
import unicodedata
from collections import defaultdict

from gensim.models import KeyedVectors
from scipy import spatial

In [24]:
def process_text(text):
    """Split the raw dump into parallel lists of titles and contents.

    A line whose first three characters are ``###`` is treated as a title and
    the line directly after it as that title's content. Both are stored with
    surrounding whitespace stripped.

    Args:
        text: Sequence of lines, e.g. the result of ``file.readlines()``.

    Returns:
        A ``(titles, contents)`` tuple of equally long lists in which
        ``contents[i]`` belongs to ``titles[i]``.
    """
    titles, contents = [], []
    for i, line in enumerate(text):
        if line.startswith("###"):
            titles.append(line.strip())
            # A header on the very last line has no following line; pair it
            # with empty content instead of raising IndexError, so the two
            # lists stay aligned.
            contents.append(text[i + 1].strip() if i + 1 < len(text) else "")
    return titles, contents

In [96]:
def find_closed_marks(text, start, stop, max_len):
    """Find a run of tokens that opens and closes with a ''' mark.

    The token at ``start`` is accepted straight away when it is longer than
    six characters, begins with ''' and the token at ``stop`` ends with '''.
    Otherwise ``start`` is advanced to the first token that begins with '''
    and ``stop`` to the first token, at or after that opener, that ends
    with '''.

    Args:
        text: List of string tokens.
        start: Index at which to begin looking for the opening mark.
        stop: Index at which to begin looking for the closing mark.
        max_len: Upper bound on the token *index* the scans may advance to
            (an absolute position in ``text``, not a length relative to
            ``start``).

    Returns:
        ``(start, stop)`` with ``start <= stop`` for every call in which the
        caller passes ``start <= stop``, or ``None`` when no such span is
        found.
    """
    mark = "'''"

    def opens(token):
        return len(token) > 3 and token[:3] == mark

    def closes(token):
        return len(token) > 3 and token[-3:] == mark

    # Fast path: the initial positions already delimit a marked span.
    if len(text[start]) > 6 and text[start][:3] == mark and text[stop][-3:] == mark:
        return start, stop

    limit = min(len(text) - 1, max_len)
    while start < limit and not opens(text[start]):
        start += 1

    # The closing mark can only sit at or after the opener. Scanning from the
    # original ``stop`` could pick a closing token that precedes the opener,
    # yielding an inverted (start > stop) span that slices to nothing.
    stop = max(stop, start)
    while stop < limit and not closes(text[stop]):
        stop += 1

    if opens(text[start]) and closes(text[stop]):
        return start, stop
    return None

def strip_unwanted_punctuation(text):
    """Trim commas and parentheses from both ends of every token.

    Only leading and trailing characters are removed; the same characters
    inside a token are left alone. Falsy tokens are skipped, and tokens left
    with fewer than two characters after trimming are dropped.

    Args:
        text: Iterable of string tokens.

    Returns:
        A new list with the trimmed tokens, in their original order.
    """
    unwanted = ",)("
    stripped = []
    for word in text:
        if not word:
            continue
        # str.strip trims both ends symmetrically. The previous hand-written
        # trailing scan stopped at index 1 regardless of where the leading
        # scan ended, so a token such as "a," kept its comma.
        core = word.strip(unwanted)
        if len(core) > 1:
            stripped.append(core)
    return stripped

def strip_all_punctuation(text):
    """Trim Unicode punctuation from both ends of every string.

    A character counts as punctuation when its ``unicodedata`` general
    category starts with ``P``. Punctuation inside a string is kept. Falsy
    entries are skipped, and strings left with fewer than two characters
    after trimming are dropped.

    Args:
        text: Iterable of strings (single words or whole phrases).

    Returns:
        A new list with the trimmed strings, in their original order.
    """
    def is_punctuation(char):
        return unicodedata.category(char).startswith("P")

    stripped = []
    for word in text:
        if not word:
            continue
        start, stop = 0, len(word) - 1
        while start <= stop and is_punctuation(word[start]):
            start += 1
        # Bound the backward scan by ``start`` rather than a fixed index so
        # that trailing punctuation is removed from short strings too
        # (previously "a'''" came out as "a'").
        while stop > start and is_punctuation(word[stop]):
            stop -= 1
        if stop - start > 0:
            stripped.append(word[start:stop + 1])
    return stripped

In [9]:
def find_synonyms_1(text):
    """Collect marked spans that directly follow a cue word.

    Tokens at indices 1..9 are inspected. When one of them equals a cue word
    and another token follows it, ``find_closed_marks`` is asked for a span
    starting at that following token; every span found is returned as the
    corresponding slice of ``text``.

    Args:
        text: List of string tokens.

    Returns:
        List of token lists, one per span found.
    """
    cue_words = ["lub", "synonimy", "właściwie", "potocznie"]
    results = []
    upper = min(len(text), 10)
    for idx in range(1, upper):
        following = idx + 1
        if text[idx] not in cue_words or len(text) <= following:
            continue
        span = find_closed_marks(text, following, following, 10)
        if span is None:
            continue
        first, last = span
        results.append(text[first:last + 1])
    return results

def find_synonyms_2(text):
    """Collect marked spans that directly follow a token ending in a comma.

    Tokens at indices 1..9 are inspected. When one of them ends with ``,``
    and another token follows it, ``find_closed_marks`` is asked for a span
    starting at that following token; every span found is returned as the
    corresponding slice of ``text``.

    Args:
        text: List of string tokens.

    Returns:
        List of token lists, one per span found.
    """
    results = []
    for idx in range(1, min(len(text), 10)):
        # endswith() is False for an empty token, whereas indexing with [-1]
        # raised IndexError.
        if text[idx].endswith(",") and len(text) > idx + 1:
            res = find_closed_marks(text, idx + 1, idx + 1, 10)
            if res is not None:
                results.append(text[res[0]:res[1] + 1])
    return results

## Lista 1

In [25]:
# Read the raw corpus line by line. The encoding is given explicitly so the
# result does not depend on the platform's default locale.
# NOTE(review): UTF-8 is assumed for this file - confirm.
with open("/home/maria/Documents/ET/List_5/data/poczatki_wikipediowe.txt", "r", encoding="utf-8") as f:
    text = f.readlines()

# Parallel lists: titles[i] is the "###" header line, contents[i] the line after it.
titles, contents = process_text(text)

In [31]:
# Map every title to the marked spans found by both heuristics in its content.
synonyms = defaultdict(list)
for title, content in zip(titles, contents):
    tokens = strip_unwanted_punctuation(content.split(" "))
    found_after_cue = find_synonyms_1(tokens)
    found_after_comma = find_synonyms_2(tokens)
    for found in (found_after_cue, found_after_comma):
        synonyms[title].extend(found)

In [36]:
# Write one line per title that has at least one synonym different from itself:
# "<title> # <synonym> # ...". The encoding is explicit because the data is
# non-ASCII.
with open("/home/maria/Documents/ET/List_5/other/output_1.txt", "w", encoding="utf-8") as f:
    for k, v in synonyms.items():
        if not v:
            continue
        phrases = strip_all_punctuation([" ".join(s) for s in v])
        # k[4:] drops the first four characters of the title line.
        # dict.fromkeys removes duplicates while keeping insertion order, so
        # the title always comes first and the file is identical across runs
        # (a set gave an arbitrary order).
        filtered = list(dict.fromkeys([k[4:]] + phrases))
        if len(filtered) > 1:
            f.write(" # ".join(filtered))
            f.write('\n')

## Lista 5

In [140]:
def find_synonyms_1(text):
    """Collect single-token marked spans that directly follow a cue word.

    Tokens at indices 1..29 are inspected. When one of them equals a cue word
    and another token follows it, ``find_closed_marks`` is asked for a span
    starting at that following token. Only spans that consist of exactly one
    token are kept.

    Args:
        text: List of string tokens.

    Returns:
        List of one-element token lists, one per span kept.
    """
    cue_words = ["lub", "synonimy", "właściwie", "potocznie", "też", "również"]
    results = []
    upper = min(len(text), 30)
    for idx in range(1, upper):
        following = idx + 1
        if text[idx] not in cue_words or len(text) <= following:
            continue
        span = find_closed_marks(text, following, following, 10)
        if span is None:
            continue
        first, last = span
        # Keep the span only when it is a single word.
        if first == last:
            results.append(text[first:last + 1])
    return results

def find_synonyms_2(text):
    """Collect single-token marked spans that follow a token ending in a comma.

    Tokens at indices 1..29 are inspected. When one of them ends with ``,``
    and another token follows it, ``find_closed_marks`` is asked for a span
    starting at that following token. Only spans that consist of exactly one
    token are kept.

    Args:
        text: List of string tokens.

    Returns:
        List of one-element token lists, one per span kept.
    """
    results = []
    for idx in range(1, min(len(text), 30)):
        # endswith() is False for an empty token, whereas indexing with [-1]
        # raised IndexError.
        if text[idx].endswith(",") and len(text) > idx + 1:
            res = find_closed_marks(text, idx + 1, idx + 1, 10)
            # Keep the span only when it is a single word.
            if res is not None and res[0] == res[1]:
                results.append(text[res[0]:res[1] + 1])
    return results

def find_synonyms_3(text):
    """Collect every single-token marked span near the start of ``text``.

    For each index ``idx`` from 1 up to (but excluding)
    ``min(len(text) - 1, 30)``, ``find_closed_marks`` is asked for a span
    starting at ``idx + 1``, with no condition on the token at ``idx``. Only
    spans that consist of exactly one token are kept.

    Args:
        text: List of string tokens.

    Returns:
        List of one-element token lists, one per span kept.
    """
    results = []
    upper = min(len(text) - 1, 30)
    for idx in range(1, upper):
        following = idx + 1
        span = find_closed_marks(text, following, following, 10)
        if span is None:
            continue
        first, last = span
        if first == last:
            results.append(text[first:last + 1])
    return results

def find_synonyms_embeddings(title, text, embeddings, max_distance=0.5):
    """Keep candidate synonyms whose embedding lies close to the title's.

    Candidates are the tokens returned by ``find_synonyms_1``,
    ``find_synonyms_2`` and ``find_synonyms_3``. Each one is lower-cased and
    stripped of surrounding punctuation, then compared with the lower-cased
    title by cosine distance. Every accepted pair is printed.

    Args:
        title: Headword to compare against.
        text: List of string tokens to search for candidates.
        embeddings: Mapping from word to vector that raises ``KeyError`` for
            unknown words (e.g. gensim ``KeyedVectors``).
        max_distance: Candidates with a cosine distance strictly below this
            value are accepted. Defaults to 0.5.

    Returns:
        List of one-element lists, each holding an accepted cleaned synonym.
        Empty when the title itself has no embedding.
    """
    title_key = title.lower()
    try:
        title_embedding = embeddings[title_key]
    except KeyError:
        # The title is not in the vocabulary, so there is nothing to compare.
        return []

    candidates = find_synonyms_1(text) + find_synonyms_2(text) + find_synonyms_3(text)
    # Flatten the token lists and de-duplicate; sorting makes the order of the
    # printed and returned synonyms reproducible between runs.
    unique_tokens = sorted({token for span in candidates for token in span})

    close_synonyms = []
    for synonym in unique_tokens:
        cleaned_synonym = strip_all_punctuation([synonym.lower()])
        # Nothing left after stripping, or the candidate is the title itself.
        if not cleaned_synonym or cleaned_synonym[0] == title_key:
            continue
        try:
            synonym_embedding = embeddings[cleaned_synonym[0]]
        except KeyError:
            # Candidate is out of vocabulary.
            continue
        dist = spatial.distance.cosine(title_embedding, synonym_embedding)
        if dist < max_distance:
            print(title)
            print(cleaned_synonym)
            close_synonyms.append(cleaned_synonym)
    return close_synonyms

In [141]:
# Load the word vectors used for the embedding-based synonym filter.
word2vec_path = "/home/maria/Documents/ET/List_5/word2vec/word2vec_100_3_polish.bin"
word2vec = KeyedVectors.load(word2vec_path)

In [142]:
# Rebuild the synonym map, keeping only candidates confirmed by the embeddings.
# Only titles that split into exactly two whitespace-separated parts are
# processed; the second part is used as the headword.
synonyms = defaultdict(list)
for title, content in zip(titles, contents):
    title_parts = title.split()
    if len(title_parts) != 2:
        continue
    headword = title_parts[1]
    tokens = strip_unwanted_punctuation(content.split(" "))
    synonyms[title].extend(find_synonyms_embeddings(headword, tokens, word2vec))

Polonezköy
['adampol']
Alergia
['nadwrażliwość']
Alergia
['uczulenie']
Astma
['dychawica']
Akwaforta
['kwasoryt']
Abraham
['אברהם']
Ajschylos
['eschyl']
Bekerel
['bq']
Cydr
['jabłecznik']
Cent
['eurocent']
Dionizos
['bachus']
Dżinizm
['dżainizm']
Dżinizm
['dżajnizm']
Drawidowie
['drawidzi']
Emotikon
['emotikona']
Epos
['epopeja']
Epistemologia
['gnoseologia']
Glicyna
['gly']
Grej
['gy']
Holocen
['aluwium']
JHWH
['יהוה']
Guomindang
['kuomintang']
Kaloria
['kilokaloria']
Komiacy
['zyrianie']
Komiacy
['komi']
LaTeX
['tex']
Mieszaniec
['hybryd']
Mieszaniec
['hybryda']
Monoid
['monoidem']
Mendog
['mindowe']
Negacja
['zaprzeczenie']
Natura
['przyroda']
Okulistyka
['oftalmologia']
Ortopedia
['traumatologia']
Ontologia
['metafizyka']
Procesor
['cpu']
Podprogram
['funkcja']
Podprogram
['procedura']
Płanetnik
['chmurnik']
Perperuna
['dodola']
Planetoida
['asteroida']
Ptolemeusze
['lagidzi']
Powstanie
['insurekcja']
Równoważność
['ekwiwalencja']
Rower
['bicykl']
Rower
['rover']
Rower
['welocyped'

In [144]:
# Write one line per title that has at least one synonym different from itself:
# "<title> # <synonym> # ...". The encoding is explicit because the data is
# non-ASCII.
with open("/home/maria/Documents/ET/List_5/other/output_2.txt", "w", encoding="utf-8") as f:
    for k, v in synonyms.items():
        if not v:
            continue
        phrases = strip_all_punctuation([" ".join(s) for s in v])
        # k[4:] drops the first four characters of the title line.
        # dict.fromkeys removes duplicates while keeping insertion order, so
        # the title always comes first and the file is identical across runs
        # (a set gave an arbitrary order).
        filtered = list(dict.fromkeys([k[4:]] + phrases))
        if len(filtered) > 1:
            f.write(" # ".join(filtered))
            f.write('\n')