In [None]:
from typing import Any, List, Union

In [None]:
def get_rid_of_dashes(line: Union[str, List[str]]) -> Union[str, List[str]]:
    """Drop a leading dash from a line of a subtitles corpus.

    Works on a raw string (the first character is compared with "-") as well
    as on a list of tokens (the first token is compared with "-"), which is
    the form ``sieve_corpus_with_heuristics`` passes in.

    Args:
        line: The line, either as a string or as a list of tokens.

    Returns:
        ``line`` without its first element when that element is exactly "-",
        otherwise ``line`` unchanged. Empty input is returned as is.
    """
    # Guard against empty input: indexing [0] on "" or [] raises IndexError,
    # which is what a blank corpus line turns into after ``str.split()``.
    if line and line[0] == "-":
        return line[1:]
    return line


def get_rid_of_to_short_lines(line: str) -> Union[str, List[Any]]:
    """Reject lines whose length does not exceed 3.

    The length is whatever ``len`` reports for the argument: characters for a
    string, tokens for the token lists that ``sieve_corpus_with_heuristics``
    passes in.

    Returns:
        ``line`` itself when ``len(line) > 3``, otherwise an empty list, which
        callers treat as "drop this line".
    """
    if len(line) <= 3:
        return []
    return line

def get_rid_of_cursing(line: Union[str, List[str]]) -> Union[str, List[Any]]:
    """Reject lines that contain a vulgar word.

    Matching is a case-insensitive substring test against a fixed list of
    stems, so inflected forms built on a stem are caught as well.

    Args:
        line: The line, either as a string or as a list of tokens.

    Returns:
        An empty list when any stem occurs in the line, otherwise ``line``
        unchanged.
    """
    curses = (
        "chuj",
        "jeb",
        "gówno",
        "srać",
        "pizd",
        "szmat",
        "pierdol",
        "cholera",
        "kurw",
        "pieprzy",
        "dup",
        "sukin",
        "suka",
        "zdzira",
        "rucha",
    )
    # Iterating a plain string yields single characters, which can never
    # contain a multi-character stem; treat a string as one chunk of text.
    # No stem contains whitespace, so this matches the per-token result.
    chunks = [line] if isinstance(line, str) else line
    for chunk in chunks:
        lowered = chunk.lower()
        if any(curse in lowered for curse in curses):
            return []
    return line



def get_rid_of_names(line: str) -> Union[str, List[Any]]:
    """Reject lines that look like they mention a city or a person.

    Every element after the first is inspected; an element whose first
    character is upper case counts as a name. The first element is skipped,
    so a capitalised line start is not treated as a name.

    Returns:
        An empty list when such an element is found, otherwise ``line``
        unchanged.
    """
    tail = line[1:]
    has_capitalised = any(item[0].isupper() for item in tail)
    return [] if has_capitalised else line


def get_rid_of_ellipsis(line: Union[str, List[str]]) -> Union[str, List[Any]]:
    """Reject lines that end with an ellipsis ("...").

    Args:
        line: The line, either as a string or as a list of tokens.

    Returns:
        An empty list when the line (for a string) or its last token (for a
        list) ends with "...", otherwise ``line`` unchanged. Empty input is
        returned as is.
    """
    # Nothing to inspect; indexing [-1] on empty input would raise IndexError.
    if not line:
        return line
    # Comparing a single character or a whole token with "..." misses plain
    # strings and tokens with an attached ellipsis such as "idzie...".
    ending = line if isinstance(line, str) else line[-1]
    return [] if ending.endswith("...") else line

def sieve_corpus_with_heuristics(corpus_file_name: str, sieved_corpus: str) -> None:
    """Copy the lines of a corpus that pass every heuristic filter.

    Each line is split on whitespace, a leading "-" token is dropped, and the
    result is run through the length, cursing, names and ellipsis filters in
    that order. Lines that survive are written out with single spaces between
    tokens.

    Args:
        corpus_file_name: Path of the UTF-8 corpus to read.
        sieved_corpus: Path of the UTF-8 output file. It is opened in append
            mode, so existing content is kept and new lines are added after it.
    """
    filters = (
        get_rid_of_to_short_lines,
        get_rid_of_cursing,
        get_rid_of_names,
        get_rid_of_ellipsis,
    )
    with open(corpus_file_name, "r", encoding="utf8") as corpus, open(
        sieved_corpus, "a", encoding="utf8"
    ) as saving_file:
        for raw_line in corpus:
            tokens = raw_line.split()
            # Blank lines split into an empty list; skip them here so the
            # index-based filters are never handed empty input.
            if not tokens:
                continue
            corpus_line = get_rid_of_dashes(tokens)
            # all() short-circuits, so the filters run in the listed order and
            # stop at the first one that rejects the line (returns []).
            if all(keep(corpus_line) for keep in filters):
                saving_file.write(" ".join(corpus_line) + "\n")


In [None]:
def sieve_polimorfologik(
    polimorfologik_path: str, sieved_polimorfologik_path: str
) -> None:
    """Extract second person singular verb forms from a polimorfologik file.

    Every input line is split on ";". The last field is read as a
    ":"-separated tag list; a line is kept when the first tag is "verb" and
    both "sg" and "sec" occur among the tags. For kept lines the second
    ";"-separated field is written to the output, one per line.

    Args:
        polimorfologik_path: Path of the UTF-8 input file.
        sieved_polimorfologik_path: Path of the UTF-8 output file. It is
            opened in append mode, so existing content is kept.
    """
    with open(polimorfologik_path, "r", encoding="utf-8") as polimorfik, open(
        sieved_polimorfologik_path, "a", encoding="utf-8"
    ) as sieved:
        for line in polimorfik:
            # Strip the line terminator first: otherwise the last tag keeps a
            # trailing "\n" and a final "sg" or "sec" tag is never matched.
            fields = line.rstrip("\n").split(";")
            # Lines without a ";" have no second field to write out.
            if len(fields) < 2:
                continue
            tags = fields[-1].split(":")
            if tags[0] != "verb":
                continue
            if "sg" not in tags or "sec" not in tags:
                continue
            sieved.write(fields[1] + "\n")



def further_sieving_of_polimorfologik(
    file_to_sieve: str, sieved_file_path: str
) -> None:
    """Copy a word list, dropping entries with unwanted endings.

    Lines ending in "łobyś", "łoś" or "że" are skipped (presumably neuter
    verb forms and forms with the "-że" particle — confirm against the data);
    every other line is written to the output unchanged.

    Args:
        file_to_sieve: Path of the UTF-8 input file, one entry per line.
        sieved_file_path: Path of the UTF-8 output file. It is opened in
            append mode, so existing content is kept.
    """
    endings_not_wanted = ("łobyś", "łoś", "że")
    with open(file_to_sieve, "r", encoding="utf-8") as to_sieve, open(
        sieved_file_path, "a", encoding="utf-8"
    ) as sieved:
        for line in to_sieve:
            # Remove only the line terminator; slicing off the last character
            # would corrupt a final line that has no trailing newline.
            entry = line.rstrip("\n")
            if entry.endswith(endings_not_wanted):
                continue
            sieved.write(entry + "\n")

def verbs_in_male_forms(verbs: str, only_male_verbs: str) -> None:
    """Copy a verb list, dropping entries with feminine-looking endings.

    Lines ending in "łabyś" or "łaś" are skipped; every other line is written
    to the output unchanged.

    Args:
        verbs: Path of the UTF-8 input file, one verb form per line.
        only_male_verbs: Path of the UTF-8 output file. It is opened in
            append mode, so existing content is kept.
    """
    endings_not_wanted = ("łabyś", "łaś")
    # The input handle gets its own name so the ``verbs`` path parameter is
    # not rebound to a file object.
    with open(verbs, "r", encoding="utf-8") as verbs_file, open(
        only_male_verbs, "a", encoding="utf-8"
    ) as sieved:
        for line in verbs_file:
            # Remove only the line terminator; slicing off the last character
            # would corrupt a final line that has no trailing newline.
            entry = line.rstrip("\n")
            if entry.endswith(endings_not_wanted):
                continue
            sieved.write(entry + "\n")

