In [1]:
import string
import numpy as np
import re

In [50]:
class Segmentator:
    def __init__(self, lexicon=None, filepath=None):
        """Initialize lexicon"""
        if lexicon is not None:
            self.load_lex(lexicon)
        elif filepath is not None:
            self.load_lex_from_file(filepath)
        else:
            self.lexicon = None
    
    def prepare_lex(self, lexicon):
        """Splits every whole into list of words"""
        for i in range(len(lexicon)):
            lexicon[i] = np.array(lexicon[i].split())
        self.lexicon = lexicon
        
    def load_lex_from_file(self, filepath):
        """Loads lexicon from file "lexicon.txt".
        Every whole is store as list of words.        
        """
        self.lexicon = []
        with open("lexicon.txt", "r") as file: 
            for line in file: 
                line = line.split()
                self.lexicon.append(line)
        
    def segment(self, text):
        """Segment text.
        Punctuactions (others than dash between words)
        are treated as separetly segments.
        Words that create whole described 
        in "lexicon.txt" are theated as 
        total segment.
        
        Parameters:
        text str: analyzed text

        Returns:
        segments List[str]: segmented text
        """
        text = self._removeNewlines(text)
        segments = text.split()
        self._split_punctuation(segments)
        if self.lexicon is not None:
            self._merge_wholes(segments)
        self._add_eos(segments)
        return segments
        
    def _removeNewlines(self, text):
        """Remove newlines from text and save breaked words"""
        return text.replace("-\n", "").replace("\n", " ")
    
    def _split_punctuation(self, segments):
        """Split punctions that appear at the end of segments"""
        for i in reversed(range(len(segments))):
            if segments[i][-1] in string.punctuation:
                segments.insert(i+1, segments[i][-1])
                segments[i] = segments[i][:-1]
                
    def _add_eos(self, segments):
        """Add <eos> at the end of a sentence"""
        segments.append("<eos>")
        for i in reversed(range(len(segments)-1)):
            if segments[i][-1] in [".", "!", "?"] and segments[i+1][0].isupper():
                segments.insert(i+1, "<eos>")
    
    def _merge_wholes(self, segments):
        """Merge segments that form a whole
        
        Parameters:
        segments List[str]: segmented text
        """
        segments_to_merge = self._get_segments_to_merge(segments)
        for seg in segments_to_merge:
            start, end, whole = seg.values()
            segments[start] = whole
            del segments[start+1:end]
    
    def _get_segments_to_merge(self, segments):
        """Get wholes from text.
        
        Parameters:
        segments List[str]: segmented text

        Returns:
        segments_to_merge List[int, int, str]: 
            matrix n x 3 with
            every row containing:
            1. whole appearance beginning
            2. whole appearance ending
            3. whole sentence
        """
        segments_to_merge = []
        indices, wholes = self._get_potential_wholes_beginning_indicies(segments)
        for start, whole in zip(indices, wholes):
            end = start + len(whole)
            if (end < len(segments)) and (segments[start:end] == whole):
                segments_to_merge.append({
                    "start": start,
                    "end": end,
                    "whole": " ".join(whole)
                })
        return segments_to_merge
    
    def _get_potential_wholes_beginning_indicies(self, segments):
        """Give indicies where can possibly start wholes in text
        
        Parameters:
        segments List[str]: segmented text

        Returns:
        indicies List[int]: indicies where can start potential wholes
        wholes List[List[str]]: potential wholes (words splited into list)
        """
        indices = []
        wholes = []
        for whole in self.lexicon:
            for i, word in enumerate(segments):
                if word == whole[0]:
                    indices.append(i)
                    wholes.append(whole)
        return indices, wholes
    
    def print_segments(self, segments):
        """Prints segments"""
        for s in segments:
            print(s)


In [51]:
text = None
with open("text.txt") as file:
    text = file.read()
print(text)

Piłka nożna to wspaniały sport.
Biało-czerwoni zajeli swoje pozycje.
Żółta kartka dla napastika należy się.




In [52]:
segmentator = Segmentator(filepath="lexicon.txt")
segments = segmentator.segment(text)
segmentator.print_segments(segments)

Piłka nożna
to
wspaniały
sport
.
<eos>
Biało-czerwoni
zajeli
swoje
pozycje
.
<eos>
Żółta kartka
dla
napastika
należy
się
należy się
<eos>


In [None]:
# Co można jeszcze:
# inż, czy łączyć myślniki czy rozdzielać, daty; podział na strukture dokumentu (akapity, nagłówki)