In [119]:
import re

text = '''After World War II, the British greatly reduced the use of the full stop and other punctuation points after abbreviations in at least semi-formal writing, while the Americans more readily kept such use until more recently, and still maintain it more than Britons. The classic example, considered by their American counterparts quite curious, was the maintenance of the internal comma in a British organisation of secret agents called the "Special Operations, Executive", "S.O., E", which is not found in histories written after about 1960.
But before that, many Britons were more scrupulous at maintaining the French form. In French, the period only follows an abbreviation if the last letter in the abbreviation is not the last letter of its antecedent: "M." is the abbreviation for "monsieur" while "Mme" is that for "madame". Like many other cross-channel linguistic acquisitions, many Britons readily took this up and followed this rule themselves, while the Americans took a simpler rule and applied it rigorously.
Over the years, however, the lack of convention in some style guides has made it difficult to determine which two-word abbreviations should be abbreviated with periods and which should not. The U.S. media tend to use periods in two-word abbreviations like United States (U.S.), but not personal computer (PC) or television (TV). Many British publications have gradually done away with the use of periods in abbreviations.
Minimization of punctuation in typewritten material became economically desirable in the 1960s and 1970s for the many users of carbon-film ribbons since a period or comma consumed the same length of non-reusable expensive ribbon as did a capital letter.
Widespread use of electronic communication through mobile phones and the Internet during the 1990s allowed for a marked rise in colloquial abbreviation. This was due largely to increasing popularity of textual communication services such as instant- and text messaging. SMS, for instance, supports message lengths of 160 characters at most (using the GSM 03.38 character set). This brevity gave rise to an informal abbreviation scheme sometimes called Textese, with which 10% or more of the words in a typical SMS message are abbreviated. More recently Twitter, a popular social networking service, began driving abbreviation use with 140 character message limits.'''


# توکن بندی با علامت

words = re.findall(r"\w+\.\w+\.\w?|\d+\.+\d+|\w+|.\s|\.",text)
print(words)

['After', 'World', 'War', 'II', ', ', 'the', 'British', 'greatly', 'reduced', 'the', 'use', 'of', 'the', 'full', 'stop', 'and', 'other', 'punctuation', 'points', 'after', 'abbreviations', 'in', 'at', 'least', 'semi', 'formal', 'writing', ', ', 'while', 'the', 'Americans', 'more', 'readily', 'kept', 'such', 'use', 'until', 'more', 'recently', ', ', 'and', 'still', 'maintain', 'it', 'more', 'than', 'Britons', '. ', 'The', 'classic', 'example', ', ', 'considered', 'by', 'their', 'American', 'counterparts', 'quite', 'curious', ', ', 'was', 'the', 'maintenance', 'of', 'the', 'internal', 'comma', 'in', 'a', 'British', 'organisation', 'of', 'secret', 'agents', 'called', 'the', 'Special', 'Operations', ', ', 'Executive', ', ', 'S.O.', ', ', 'E', ', ', 'which', 'is', 'not', 'found', 'in', 'histories', 'written', 'after', 'about', '1960', '.\n', 'But', 'before', 'that', ', ', 'many', 'Britons', 'were', 'more', 'scrupulous', 'at', 'maintaining', 'the', 'French', 'form', '. ', 'In', 'French', ', '

In [120]:
# stemming نرمال سازی و  

pattern = r'[^a-zA-z0-9\s]'
sentence = re.sub(pattern, '', text)
sentence = sentence.lower()

contractions_dict = { "ain’t": "are not", "’s":" is", "aren’t": "are not", "can’t": "cannot", "can’t’ve": "cannot have", "‘cause": "because", "could’ve": "could have", "couldn’t": "could not", "couldn’t’ve": "could not have", "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not", "hadn’t’ve": "had not have", "hasn’t": "has not", "haven’t": "have not", "he’d": "he would", "he’d’ve": "he would have", "he’ll": "he will", "he’ll’ve": "he will have", "how’d": "how did",
"they’ll’ve": "they will have", "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are", "we’ve": "we have", "weren’t": "were not","what’ll": "what will", "what’ll’ve": "what will have", "what’re": "what are", "what’ve": "what have", "when’ve": "when have", "where’d": "where did", "where’ve": "where have",
"who’ll": "who will", "who’ll’ve": "who will have", "who’ve": "who have", "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have", "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all", "y’all’d": "you all would", "y’all’d’ve": "you all would have", "y’all’re": "you all are", "y’all’ve": "you all have", "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have", "you’re": "you are", "you’ve": "you have"}

words = re.findall(r"\d+\.+\d+|\d+s|\w+\.\w+\.\w?|\w+",text)
contractions_re = re.compile('(%s)'%'|'.join(contractions_dict.keys()))
def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)
sentence = expand_contractions(sentence)

In [121]:
class PorterStemmer:
    
    def isCons(self, letter):
        letter = letter.lower()
        if letter == 'a' or letter == 'e' or letter == 'i' or letter == 'o' or letter == 'u':
            return False
        else:
            return True
        
    def isvowel(self, letter):
        return (not self.isCons(letter))
    

    # *S
    def endsWith(self, stem):
        if re.findall(r'(\w+)s\b',stem):
            return True
        else:
            return False

    # *v*
    def containsVowel(self, stem):
        for i in range(0, len(stem)):
            if self.isvowel(stem[i]):
                return True
        return False

    # *d
    def doubleCons(self, stem):
        form = self.getForm(stem)
        if form[-1] == 'C' and form[-2] == 'C':
            return True
        else:
            return False

    def getForm(self, word):
        form = []
        formStr = ''
        for i in range(0 , len(word)):
            if self.isvowel(word[i]):
                form.append('V')
            else:
                form.append('C')
        for j in form:
            formStr += j
        return formStr

    def getM(self, word):
        form = self.getForm(word)
        m = form.count('VC')
        return m
    
    def get_base(self, word, suf):
        suflen = word.rfind(suf)
        base = word[:suflen]
        return base
    
    # *o
    def cvc(self, stem):
        stem = stem.lower()
        form = self.getForm(stem)
        if form[-1] == 'C' and form[-2] == 'V' and form[-3] == 'C' and stem[-1] not in 'xyz':
            return True
        else:
            return False

    def replaceM0(self, orig, rem, rep):

        result = orig.rfind(rem)
        base = orig[:result]
        if self.getM(base) >= 0:
            replaced = base + rep
            return replaced
        else:
            return orig

    def replaceM1(self, orig, rem, rep):

        result = orig.rfind(rem)
        base = orig[:result]
        if self.getM(base) > 1:
            replaced = base + rep
            return replaced
        else:
            return orig

    def step1a(self, word):

        if word.endswith('sses'):
            word = re.sub('sses', 'ss', word)
        elif word.endswith('ies'):
            word = re.sub('ies', 'i', word)
        elif word.endswith('ss'):
            word = re.sub('ss', 'ss', word)
        elif word.endswith('s'):
            word = re.sub('s', '', word)
        else:
            pass
        return word

    def step1b(self, word):
 
        flag = False
        worrd = word
        if word.endswith('eed'):
            result = len('eed')
            base = word[:result]
            if self.getM(base) > 0:
                worrd = re.sub('eed', 'ee',word)
        elif word.endswith('ed'):
            result = word.rfind('ed')
            base = word[:result]
            if self.containsVowel(base):
                worrd = base
                worrd = self.part_1b(worrd)
                flag = True
        elif word.endswith('ing'):
            result = word.rfind('ing')
            base = word[:result]
            if self.containsVowel(base):
                word = base
                flag = True
        return (worrd)
    def part_1b(self, word):    
        if re.findall(r'(\w+)(bl|at|iz\b)',word):
            word += 'e'
        elif self.doubleCons(word) and not re.findall(r'(\w+)s\b',word) and not re.findall(r'(\w+)z\b',word) and not re.findall(r'(\w+)l\b',word):
            word = word[:-1]
        elif self.getM(word) == 1 and self.cvc(word):
            word += 'e'
        return (word)
    
    def step1c(self, word):
        
        if word.endswith('y'):
            result = word.rfind('y')
            base = word[:result]
            if self.containsVowel(base):
                word = base
                word += 'i'
        return word

    def step2(self, word):

        if word.endswith('ational'):
            word = self.replaceM0(word, 'ational', 'ate')
        elif word.endswith('tional'):
            word = self.replaceM0(word, 'tional', 'tion')
        elif word.endswith('enci'):
            word = self.replaceM0(word, 'enci', 'ence')
        elif word.endswith('anci'):
            word = self.replaceM0(word, 'anci', 'ance')
        elif word.endswith('izer'):
            word = self.replaceM0(word, 'izer', 'ize')
        elif word.endswith('abli'):
            word = self.replaceM0(word, 'abli', 'able')
        elif word.endswith('alli'):
            word = self.replaceM0(word, 'alli', 'al')
        elif word.endswith('entli'):
            word = self.replaceM0(word, 'entli', 'ent')
        elif word.endswith('eli'):
            word = self.replaceM0(word, 'eli', 'e')
        elif word.endswith('ousli'):
            word = self.replaceM0(word, 'ousli', 'ous')
        elif word.endswith('ization'):
            word = self.replaceM0(word, 'ization', 'ize')
        elif word.endswith('ation'):
            word = self.replaceM0(word, 'ation', 'ate')
        elif word.endswith('ator'):
            word = self.replaceM0(word, 'ator', 'ate')
        elif word.endswith('alism'):
            word = self.replaceM0(word, 'alism', 'al')
        elif word.endswith('iveness'):
            word = self.replaceM0(word, 'iveness', 'ive')
        elif word.endswith('fulness'):
            word = self.replaceM0(word, 'fulness', 'ful')
        elif word.endswith('ousness'):
            word = self.replaceM0(word, 'ousness', 'ous')
        elif word.endswith('aliti'):
            word = self.replaceM0(word, 'aliti', 'al')
        elif word.endswith('iviti'):
            word = self.replaceM0(word, 'iviti', 'ive')
        elif word.endswith('biliti'):
            word = self.replaceM0(word, 'biliti', 'ble')
        return word

    def step3(self, word):

        if word.endswith('icate'):
            word = self.replaceM0(word, 'icate', 'ic')
        elif word.endswith('ative'):
            word = self.replaceM0(word, 'ative', '')
        elif word.endswith('alize'):
            word = self.replaceM0(word, 'alize', 'al')
        elif word.endswith('iciti'):
            word = self.replaceM0(word, 'iciti', 'ic')
        elif word.endswith('ful'):
            word = self.replaceM0(word, 'ful', '')
        elif word.endswith('ness'):
            word = self.replaceM0(word, 'ness', '')
        return word

    def step4(self, word):

        if word.endswith('al'):
            word = self.replaceM1(word, 'al', '')
        elif word.endswith('ance'):
            word = self.replaceM1(word, 'ance', '')
        elif word.endswith('ence'):
            word = self.replaceM1(word, 'ence', '')
        elif word.endswith('er'):
            word = self.replaceM1(word, 'er', '')
        elif word.endswith('ic'):
            word = self.replaceM1(word, 'ic', '')
        elif word.endswith('able'):
            word = self.replaceM1(word, 'able', '')
        elif word.endswith('ible'):
            word = self.replaceM1(word, 'ible', '')
        elif word.endswith('ant'):
            word = self.replaceM1(word, 'ant', '')
        elif word.endswith('ement'):
            word = self.replaceM1(word, 'ement', '')
        elif word.endswith('ment'):
            word = self.replaceM1(word, 'ment', '')
        elif word.endswith('ent'):
            word = self.replaceM1(word, 'ent', '')
        elif word.endswith('ou'):
            word = self.replaceM1(word, 'ou', '')
        elif word.endswith('ism'):
            word = self.replaceM1(word, 'ism', '')
        elif word.endswith('ate'):
            word = self.replaceM1(word, 'ate', '')
        elif word.endswith('iti'):
            word = self.replaceM1(word, 'iti', '')
        elif word.endswith('ous'):
            word = self.replaceM1(word, 'ous', '')
        elif word.endswith('ive'):
            word = self.replaceM1(word, 'ive', '')
        elif word.endswith('ize'):
            word = self.replaceM1(word, 'ize', '')
        elif word.endswith('ion'):
            base = self.get_base(word, 'ion')
            if self.getM(base) > 1 and ( re.findall(r'(\w+)s\b',base) or re.findall(r'(\w+)t\b',base)):
                word = self.replaceM1(word, 'ion', '')
        return word



    def step5a(self, word):

        if word.endswith('e'):
            base = self.get_base(word, 'e')
            if self.getM(base) > 1:
                word = base
            elif  self.getM(base) > 0 or self.getM(base) == 1 and not self.cvc(base):
                word = base
        return word

    def step5b(self, word):

        if self.getM(word) > 1 and self.doubleCons(word) and self.endsWith('l'):
            word = word[:-1]
        return word


p = PorterStemmer()
for i in words:
    if len(i) > 2:
        word = p.step1a(i)
        word = p.step1b(word)
        word = p.step1c(word)
        word = p.step2(word)
        word = p.step3(word)
        word = p.step4(word)
        word = p.step5a(word)
        word = p.step5b(word)
        print(word,end=" ")

After World War the British greatli reduc the us the full stop and other punctuat point after abbrevi least semi formal writing whil the American mor readili kept such us until mor recent and still maintain mor than Briton The classic exampl consid their American counterpart quit curiou wa the mainten the intern comma British organis secret agent call the Special Oper Execut S.O. which not found histori written after about 1960 But befor that mani Briton wer mor crupul maintaining the French form French the period onli follow abbrevi the last letter the abbrevi not the last letter it anteced the abbrevi for monsieur whil Mme that for madam Lik mani other cross channel linguist acquiit mani Briton readili took thi and follow thi rul themelv whil the American took simpler rul and appli rigor Over the year howev the lack convent som style guid ha mad difficult determin which two word abbrevi should abbrevi with period and which should not The U.S. media tend us period two word abbrevi lik