In [59]:
import nltk
import re
import string
from bs4 import BeautifulSoup

In [102]:
class TextNormalizer:
    """
    Text Normalizer class normalizes text
    """
    def __init__(self ):
        self.contractions={
                            "ain't": "is not",
                            "aren't": "are not",
                            "can't": "cannot",
                            "can't've": "cannot have",
                            "'cause": "because",
                            "could've": "could have",
                            "couldn't": "could not",
                            "couldn't've": "could not have",
                            "didn't": "did not",
                            "doesn't": "does not",
                            "don't": "do not",
                            "hadn't": "had not",
                            "hadn't've": "had not have",
                            "hasn't": "has not",
                            "haven't": "have not",
                            "he'd": "he would",
                            "he'd've": "he would have",
                            "he'll": "he will",
                            "he'll've": "he he will have",
                            "he's": "he is",
                            "how'd": "how did",
                            "how'd'y": "how do you",
                            "how'll": "how will",
                            "how's": "how is",
                            "I'd": "I would",
                            "I'd've": "I would have",
                            "I'll": "I will",
                            "I'll've": "I will have",
                            "I'm": "I am",
                            "I've": "I have",
                            "i'd": "i would",
                            "i'd've": "i would have",
                            "i'll": "i will",
                            "i'll've": "i will have",
                            "i'm": "i am",
                            "i've": "i have",
                            "isn't": "is not",
                            "it'd": "it would",
                            "it'd've": "it would have",
                            "it'll": "it will",
                            "it'll've": "it will have",
                            "it's": "it is",
                            "let's": "let us",
                            "ma'am": "madam",
                            "mayn't": "may not",
                            "might've": "might have",
                            "mightn't": "might not",
                            "mightn't've": "might not have",
                            "must've": "must have",
                            "mustn't": "must not",
                            "mustn't've": "must not have",
                            "needn't": "need not",
                            "needn't've": "need not have",
                            "o'clock": "of the clock",
                            "oughtn't": "ought not",
                            "oughtn't've": "ought not have",
                            "shan't": "shall not",
                            "sha'n't": "shall not",
                            "shan't've": "shall not have",
                            "she'd": "she would",
                            "she'd've": "she would have",
                            "she'll": "she will",
                            "she'll've": "she will have",
                            "she's": "she is",
                            "should've": "should have",
                            "shouldn't": "should not",
                            "shouldn't've": "should not have",
                            "so've": "so have",
                            "so's": "so as",
                            "that'd": "that would",
                            "that'd've": "that would have",
                            "that's": "that is",
                            "there'd": "there would",
                            "there'd've": "there would have",
                            "there's": "there is",
                            "they'd": "they would",
                            "they'd've": "they would have",
                            "they'll": "they will",
                            "they'll've": "they will have",
                            "they're": "they are",
                            "they've": "they have",
                            "to've": "to have",
                            "wasn't": "was not",
                            "we'd": "we would",
                            "we'd've": "we would have",
                            "we'll": "we will",
                            "we'll've": "we will have",
                            "we're": "we are",
                            "we've": "we have",
                            "weren't": "were not",
                            "what'll": "what will",
                            "what'll've": "what will have",
                            "what're": "what are",
                            "what's": "what is",
                            "what've": "what have",
                            "when's": "when is",
                            "when've": "when have",
                            "where'd": "where did",
                            "where's": "where is",
                            "where've": "where have",
                            "who'll": "who will",
                            "who'll've": "who will have",
                            "who's": "who is",
                            "who've": "who have",
                            "why's": "why is",
                            "why've": "why have",
                            "will've": "will have",
                            "won't": "will not",
                            "won't've": "will not have",
                            "would've": "would have",
                            "wouldn't": "would not",
                            "wouldn't've": "would not have",
                            "y'all": "you all",
                            "y'all'd": "you all would",
                            "y'all'd've": "you all would have",
                            "y'all're": "you all are",
                            "y'all've": "you all have",
                            "you'd": "you would",
                            "you'd've": "you would have",
                            "you'll": "you will",
                            "you'll've": "you will have",
                            "you're": "you are",
                            "you've": "you have"
                        }
        self.stopwords=nltk.corpus.stopwords.words("english")
    def normalize(self, text, clean=True, rm_stopwords=True, rm_special_chars=True, 
                  expand_conts=True, caseConvert=False, lemmatize=True, stem=True):
        if clean:
            text=self.cleanText(text)
        if expand_conts:
            text=self.expandContractions(text)
        if lemmatize:
            text=self.lemmatizeText(text)
        if stem:
            text=self.stemText(text)
        if rm_stopwords:
            text=self.removeStopwords(text)
        if rm_special_chars:
            text=self.removeSpecialChars(text)
        if caseConvert:
            text=self.caseConvert(text, True)
        else:
            text=self.caseConvert(text, False)
        return text
        
    def caseConvert(self, text, upper=True):
        if upper:
            return text.upper()
        return text.lower()
    
    def cleanText(self, page):
        soup = BeautifulSoup(page)
        fetched_text = ' '.join(map(lambda p:p.text,soup.find_all('p')))
        return fetched_text
    
    def tokenizeText(self,text):
        words= nltk.word_tokenize(text)
        tokens= [word.strip() for word in words]
        return tokens 
    
    def removeStopwords(self, text):
        tokens= self.tokenizeText(text)
        filter_tokens=[token for token in tokens if token not in self.stopwords]
        text=' '.join(filter_tokens)
        return text
    
    def removeSpecialChars(self,text):
        tokens=self.tokenizeText(text)
        pattern= re.compile('[{}]'.format(re.escape(string.punctuation)))
        filtered_tokens= filter(None, [pattern.sub('',token) for token in tokens])
        filtered_text= ' '.join(filtered_tokens)
        return filtered_text
    
    def expandContractions(self, text):
        expanded_sentence=""
        words=text.split()
        for i in range(len(words)):
            if words[i] in self.contractions.keys():
                words[i]=self.contractions[words[i]]
        for word in words:
            if word!=words[len(words)-1]:
                expanded_sentence+=word+' '
            else:
                expanded_sentence+=word
        return expanded_sentence
    
    def lemmatizeText(self, text):
        return text
    
    def stemText(self, text):
        return text


In [109]:
tn= TextNormalizer()
help(TextNormalizer)
tn.normalize("<h1><p>My name is Ali Abbas</p></h1>", clean=True, caseConvert=False, rm_stopwords=False)

Help on class TextNormalizer in module __main__:

class TextNormalizer(builtins.object)
 |  Text Normalizer class normalizes text
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  caseConvert(self, text, upper=True)
 |  
 |  cleanText(self, page)
 |  
 |  expandContractions(self, text)
 |  
 |  lemmatizeText(self, text)
 |  
 |  normalize(self, text, clean=True, rm_stopwords=True, rm_special_chars=True, expand_conts=True, caseConvert=False, lemmatize=True, stem=True)
 |  
 |  removeSpecialChars(self, text)
 |  
 |  removeStopwords(self, text)
 |  
 |  stemText(self, text)
 |  
 |  tokenizeText(self, text)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



'my name is ali abbas'