In [20]:
!pip install contractions
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer
import contractions 	


class Text_preprocessor:
 """
    A class to preprocess text for NLP Application.

    ...

    Attributes
    ----------
    *

    Methods
    -------
    expand_contraction(text=""):
        returns the expanded text.

    remove_special_characters(text="")
        returns text with removed emailaddress, special characters and numbers

    tokenize(text=")
        returns list of words from text
    
    removal_stop_words(token=[],language='english')
        stop words are derived from  nltk.corpus
        returns the list with removed stopwords for english language
    
    stem_or_lem(token=[],method="stemm")
        return the list after lemmitization or stemmization depending upon 
        method argument
    
    preprocessed_text(text="")
        returns list of words after performing
        contaraction, removal of special characters, tokenization, removal of
        stop word, stemmization and lemmitization

      

    """
 def expand_contraction(self, text:str)->str:
    '''
    Expands the words in text with contractions module.

        Parameters
        ----------
        text : str,
            text to be expanded

        Returns
        -------
        text with expanded words 
    '''
    # create an empty list
    expanded_words = []    
    for word in text.split():
      # using contractions.fix to expand the shotened words and removes extra spaces
      expanded_words.append(contractions.fix(word))   
    expanded_text = ' '.join(expanded_words)
    return expanded_text


 def remove_special_characters(self, text:str)->str:
      '''
    Removes the email, special character and numbers from text.

        Special character includes ! @ # $ & * () + -.

        Parameters
        ----------
        text : str
            String containing special character

        Returns
        -------
        String without email, special character and numbers
    '''
      # remove email if any
      txt_email = re.compile(r'[A-Za-z0-9]*@[A-Za-z]*\.com')
      cln_txt = txt_email.sub('', text)
      # remove special character and number if any
      result = re.sub('[^A-Za-z]+', ' ', cln_txt)      
      return result

 def tokenize(self,text:str)->list:
   '''
     Tokenize the text  to form list.
        Use nltk.word_tokenize.

        Parameters
        ----------
        text : str, 
        text to  tokenize
            
        Returns
        -------
        list of tokenized words
    '''

   nltk_tokens = nltk.word_tokenize(text)
   return nltk_tokens


 def removal_stop_words(self,tokens:list, language:str='english')->list:
   '''
    Removes the stop words from list.

        Use stopwords from nltk.corpus.

        Parameters
        ----------
        token : list
            words token
        language : str, optional 
        Language of the words (default is english) 

        Returns
        -------
        list of words without stop words
    '''
   stopword_list = nltk.corpus.stopwords.words(language)
   tokens_without_sw = [word for word in tokens if not word in stopword_list]
   return tokens_without_sw

 def stem_or_lem(self, tokens:list,method:str)->list:
   '''
    Perform Stemming or lemmatization.
    If the argument method is 'stemm' then performs stemmization, performs
    lemmitization if 'lemm' and return tokens for mismatched strings
    PorterStemmer  from nltk for stemming
    WordNetLemmatizer from nltk for lemmatization

        Parameters
        ----------
        tokens : list
           list of tokenized words

        Returns
        -------
        return words after Stemming or Lemmatization
    '''
   #instance of PorterStemmer 
   ps = PorterStemmer()
   stemmed=[]
   lemmed=[]
   if method =='stemm':
    for w in tokens:
        rootWord=ps.stem(w)
        stemmed.append(rootWord)
    return stemmed
   elif method =='lemm':
     wordnet_lemmatizer = WordNetLemmatizer()
     for w in tokens:
        lemm = wordnet_lemmatizer.lemmatize(w)
        lemmed.append(lemm)
     return lemmed
   else:
      return tokens

 def preprocessed_text(self,text:str)->list:
    '''
    Perfoms all the operation of text preprocessing.


        Parameters
        ----------
        text : str, 
            string to be preprocessed
        Returns
        -------
        returns list of words after performing
        contaraction, removal of special characters, tokenization, removal of
        stop word, stemmization and lemmitization

    '''
    exp_text=self.expand_contraction(text)
    prune_special=self.remove_special_characters(exp_text)
    tokenize_words=self.tokenize(prune_special)
    remove_stopwords=self.removal_stop_words(tokenize_words,'english')
    stemmed =self.stem_or_lem(remove_stopwords,'stemm')
    lemmed =self.stem_or_lem(stemmed,'lemmed')
    return lemmed

        

text = '''This movie made it into one of my top
 10 most awful movies. Horrible. I don’t care if it makes 1 million, 
 10 M , or 100. There wasn't a continuous minute where there wasn't
  a fight with one monster or another. There was no chance for any 
  character development, they were too busy running from one sword 
  fight to another. I had no emotional attachment ( except to 
the big bad machine ## that wanted to destroy them). If you 
disagree with me, you can send your thoughts to idonotcare@leavemealone.com'''  
# create an object
obj = Text_preprocessor()
# perform text pre_processing
obj.preprocessed_text(text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['thi',
 'movi',
 'made',
 'one',
 'top',
 'aw',
 'movi',
 'horribl',
 'I',
 'care',
 'make',
 'million',
 'M',
 'there',
 'continu',
 'minut',
 'fight',
 'one',
 'monster',
 'anoth',
 'there',
 'chanc',
 'charact',
 'develop',
 'busi',
 'run',
 'one',
 'sword',
 'fight',
 'anoth',
 'I',
 'emot',
 'attach',
 'except',
 'big',
 'bad',
 'machin',
 'want',
 'destroy',
 'If',
 'disagre',
 'send',
 'thought']

In [18]:
text='idonotcare@leavemealone.com'
required_output=re.sub(r'[A-Za-z0-9]*@[A-Za-z]*\.?[A-Za-z0-9]*', "123", text)
required_output=re.sub(r'[A-Za-z0-9]*@[A-Za-z]*\.com', "123", text)
p = re.compile(r'[A-Za-z0-9]*@[A-Za-z]*\.com')
required_output=p.sub('check', text)
print("no_emails:",required_output)

no_emails: check
