In [41]:
import numpy as np
import pandas as pd
import spacy
import nltk
import re
from bs4 import BeautifulSoup

In [42]:
# Load the IMDB reviews CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your local copy.
data = pd.read_csv(filepath_or_buffer=r'D:\ML\Internal DL\NLP\IMDB Dataset.csv')

In [43]:
# Preview the first five rows of the loaded reviews.
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [51]:
class TextPreprocess():
    """
    Text cleaning and spaCy-based tokenization for raw text.

    clean_raw_text normalizes the raw string (HTML, dots, quotes, hyphens/underscores,
    whitespace, case). get_token_list runs the spaCy pipeline on a string and keeps only
    tokens that are not part of a named entity, not punctuation and not stop words.
    get_preprocessed_tokens chains the two steps.
    """

    def __init__(self):
        """Load the spaCy pipeline, add entity merging and adjust the stop words."""
        ## loading nlp object of spacy (tagger and parser are disabled)
        self.nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser"])

        ## adding the built-in "merge_entities" component to the nlp object.
        ## spaCy v3 only accepts the registered component name in add_pipe (and returns
        ## the component), while spaCy v2 expects the object built by create_pipe.
        if int(spacy.__version__.split(".")[0]) >= 3:
            self.merge_entities_ = self.nlp.add_pipe("merge_entities")
        else:
            self.merge_entities_ = self.nlp.create_pipe("merge_entities")
            self.nlp.add_pipe(self.merge_entities_)

        ## removing not, neither, never from stopwords so that negations survive tokenization.
        ## you can check all the spaCy stopwords from https://github.com/explosion/spaCy/blob/master/spacy/lang/en/stop_words.py
        for negation in ("not", "neither", "never"):
            self.nlp.vocab[negation].is_stop = False

    def clean_raw_text(self, text, remove_html=True, clean_dots=True, clean_quotes=True,
               clean_whitespace=True, convert_lowercase=True):
        """
        Clean the text data.
        text: input raw text data
        remove_html: if True, it removes the HTML tags and gives the only text data
                     (text pieces are joined with a single space).
        clean_dots: if True, replaces the ellipsis character with three plain dots
        clean_quotes: if True, changes typographic single quotes to ' and typographic
                      double quotes (as well as '' and ,,) to the plain double quote.
                      It also replaces every hyphen and underscore with a space.
        clean_whitespace: if True, collapses runs of whitespace into one space and strips both ends
        convert_lowercase: if True, converts text to lower case

        returns: the cleaned string
        """
        if remove_html:
            # remove HTML
            ## separator=' ' joins the text pieces with a space. otherwise, we get unwanted output like
            ## "make these characters come alive.<br /><br />We wish" --> make these characters come alive.We wish (no space between sentences)
            text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

        # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
        if clean_dots:
            text = text.replace('…', '...')
        if clean_quotes:
            text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
            # the closing curly quote (”) and its reversed form (‟) are included so that
            # a “quoted” span is normalized on both sides, not only on the opening side.
            text = re.sub(r'[„“”‟]|(\'\')|(,,)', '"', text)
            # hyphens and underscores become spaces, e.g. "well-known" --> "well known"
            text = re.sub(r'[-_]', " ", text)
        if clean_whitespace:
            text = re.sub(r'\s+', ' ', text).strip()
        if convert_lowercase:
            text = text.lower()
        return text

    def get_token_list(self, text, get_spacy_tokens=False):
        '''
        gives the list of spacy tokens/word strings
        text: cleaned text
        get_spacy_tokens: if true, it returns the list of spacy token objects
                          else, returns each kept token's norm_ string

        Tokens that carry an entity type, punctuation tokens and stop words are dropped.
        '''
        ## nlp object
        doc = self.nlp(text)
        kept_tokens = [
            token for token in doc
            if token.ent_type_ == "" and not (token.is_punct or token.is_stop)
        ]
        if get_spacy_tokens:
            return kept_tokens
        return [token.norm_ for token in kept_tokens]

    def get_preprocessed_tokens(self, text, remove_html=True, clean_dots=True, clean_quotes=True,
               clean_whitespace=True, convert_lowercase=True, get_tokens=True, get_spacy_tokens=False):
        """
        Clean the raw text and, by default, tokenize it.
        text: input raw text data
        remove_html, clean_dots, clean_quotes, clean_whitespace, convert_lowercase:
                          cleaning switches, passed unchanged to clean_raw_text
        get_tokens: if true, returns output after tokenization else after cleaning only.
        get_spacy_tokens: if true, it returns the list of spacy token objects
                          else, returns tokens in string format

        returns: a list of tokens when get_tokens is true, otherwise the cleaned string
        """
        text = self.clean_raw_text(
            text,
            remove_html=remove_html,
            clean_dots=clean_dots,
            clean_quotes=clean_quotes,
            clean_whitespace=clean_whitespace,
            convert_lowercase=convert_lowercase,
        )
        if get_tokens:
            text = self.get_token_list(text, get_spacy_tokens)
        return text

In [52]:
# Build the preprocessor once and reuse it: the constructor loads the
# "en_core_web_lg" spaCy model.
preprocessor = TextPreprocess()

In [67]:
### demo: full pipeline on one review, tokens returned in string format
print("RAW Text:", end="\n\n")
print(data["review"][4])
print("-" * 100)
print("Preprocess List of Tokens(string format)", end="\n\n")
out = preprocessor.get_preprocessed_tokens(data["review"][4])
print(out, end="\n\n")
print("Type of each object in above list")
print(type(out[0]))

RAW Text:

Petter Mattei's "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler's play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the 

In [68]:
### demo: full pipeline on one review, tokens returned as spaCy token objects
print("RAW Text:", end="\n\n")
print(data["review"][4])
print("-" * 100)
print("Preprocess List of Tokens(spacy token format)", end="\n\n")
out = preprocessor.get_preprocessed_tokens(data["review"][4], get_spacy_tokens=True)
print(out, end="\n\n")
print("Type of each object in above list")
print(type(out[0]))

RAW Text:

Petter Mattei's "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler's play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the 

##### Each element in the list above is a spaCy `Token` object, so all of its attributes are available. See https://spacy.io/api/token

In [71]:
### demo: cleaning only (get_tokens=False), so the result is the cleaned string
print("RAW Text:", end="\n\n")
print(data["review"][4])
print("-" * 100)
print("cleaned text", end="\n\n")
out = preprocessor.get_preprocessed_tokens(data["review"][4], get_tokens=False)
print(out)

RAW Text:

Petter Mattei's "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler's play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the 