# Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import string
import re
import nltk

# Loading Data

In [4]:
# Read the movie reviews CSV into a DataFrame.
dataset = pd.read_csv('movie_reviews.csv')

# Pull the review text and the sentiment column out as NumPy arrays.
reviews, sentiments = (
    np.array(dataset[column]) for column in ('review', 'sentiment')
)

# Cleaning Text

Removing HTML Tags

In [6]:
from bs4 import BeautifulSoup

In [27]:
def strip_html_tag(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

Removing Accented Characters

In [8]:
import unicodedata

In [13]:
def strip_accents(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalized so that accented letters are split into a
    base letter plus combining marks, then encoded to ASCII with errors
    ignored (dropping every non-ASCII code point) and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

Expanding Contractions

In [15]:
# Lookup table mapping English contractions to their expanded forms.
# It is the default ``contraction_mapping`` of expand_contractions.
# First-person entries are listed in both capitalized ("I'm") and
# lowercase ("i'm") form.
CONTRACTION_MAP = {"ain't": "is not",
                   "aren't": "are not",
                   "can't": "cannot",
                   "can't've": "cannot have",
                   "'cause": "because",
                   "could've": "could have",
                   "couldn't": "could not",
                   "couldn't've": "could not have",
                   "didn't": "did not",
                   "doesn't": "does not",
                   "don't": "do not",
                   "hadn't": "had not",
                   "hadn't've": "had not have",
                   "hasn't": "has not",
                   "haven't": "have not",
                   "he'd": "he would",
                   "he'd've": "he would have",
                   "he'll": "he will",
                   "he'll've": "he will have",
                   "he's": "he is",
                   "how'd": "how did",
                   "how'd'y": "how do you",
                   "how'll": "how will",
                   "how's": "how is",
                   "I'd": "I would",
                   "I'd've": "I would have",
                   "I'll": "I will",
                   "I'll've": "I will have",
                   "I'm": "I am",
                   "I've": "I have",
                   "i'd": "i would",
                   "i'd've": "i would have",
                   "i'll": "i will",
                   "i'll've": "i will have",
                   "i'm": "i am",
                   "i've": "i have",
                   "isn't": "is not",
                   "it'd": "it would",
                   "it'd've": "it would have",
                   "it'll": "it will",
                   "it'll've": "it will have",
                   "it's": "it is",
                   "let's": "let us",
                   "ma'am": "madam",
                   "mayn't": "may not",
                   "might've": "might have",
                   "mightn't": "might not",
                   "mightn't've": "might not have",
                   "must've": "must have",
                   "mustn't": "must not",
                   "mustn't've": "must not have",
                   "needn't": "need not",
                   "needn't've": "need not have",
                   "o'clock": "of the clock",
                   "oughtn't": "ought not",
                   "oughtn't've": "ought not have",
                   "shan't": "shall not",
                   "sha'n't": "shall not",
                   "shan't've": "shall not have",
                   "she'd": "she would",
                   "she'd've": "she would have",
                   "she'll": "she will",
                   "she'll've": "she will have",
                   "she's": "she is",
                   "should've": "should have",
                   "shouldn't": "should not",
                   "shouldn't've": "should not have",
                   "so've": "so have",
                   "so's": "so as",
                   "that'd": "that would",
                   "that'd've": "that would have",
                   "that's": "that is",
                   "there'd": "there would",
                   "there'd've": "there would have",
                   "there's": "there is",
                   "they'd": "they would",
                   "they'd've": "they would have",
                   "they'll": "they will",
                   "they'll've": "they will have",
                   "they're": "they are",
                   "they've": "they have","to've": "to have",
                   "wasn't": "was not",
                   "we'd": "we would",
                   "we'd've": "we would have",
                   "we'll": "we will",
                   "we'll've": "we will have",
                   "we're": "we are",
                   "we've": "we have",
                   "weren't": "were not",
                   "what'll": "what will",
                   "what'll've": "what will have",
                   "what're": "what are",
                   "what's": "what is",
                   "what've": "what have",
                   "when's": "when is",
                   "when've": "when have",
                   "where'd": "where did",
                   "where's": "where is",
                   "where've": "where have",
                   "who'll": "who will",
                   "who'll've": "who will have",
                   "who's": "who is",
                   "who've": "who have",
                   "why's": "why is",
                   "why've": "why have",
                   "will've": "will have",
                   "won't": "will not",
                   "won't've": "will not have",
                   "would've": "would have",
                   "wouldn't": "would not",
                   "wouldn't've": "would not have",
                   "y'all": "you all",
                   "y'all'd": "you all would",
                   "y'all'd've": "you all would have",
                   "y'all're": "you all are",
                   "y'all've": "you all have",
                   "you'd": "you would",
                   "you'd've": "you would have",
                   "you'll": "you will",
                   "you'll've": "you will have",
                   "you're": "you are",
                   "you've": "you have"}

In [16]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. The expansion is taken from
    ``contraction_mapping`` (exact key first, then a lowercase lookup) and its
    first character is upper-cased when the first letter of the matched
    contraction is upper-case, so ``"Can't"`` becomes ``"Cannot"``.

    Any apostrophes still present after expansion are removed.

    Parameters
    ----------
    text : str
        The text to process.
    contraction_mapping : dict, optional
        Maps contraction -> expansion. Defaults to ``CONTRACTION_MAP``.

    Returns
    -------
    str
        ``text`` with contractions expanded and remaining apostrophes removed.
    """
    # An empty mapping would compile to the pattern "()", which matches the
    # empty string everywhere; skip substitution entirely in that case.
    if contraction_mapping:
        # Fallback table for matches whose casing differs from the dict keys.
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        # Regex alternation takes the first alternative that matches, so the
        # longest keys must come first; otherwise "can't" wins over
        # "can't've" and the trailing "'ve" is left behind.
        ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text as it was.
                return match
            # Carry over only the *case* of the first letter. Splicing the
            # matched character itself onto the expansion corrupts entries
            # whose first characters differ ("ain't" -> "is not",
            # "'cause" -> "because").
            first_letter = next((char for char in match if char.isalpha()), '')
            if first_letter.isupper():
                expanded_contraction = (expanded_contraction[0].upper()
                                        + expanded_contraction[1:])
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # Drop any apostrophes that were not part of a known contraction.
    return re.sub("'", "", text)

In [33]:
# Quick check: "It's" should expand to "It is"; the leading whitespace is left untouched.
expand_contractions("  It's an amazing language which can be used for Scripting")

'  It is an amazing language which can be used for Scripting'

Removing Special Characters

In [21]:
def strip_special_characters(text):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        ``text`` with all other characters deleted (not replaced by spaces).
    """
    # The upper-case range must be A-Z: "A-z" spans ASCII 65-122 and therefore
    # also keeps the characters [ \ ] ^ _ and the backtick. A raw string is
    # used so that "\s" reaches the regex engine as written.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

Lemmatizing Text

In [37]:
from nltk.stem import WordNetLemmatizer 
  
# Module-level lemmatizer instance; lemmatize_text calls its lemmatize() method.
lemmatizer = WordNetLemmatizer()

In [26]:
def lemmatize_text(text):
    """Tokenize ``text`` and rebuild it from the lemmatized tokens.

    Each token produced by ``nltk.word_tokenize`` is passed through the
    module-level ``lemmatizer`` and the results are joined with single spaces.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    return ' '.join(lemmas)

Removing Stopwords

In [24]:
# Start from NLTK's English stopword list.
stopword_list = nltk.corpus.stopwords.words('english')
# Take 'no' and 'not' out of the stopword list so they stay in the cleaned text
# (presumably because negations matter for sentiment -- confirm).
# list.remove raises ValueError if the word is not in the list.
stopword_list.remove('no')
stopword_list.remove('not')

In [25]:
def strip_stopwords(text, is_lower_case=False):
    """Remove stopwords from ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and every token found in
    the module-level ``stopword_list`` is dropped.

    Parameters
    ----------
    text : str
        The text to filter.
    is_lower_case : bool, optional
        When True, tokens are compared to the stopwords as they are. When
        False (the default), each token is lower-cased for the comparison
        only; surviving tokens keep their original casing.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan over the stopword list.
    stopwords = set(stopword_list)
    tokens = [token.strip() for token in nltk.word_tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    return ' '.join(filtered_tokens)

Cleaned Text

In [34]:
def clean_text(text, strip_html=True, expand_contraction=True,
               accent_remove=True, text_lower_case=True, text_lemmatize=True,
               special_char_remove=True, stopword_remove=True):
    """Run the text-cleaning pipeline over a collection of documents.

    The steps are applied to each document in this order; steps marked with a
    flag are skipped when the flag is False:

    1. HTML tag stripping (``strip_html``)
    2. accented-character removal (``accent_remove``)
    3. contraction expansion (``expand_contraction``)
    4. lower-casing (``text_lower_case``)
    5. runs of line breaks replaced by one space (always)
    6. the characters ``{ } ( ) . ! -`` padded with spaces (always)
    7. lemmatization (``text_lemmatize``)
    8. special-character removal (``special_char_remove``)
    9. runs of spaces collapsed to one space (always)
    10. stopword removal (``stopword_remove``)

    Parameters
    ----------
    text : iterable of str, or str
        The documents to clean. A single string is treated as one document.
    strip_html, expand_contraction, accent_remove, text_lower_case,
    text_lemmatize, special_char_remove, stopword_remove : bool, optional
        Switches for the individual steps; all default to True.

    Returns
    -------
    list of str
        The cleaned documents, in input order.
    """
    # A bare string would otherwise be iterated one character at a time.
    if isinstance(text, str):
        text = [text]

    # Compiled once, outside the per-document loop.
    # "[\r\n]+" rather than "[\r|\n|\r\n]+": inside a character class "|" is a
    # literal, so the latter also replaced pipe characters with spaces.
    line_break_pattern = re.compile(r'[\r\n]+')
    # The hyphen is placed last so it is a literal; written as "(-)" it forms
    # a range from "(" to ")" and the hyphen itself is never matched.
    special_char_pattern = re.compile(r'([{.()!}-])')

    processed_text = []
    for doc in text:

        # HTML tag stripping
        if strip_html:
            doc = strip_html_tag(doc)

        # remove accented characters
        if accent_remove:
            doc = strip_accents(doc)

        # expand contractions
        if expand_contraction:
            doc = expand_contractions(doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # replace runs of line breaks with a single space
        doc = line_break_pattern.sub(' ', doc)

        # insert spaces around special characters to isolate them
        doc = special_char_pattern.sub(" \\1 ", doc)

        # lemmatize text
        if text_lemmatize:
            doc = lemmatize_text(doc)

        # remove special characters
        if special_char_remove:
            doc = strip_special_characters(doc)

        # collapse runs of spaces
        doc = re.sub(' +', ' ', doc)

        # remove stopwords
        if stopword_remove:
            doc = strip_stopwords(doc, is_lower_case=text_lower_case)

        processed_text.append(doc)

    return processed_text


In [29]:
# Sample input containing HTML tags, accented characters, contractions,
# mixed \r\n / \n line breaks and punctuation.
document = """<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n 
              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n
              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
              got a headstart!</p>
           """
document

"<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n              got a headstart!</p>\n           "

In [35]:
# Run the pipeline with lemmatization, stopword removal and lower-casing switched off.
clean_text([document], text_lemmatize=False, stopword_remove=False, text_lower_case=False)

['Hello Hello can you hear me I just heard about Python It is an amazing language which can be used for Scripting Web development Information Retrieval Natural Language Processing Machine Learning Artificial Intelligence What are you waiting for Go and get started He is learning she is learning they have already got a headstart ']

In [38]:
# Run the pipeline with every step left at its default (enabled).
clean_text([document])

['hello hello hear heard python amazing language used scripting web development information retrieval natural language processing machine learning artificial intelligence waiting go get started learning learning already got headstart']