# Notebook for experimenting with text processing

In [1]:
# The core package lives one directory above this notebook, so the parent of
# the first sys.path entry is appended to make it importable.
import os
import sys

parent_dir = sys.path[0] + '/..'
sys.path.append(parent_dir)

# Show the working directory the notebook is running from.
print(os.getcwd())

/Users/i333584/Projects/ds-experiments/sentiment_analysis_IMDB_reviews/notebooks


In [4]:
from typing import List, Dict

from IPython.display import Image

%load_ext autoreload
%autoreload 2
# Here add packages you develop

In [5]:
# Sample HTML input for the experiments below: three <p> paragraphs of
# placeholder text, separated by blank lines.
SAMPLE_TEXT = """
<p style='clear:both;'>Loris gypsum dolour sits is meat, dictum divides sentient per no sole. Requiem effendi and vies, sequel rill animal ea has, ad prod option integer ea. Use it mollies nostrum, per en appropriate negligent. Ala emus no duo, obi verger debus an. Ea elite ague disco bequeath eons. Gracie nominal set id xiv.</p>

<p style='clear:both;'>Pro id bonbon accustoms. Stet probates in duo. Set ponder um expedients cu, veil ex quid dictum momentum. Gracie nominal set id xiv. Eel cues linguist efficient ea, veil sale disciple at. Es drams vituperate it, amateur lucid lids ex mesh.</p>

<p style='clear:both;'>Brusque croquet pro ea. Deter ornate.</p>
"""

## Text processing using Regex and BeautifulSoup

* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)


In [9]:
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

# Raw string literals keep the regex escapes (\s, \[, \], \-, \/) intact
# without relying on Python passing unknown escape sequences through, which
# emits DeprecationWarning/SyntaxWarning on current interpreters. The compiled
# patterns are character-for-character the same as before.

# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Doubled <br /> tags, hyphens and slashes that are replaced by a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def clear_text(text):
    """Strip HTML markup and punctuation from ``text`` and lower-case it.

    Steps, in order:
      1. Parse ``text`` as HTML and turn every ``<br>`` tag into a space.
      2. Extract the plain text and replace newlines with spaces.
      3. Lower-case the text and delete the characters matched by
         ``REPLACE_NO_SPACE``.
      4. Replace everything matched by ``REPLACE_WITH_SPACE`` with a space.

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        The cleaned, lower-cased string. Runs of spaces are not collapsed.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(text, "html.parser")
    # get_text() drops tags without leaving a separator, so "a<br /><br />b"
    # would come out as "ab" and the <br> alternative of REPLACE_WITH_SPACE
    # could never match. Swap line breaks for spaces while they still exist.
    for line_break in soup.find_all("br"):
        line_break.replace_with(" ")
    text = soup.get_text()
    text = text.replace('\n', ' ')
    text = REPLACE_NO_SPACE.sub("", text.lower())
    text = REPLACE_WITH_SPACE.sub(" ", text)
    return text

def split_text(text):
    """Break ``text`` into tokens on runs of whitespace.

    Args:
        text: String to tokenise.

    Returns:
        List of the whitespace-separated pieces; empty for blank input.
    """
    tokens = text.split()
    return tokens

# Show the regex/BeautifulSoup-cleaned sample.
# To inspect individual tokens instead: split_text(clear_text(SAMPLE_TEXT))
cleaned_sample = clear_text(SAMPLE_TEXT)
print(cleaned_sample)

loris gypsum dolour sits is meat dictum divides sentient per no sole requiem effendi and vies sequel rill animal ea has ad prod option integer ea use it mollies nostrum per en appropriate negligent ala emus no duo obi verger debus an ea elite ague disco bequeath eons gracie nominal set id xiv pro id bonbon accustoms stet probates in duo set ponder um expedients cu veil ex quid dictum momentum gracie nominal set id xiv eel cues linguist efficient ea veil sale disciple at es drams vituperate it amateur lucid lids ex mesh brusque croquet pro ea deter ornate 


## Text processing using NLTK

In [10]:
import nltk
# nltk.download()  # Uncomment only if you do not have the NLTK data downloaded already
# Check that the NLTK data is available by loading the Brown corpus
from nltk.corpus import brown
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [11]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [12]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

# Shared NLTK helpers, created once so the functions below can reuse them.
stop_words = set(stopwords.words("english"))  # set for fast membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Tokenise ``text`` into lemmatised words with stop words removed.

    Steps, in order:
      1. Parse ``text`` as HTML, turn ``<br>`` tags into spaces and extract
         the plain text.
      2. Tokenise with ``nltk.word_tokenize``.
      3. Lemmatise every token as a verb using the module-level
         ``lemmatizer``.
      4. Drop lemmas that appear in the module-level ``stop_words`` set.

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        List of the remaining lemmas, in their original order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(text, "html.parser")
    # get_text() leaves no separator where a tag was, so words on either side
    # of a <br> would be fused into one token.
    for line_break in soup.find_all("br"):
        line_break.replace_with(" ")
    tokens = nltk.word_tokenize(soup.get_text())
    lemmas = (lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens)
    # Compare lower-cased so capitalised stop words ("The", "And") are also
    # removed when the input was not lower-cased beforehand
    # (assumes the stop-word list is lower-case — TODO confirm).
    return [lemma for lemma in lemmas if lemma.lower() not in stop_words]

# Run the NLTK pipeline on the regex-cleaned sample and show the tokens.
processed_tokens = process_text(clear_text(SAMPLE_TEXT))
print(processed_tokens)

['loris', 'gypsum', 'dolour', 'sit', 'meat', 'dictum', 'divide', 'sentient', 'per', 'sole', 'requiem', 'effendi', 'vie', 'sequel', 'rill', 'animal', 'ea', 'ad', 'prod', 'option', 'integer', 'ea', 'use', 'mollies', 'nostrum', 'per', 'en', 'appropriate', 'negligent', 'ala', 'emus', 'duo', 'obi', 'verger', 'debus', 'ea', 'elite', 'ague', 'disco', 'bequeath', 'eons', 'gracie', 'nominal', 'set', 'id', 'xiv', 'pro', 'id', 'bonbon', 'accustom', 'stet', 'probate', 'duo', 'set', 'ponder', 'um', 'expedients', 'cu', 'veil', 'ex', 'quid', 'dictum', 'momentum', 'gracie', 'nominal', 'set', 'id', 'xiv', 'eel', 'cue', 'linguist', 'efficient', 'ea', 'veil', 'sale', 'disciple', 'es', 'drams', 'vituperate', 'amateur', 'lucid', 'lids', 'ex', 'mesh', 'brusque', 'croquet', 'pro', 'ea', 'deter', 'ornate']


## [Sklearn text feature extraction](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text)

* [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer)
  Convert a collection of text documents to a matrix of token counts

```This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.```

  

In [13]:
# Vocabulary learned from the regex-cleaned sample.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([clear_text(SAMPLE_TEXT)])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and only fall back on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['accustoms', 'ad', 'ague', 'ala', 'amateur', 'an', 'and', 'animal', 'appropriate', 'at', 'bequeath', 'bonbon', 'brusque', 'croquet', 'cu', 'cues', 'debus', 'deter', 'dictum', 'disciple', 'disco', 'divides', 'dolour', 'drams', 'duo', 'ea', 'eel', 'effendi', 'efficient', 'elite', 'emus', 'en', 'eons', 'es', 'ex', 'expedients', 'gracie', 'gypsum', 'has', 'id', 'in', 'integer', 'is', 'it', 'lids', 'linguist', 'loris', 'lucid', 'meat', 'mesh', 'mollies', 'momentum', 'negligent', 'no', 'nominal', 'nostrum', 'obi', 'option', 'ornate', 'per', 'ponder', 'pro', 'probates', 'prod', 'quid', 'requiem', 'rill', 'sale', 'sentient', 'sequel', 'set', 'sits', 'sole', 'stet', 'um', 'use', 'veil', 'verger', 'vies', 'vituperate', 'xiv']


In [14]:
# Vocabulary learned from the sample after regex cleaning plus the NLTK
# pipeline (process_text), re-joined into a single document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([' '.join(process_text(clear_text(SAMPLE_TEXT)))])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and only fall back on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['accustom', 'ad', 'ague', 'ala', 'amateur', 'animal', 'appropriate', 'bequeath', 'bonbon', 'brusque', 'croquet', 'cu', 'cue', 'debus', 'deter', 'dictum', 'disciple', 'disco', 'divide', 'dolour', 'drams', 'duo', 'ea', 'eel', 'effendi', 'efficient', 'elite', 'emus', 'en', 'eons', 'es', 'ex', 'expedients', 'gracie', 'gypsum', 'id', 'integer', 'lids', 'linguist', 'loris', 'lucid', 'meat', 'mesh', 'mollies', 'momentum', 'negligent', 'nominal', 'nostrum', 'obi', 'option', 'ornate', 'per', 'ponder', 'pro', 'probate', 'prod', 'quid', 'requiem', 'rill', 'sale', 'sentient', 'sequel', 'set', 'sit', 'sole', 'stet', 'um', 'use', 'veil', 'verger', 'vie', 'vituperate', 'xiv']


## Conclusion

The following text-cleaning function was moved to the lib folder:

In [15]:
import re
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

# Raw string literals keep the regex escapes (\s, \[, \], \-, \/) intact
# without relying on Python passing unknown escape sequences through, which
# emits DeprecationWarning/SyntaxWarning on current interpreters. The compiled
# patterns are character-for-character the same as before.

# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Doubled <br /> tags, hyphens and slashes that are replaced by a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
# Module-level NLTK helpers, created once at import time.
ENGLISH_STOP_WORDS = set(stopwords.words("english"))  # set for fast membership tests
STEMMER = PorterStemmer()
LEMMATIZER = WordNetLemmatizer()

def clean_text_en(text):
    """Clean English text: strip HTML and punctuation, lemmatise, drop stop words.

    Steps, in order:
      1. Parse ``text`` as HTML, turn ``<br>`` tags into spaces and extract
         the plain text; newlines become spaces.
      2. Lower-case the text and delete the characters matched by
         ``REPLACE_NO_SPACE``.
      3. Replace everything matched by ``REPLACE_WITH_SPACE`` with a space.
      4. Tokenise with ``nltk.word_tokenize`` and lemmatise every token as a
         verb with ``LEMMATIZER``.
      5. Drop lemmas found in ``ENGLISH_STOP_WORDS``.

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(text, "html.parser")
    # get_text() leaves no separator where a tag was, so words on either side
    # of a <br> would be fused; replace line breaks with spaces first.
    for line_break in soup.find_all("br"):
        line_break.replace_with(" ")
    text = soup.get_text().replace('\n', ' ')
    text = REPLACE_NO_SPACE.sub("", text.lower())
    text = REPLACE_WITH_SPACE.sub(" ", text)
    # Use the constants defined next to this function (LEMMATIZER,
    # ENGLISH_STOP_WORDS) rather than the lower-case globals of an earlier
    # cell, so the function keeps working once it is moved into a module.
    lemmas = (
        LEMMATIZER.lemmatize(token, wordnet.VERB)
        for token in nltk.word_tokenize(text)
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in ENGLISH_STOP_WORDS)

In [16]:
# Run the consolidated cleaner on the sample; as the last expression of the
# cell, the returned string is displayed as the cell output.
clean_text_en(SAMPLE_TEXT)

'loris gypsum dolour sit meat dictum divide sentient per sole requiem effendi vie sequel rill animal ea ad prod option integer ea use mollies nostrum per en appropriate negligent ala emus duo obi verger debus ea elite ague disco bequeath eons gracie nominal set id xiv pro id bonbon accustom stet probate duo set ponder um expedients cu veil ex quid dictum momentum gracie nominal set id xiv eel cue linguist efficient ea veil sale disciple es drams vituperate amateur lucid lids ex mesh brusque croquet pro ea deter ornate'