## Text wrangling and cleansing

In [2]:
import nltk

import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/prajwalluitel/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/prajwalluitel/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/prajwalluitel/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/prajwalluitel/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/prajwalluitel/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |  

KeyboardInterrupt: 

In [20]:
sample_string = "This is a simple string. Let us consider this to be a paragraph. It's a nice day as the weather is warm. I look forward to tomorrow. Mr. Ralph has called me tomorrow."

from nltk.tokenize import sent_tokenize

tokenized_string = sent_tokenize(sample_string)
print(f"The tokenized string is: \n{tokenized_string}")

The tokenized string is: 
['This is a simple string.', 'Let us consider this to be a paragraph.', "It's a nice day as the weather is warm.", 'I look forward to tomorrow.', 'Mr. Ralph has called me tomorrow.']


### Write a custom solution to do the same job without using the sent_tokenize.

In [29]:
words_with_period = [
    "Mr.",
    "Mrs.",
    "Ms.",
    "Dr.",
    "Prof.",
    "Jr.",
    "Sr.",
    "St.",
    "Co.",
    "Inc.",
    "Ltd.",
    "etc.",
    "i.e.",
    "eg.",
    "am.",
    "pm.",
    "US.",
    "UK.",
    "UN.",
    "EU.",
    "AD.",
    "BC.",
    "PhD.",
    "MD.",
    "BA.",
    "MA.",
    "DDS.",
    "PO.",
    "PS."
]

splitted = sample_string.split(".")

# To revert the known words with . that have been splitted
for i in range(len(splitted)-1):
    if (splitted[i]+".").strip() in words_with_period:
        splitted[i+1] = splitted[i]+splitted[i+1]
        del splitted[i]
    
    # to remove any extra thing in the list which is blank
    if splitted[i].strip() == "":
        del splitted[i]


print(splitted)

['This is a simple string', ' Let us consider this to be a paragraph', " It's a nice day as the weather is warm", ' I look forward to tomorrow', ' Mr Ralph has called me tomorrow']


#### Cleaning text

In [6]:
text_data = ["      Hello world!!!!     ", "This     is    an example to show strip     ", "Another example      "]

strip_whitespace = [text.strip() for text in text_data]
strip_whitespace

['Hello world!!!!',
 'This     is    an example to show strip',
 'Another example']

In [14]:
text_data = ["      Hello world!!!!     ", "This is an example to show strip     ", "Another example      "]

strip_whitespace = [text.strip() for text in text_data]
strip_whitespace

['Hello world!!!!', 'This is an example to show strip', 'Another example']

In [15]:
remove_periods = [string.replace(".","") for string in strip_whitespace]

In [16]:
remove_periods

['Hello world!!!!', 'This is an example to show strip', 'Another example']

In [17]:
def capitalizer(string:str)->str:
    return string.upper()

In [18]:
[capitalizer(string) for string in remove_periods]

['HELLO WORLD!!!!', 'THIS IS AN EXAMPLE TO SHOW STRIP', 'ANOTHER EXAMPLE']

## Word tokenizing

In [35]:
# Use of split is not smart and accurate

msg = "Hey everyone! The party starts in 10mins. Be there ASAP!"
print(msg.split())


['Hey', 'everyone!', 'The', 'party', 'starts', 'in', '10mins.', 'Be', 'there', 'ASAP!']


In [36]:
# Use of split is not smart and accurate

msg2 = "Hey/everyone/!/The/party/starts/in/10mins/./Be/there/ASAP/!"
print(msg.split("/"))


['Hey everyone! The party starts in 10mins. Be there ASAP!']


In [40]:
joined_string = " ".join(msg2.split("/"))
print(joined_string)

Hey everyone ! The party starts in 10mins . Be there ASAP !


In [37]:
# Use of nltk is more accurate to what we want

from nltk.tokenize import word_tokenize

tokenized_words = word_tokenize(msg)
print(tokenized_words)

['Hey', 'everyone', '!', 'The', 'party', 'starts', 'in', '10mins', '.', 'Be', 'there', 'ASAP', '!']


## Regular expressions

In [41]:
import re

In [42]:
def replace_letters_with_X(string:str)->str:
    return re.sub(r"[a-zA-Z]", "X", string)

In [43]:
[replace_letters_with_X(text) for text in remove_periods]

['XXXXX XXXXX!!!!', 'XXXX XX XX XXXXXXX XX XXXX XXXXX', 'XXXXXXX XXXXXXX']

## Stemming and Lemmatization

###### Stemming is very simple which just chops off the part of the words

In [45]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()
porter.stem("Walking")

'walk'

In [46]:
porter.stem("Crying")

'cri'

In [47]:
porter.stem("Basically")

'basic'

In [48]:
porter.stem("Beautifully")

'beauti'

##### Lemmatization is pretty accurate as it explores the WordNet to return the root form of the word

In [49]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()
print(lem.lemmatize("running", pos='n'))
print(lem.lemmatize("running", pos='v'))

running
run


In [50]:
lem.lemmatize("Better", 'a')

'Better'

In [51]:
lem.lemmatize("Ate", "v")

'Ate'

## Stopwords removal

In [52]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [53]:
print(len(stopwords))

179


In [56]:
my_string = """Hello, this is a paragraph with many words. I want to create a list of words which are well processed and all the stopwords are removed. I am studying Artificial intelligence and machine learning at Lambton college in toronto. I am in Canada and have been here since few months now."""

stopwords_removed = [word for word in word_tokenize(my_string.lower()) if word not in stopwords]
print(stopwords_removed)

['hello', ',', 'paragraph', 'many', 'words', '.', 'want', 'create', 'list', 'words', 'well', 'processed', 'stopwords', 'removed', '.', 'studying', 'artificial', 'intelligence', 'machine', 'learning', 'lambton', 'college', 'toronto', '.', 'canada', 'since', 'months', '.']


### Part of speech tags

In [57]:
text = "Can you please buy me an Arizona Ice Tea? It is $0.99"

text_pos_tags = nltk.pos_tag(word_tokenize(text))

In [58]:
text_pos_tags

[('Can', 'MD'),
 ('you', 'PRP'),
 ('please', 'VB'),
 ('buy', 'VB'),
 ('me', 'PRP'),
 ('an', 'DT'),
 ('Arizona', 'NNP'),
 ('Ice', 'NNP'),
 ('Tea', 'NNP'),
 ('?', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('$', '$'),
 ('0.99', 'CD')]

### A more detailed overview of regular expressions

The most important or mostly used are:

re.search(pattern, text)

re.match(pattern, text)

re.sub(pattern, substitute, text)

re.findall(pattern, text)

In [59]:
tweet = "Hello guys, this is my first #tweet, Check this @joy, #nlp #machine #learning"

re.findall("#", tweet)

['#', '#', '#', '#']

In [60]:
# if found, it returns a match object, else nothing
re.search("#", tweet)

<re.Match object; span=(29, 30), match='#'>

In [63]:
re.search("X", tweet)

In [64]:
# begins from the beginning of the string, returns match object if found, else nothing
re.match("Hello", tweet)

<re.Match object; span=(0, 5), match='Hello'>

In [66]:
new_tweet = re.sub("#", "_", tweet)
print(new_tweet)

Hello guys, this is my first _tweet, Check this @joy, _nlp _machine _learning


#### Bag of Words

In [None]:
#TODO Search: Recursive feature elimination

In [67]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


In [68]:
text_data = np.array(['I love Brazil. Brazil!', "Sweden is best", "Germany beats both"])
count= CountVectorizer()
bag_of_words = count.fit_transform(text_data)
bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [69]:
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [70]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

## Creating a dataframe

In [72]:
import pandas as pd

df = pd.DataFrame(data=bag_of_words.toarray(), columns=count.get_feature_names_out())
df.head()

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0
