In [None]:
# https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

# Text Analysis Operations using NLTK

NLTK is a powerful Python package that provides a set of diverse natural languages algorithms. 

It is free, opensource, easy to use, large community, and well documented. NLTK consists of the most common algorithms such as tokenizing, part-of-speech tagging, stemming, sentiment analysis, topic segmentation, and named entity recognition. 

NLTK helps the computer to analysis, preprocess, and understand the written text.

In [23]:
import nltk
import numpy as np # linear algebra
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download("stopwords")
nltk.download('punkt')
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from keras.utils import to_categorical

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\floPe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\floPe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
Using TensorFlow backend.


# Tokenization

Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentence is called Tokenization. 

Token is a single entity that is building blocks for sentence or paragraph.

### Sentence Tokenization

Sentence tokenizer breaks paragraph text into sentences.

In [24]:
df = pd.read_csv("train.csv")
# Drop the null columns where all values are null
# Drop the null rows
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [25]:
df_sub = df[['text', 'target']]
df_sub.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [33]:
train = df.filter(["id", "keyword", "text", "target"], axis=1)

In [34]:
def tolkencleaner(df, dfdirty):
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    ps = PorterStemmer()
    tolkenized = []
    for tweet in dfdirty:
        word_tokens = word_tokenize(tweet)
        stripped = [ps.stem(w) for w in word_tokens if not w in punctuation]
        split_sentence = [w for w in stripped if not w in stop_words]
        filtered_sentence = [w.lower() for w in split_sentence]
        tolkenized.append(list(filtered_sentence))
        df['tolkenized'] = tolkenized

In [35]:
tolkencleaner(train, train.text)
train.head()

ValueError: Length of values does not match length of index

### Word Tokenization

Word tokenizer breaks paragraph text into words.

In [19]:
from nltk.tokenize import word_tokenize
def substrings_in_string(df_sub, substrings):
    for substring in substrings:
        tokenized_word=word_tokenize(df_sub.text)
    print(tokenized_word)


In [21]:
tokenized_word

NameError: name 'tokenized_word' is not defined

### Word Cleaning

In most cases, removing punctuation and converting words to lower-case allow for a more robust analysis

In [None]:
import string
def clean_text(text):
    return text.translate(str.maketrans('', '', string.punctuation)).lower()

cleanText = clean_text(text)
print(cleanText)

In [None]:
tokenized_word=word_tokenize(cleanText)
print(tokenized_word)

# Frequency Distribution

Once tokenized, we can determine word frequencies

In [None]:
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
print(fdist)

In [None]:
fdist.most_common(2)

In [None]:
# Frequency Distribution Plot
import matplotlib.pyplot as plt
fdist.plot(30,cumulative=False)


# Stopwords

Stopwords are considered noise in the text. Text may contain "stopwords" such as is, am, are, this, a, an, the, etc.

In NLTK, to remove stopwords, you need to create a list of stopwords and filter out your list of tokens from these words.

In [None]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)

In [None]:
print("The number of stop words provided by NLTK is: " + str(len(stop_words)))

In [None]:
filtered_sent=[]
for w in tokenized_word:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Text:",tokenized_word)
print("\n")
print("Filterd Text:",filtered_sent)

# Text Normalization

Text normalization considers another type of noise in the text. 

For example, *connection*, *connected*, and *connecting* all reduce to the common word *connect*. Normalization reduces  related forms of a word to a common root word. This can be done via Stemming or Lemmatization.

### Stemming

Stemming is a process of linguistic normalization, which reduces words to their word root word or chops off the derivational affixes. For example, the stem of *connection*, *connected*, and *connecting* is "connect".

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
test_sentence = "Cooking, eating, and walking are things Vinesh is currently doing. Yesterday, he cooked, ate, and walked."

cleanSentence = clean_text(test_sentence)
tokenTest = word_tokenize(cleanSentence)

stemmed_words=[]
for w in tokenTest:
    stemmed_words.append(ps.stem(w))
    
print("Stemmed Sentence:", stemmed_words)

In [None]:
fdist2 = FreqDist(stemmed_words)
print(fdist2.most_common(5))

In [None]:
# notice that eating and ate were not normalized.

### Lemmatization

Lemmatization reduces words to their base word, which is linguistically correct lemmas. 

It transforms to root word with the use of vocabulary and morphological analysis. Lemmatization is usually more sophisticated than stemming. While stemming simply chops off the "-ing" or "-ed", lemmatization essentially performs a dictionary look-up.

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

test_sentence = "Cooking, eating, and walking are things Vinesh is currently doing. Yesterday, he cooked, ate, and walked."
cleanSentence = clean_text(test_sentence)
tokenTest = word_tokenize(cleanSentence)

lemma_words=[]
for w in tokenTest:
    lemma_words.append(lem.lemmatize(w,"v"))
    
print("Lemmatized Sentence:", lemma_words)

In [None]:
fdist3 = FreqDist(lemma_words)
print(fdist3.most_common(5))

### POS Tagging

The primary objective of Part-of-Speech (POS) tagging is to identify the grammatical group of a given word, for exampe, whether it is a *NOUN*, *PRONOUN*, *ADJECTIVE*, *VERB*, *ADVERBS*, etc. based on the context. 

POS Tagging looks for relationships within the sentence and assigns a corresponding tag to the word.

In [None]:
pos_sentence = "Albert Einstein was born in Ulm, Germany in 1879."

# Need to keep capitalization for proper nouns
cleanSentence = clean_text(pos_sentence)
tokenTest = word_tokenize(cleanSentence)
nltk.pos_tag(tokenTest)

### POS tag list:

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: "there is" ... think of it like "there exists")
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective 'big'
- JJR adjective, comparative 'bigger'
- JJS adjective, superlative 'biggest'
- LS list marker 1)
- MD modal could, will
- NN noun, singular 'desk'
- NNS noun plural 'desks'
- NNP proper noun, singular 'Harrison'
- NNPS proper noun, plural 'Americans'
- PDT predeterminer 'all the kids'
- POS possessive ending parent's
- PRP personal pronoun I, he, she
- PRPs possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO to go 'to' the store.
- UH interjection errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3d take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WPs possessive wh-pronoun whose
- WRB wh-abverb where, when

# Feature Generation

To train a model, we need to convert strings of text to numbers. Various ways to do this include word counts, tf^idf, and sentiment analysis.

### Feature Generation using Word Counts
First, one can create a matrix of document and words by counting the occurrence of words in the given document. This matrix is known as Document-Term Matrix (DTM).

In [None]:
phrase1 = "I have passions and love for all dogs." 
phrase2 = "I hate this dog and that dog and passions for hobbies." 
phrase3 = "Knitting is my hobby and passion."

phrases = [phrase1, phrase2, phrase3]

In [None]:
lemmaMatrix = []

for sentence in phrases:
    lemma_words=[]
    cleanSentence = clean_text(sentence)
    tokenTest = word_tokenize(cleanSentence)
    
    for word in tokenTest:
        #check stop words
        if word not in stop_words:      
            #get part of speech
            pos = nltk.pos_tag([word])[0][1][0].lower()
            if pos == "n": #noun lemma
                lemma_words.append(lem.lemmatize(word,"n"))
            else: #verb lemma
                lemma_words.append(lem.lemmatize(word,"v"))
        
    lemmaMatrix.append(lemma_words)

#Rejoin strings and print
rebuiltLemmas = [" ".join(x) for x in lemmaMatrix]
print(rebuiltLemmas)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

cv = CountVectorizer(stop_words='english')
text_counts= cv.fit_transform(rebuiltLemmas)

In [None]:
print(cv.get_feature_names())
print(text_counts.toarray())

In [None]:
#Turn into dataframe
dtm = pd.DataFrame(text_counts.toarray(), columns=cv.get_feature_names())
dtm

### Feature Generation using TF-IDF

TF-IDF(Term Frequency-Inverse Document Frequency) normalizes the document term matrix. It is the product of TF and IDF. Words with high tf-idf in a document, must occur in that specific document and must be absent in the other documents. This implies that words must be a *signature* word.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf=TfidfVectorizer()
text_tf= tf.fit_transform(rebuiltLemmas)

In [None]:
tf_df = pd.DataFrame(text_tf.toarray(), columns=tf.get_feature_names())
tf_df

### Feature Generation using Sentiment Analysis

In [None]:
from textblob import TextBlob

sentiments = []
for phrase in rebuiltLemmas:
    sentiments.append(TextBlob(phrase).sentiment.polarity)
    
tf_df["Sentiment"] = sentiments

In [None]:
tf_df

In [None]:
rebuiltLemmas