# Text Analysis

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import lxml
import re
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer

- I uploaded a sample news story about American Express. Let's do some text analysis on it.

In [None]:
# Read the whole American Express news story into a single string.
with open("amex_news.txt") as news_file:
    amexnews = news_file.read()
# Echo the raw text as the cell output.
amexnews

In [None]:
# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer models from 'punkt_tab'; on older releases the
# extra download is harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Stopwords are very common words such as "a", "an", "the".
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords

In [None]:
# Display the punctuation characters defined by the standard-library string module.
string.punctuation

In [None]:
# Treat every punctuation character as a stopword as well, so one membership
# test filters out both.
punctuation_marks = set(string.punctuation)
stopwords = stopwords | punctuation_marks
stopwords

In [None]:
# Short multi-sentence string (with punctuation and a decimal number) used to
# demonstrate tokenization and stopword filtering in the following cells.
sample='This is a sample sentence, which will illustrate how tokenization and stop words filtration works. This is another sentence. Profits went up by 3.4% this quarter. We will tokenize all this. And more.'

In [None]:
# Naive tokenization: split on whitespace only, so punctuation stays attached to words.
sample.split()

In [None]:
# Tokenize the sample into words with NLTK's word tokenizer.
samplewords=nltk.tokenize.word_tokenize(sample)
samplewords

In [None]:
# Split the sample string into sentences and report how many there are.
sample_sentences = nltk.tokenize.sent_tokenize(sample)
len(sample_sentences)

In [None]:
# Collect every token (in its original casing) whose lowercase form is not a stopword.
nonstopwords = []
for token in samplewords:
    if token.lower() in stopwords:
        continue  # skip stopwords and punctuation
    nonstopwords.append(token)
nonstopwords

In [None]:
# One-line version of the stopword filter; here the kept tokens are stored lowercased.
nonstop_words = [token for token in map(str.lower, samplewords) if token not in stopwords]
nonstop_words

- Let's say you want to count the frequency of words in a document
- You probably want to treat different forms of the same word as the same word (for example, "dogs" and "dog" should probably be counted as one "word" for the purpose of word frequencies)
- You can do this with "stemming" or "lemmatizing"

In [None]:
# Lemmatize words
# Create one WordNet lemmatizer instance that the later cells reuse.
lemma = nltk.WordNetLemmatizer()

In [None]:
# Lemmatize a single word; no part-of-speech argument is passed.
lemma.lemmatize('dead')

- As you can see, lemmatizing is not perfect: without part-of-speech information, a verb form such as "died" is not converted to "die". More advanced usage gives the lemmatizer the part of speech of each word, which produces better lemmas. We will ignore this here...

In [None]:
# Text-cleaning pipeline for the Amex article:
# 1. convert the text to lowercase
# 2. tokenize it into words
# 3. remove stopwords (including punctuation) and non-alphabetic tokens
# 4. lemmatize the words that remain
amexwords = nltk.tokenize.word_tokenize(amexnews.lower())
amex_ns_words = [word for word in amexwords if (word not in stopwords) and word.isalpha()]
amex_lemma_words = [lemma.lemmatize(word) for word in amex_ns_words]
amex_lemma_words

In [None]:
# Count how often each lemmatized word occurs in the Amex article.
amex_freq=nltk.FreqDist(amex_lemma_words)
# List the (word, count) pairs, most frequent first.
amex_freq.most_common()

In [None]:
# Get a built-in sentiment score for each sentence in our Amex article.
# SentimentIntensityAnalyzer needs the VADER lexicon, which is not among the
# resources downloaded earlier in the notebook.
nltk.download('vader_lexicon')
sentiment = SentimentIntensityAnalyzer()
amex_sentences = nltk.tokenize.sent_tokenize(amexnews)

comp_scores = []    # compound score of each sentence
netpos_scores = []  # (pos - neg) of each sentence
for s in amex_sentences:
    # Score each sentence once and reuse the resulting dictionary.
    scores = sentiment.polarity_scores(s)
    comp_scores.append(scores['compound'])
    netpos_scores.append(scores['pos'] - scores['neg'])

# An article with no sentences has no meaningful average: report NaN
# explicitly instead of relying on numpy's empty-mean warning.
avgcomp = np.mean(comp_scores) if comp_scores else float('nan')
avgnet = np.mean(netpos_scores) if netpos_scores else float('nan')
print("Average compound score is: ", avgcomp, " Avg net pos is: ", avgnet)

# Bag of Words
- We discard the structures like paragraphs, sentences etc
- We only count how often each word appears
- Tokenization
    - Split the document into words
- Vocabulary Build
    - Build a vocabulary of all the words that appear in any of the documents
- Encoding
    - For each document count the number of times a word occurs

### Calculate the word frequencies of Apple's 10K

In [None]:
# Load Apple's 10-K filing and lowercase it.
with open('Apple 10-K 2017.txt') as file:
    aapltext = file.read().lower()
# Strip all HTML tags, keeping only the text. The parser is named explicitly
# (lxml is already imported by this notebook) so that the result does not
# depend on which parsers happen to be installed and BeautifulSoup does not
# emit a "no parser was explicitly specified" warning.
souptext = BeautifulSoup(aapltext, 'lxml').text

1. Convert to lower case
2. Tokenize into words
3. Remove stop words
4. Remove all non "ABC" characters
5. Lemmatize

In [None]:
# Clean the 10-K text: lowercase + tokenize, drop stopwords and
# non-alphabetic tokens, then lemmatize what is left.
aaplwords = nltk.tokenize.word_tokenize(souptext.lower())
aapl_ns_words = [
    token
    for token in aaplwords
    if token.isalpha() and token not in stopwords
]
aapl_lemma_words = list(map(lemma.lemmatize, aapl_ns_words))
aapl_lemma_words

### Use the LM dictionaries to calculate the percent of positive words

In [None]:
# Load the LM positive word list as one lowercase string.
with open('LM_Positive.txt') as positive_file:
    LM_Positive = positive_file.read().lower()

# Load the LM negative word list the same way.
with open('LM_Negative.txt') as negative_file:
    LM_Negative = negative_file.read().lower()


In [None]:
# Split each dictionary string into a list of individual words.
positive_bag=nltk.tokenize.word_tokenize(LM_Positive)
negative_bag=nltk.tokenize.word_tokenize(LM_Negative)

In [None]:
#Keep positive words in our AAPL 10k
aapl_poswords=[word for word in aapl_lemma_words if word in positive_bag]

#Keep negative words in our AAPL 10k
aapl_negwords=[word for word in aapl_lemma_words if word in negative_bag]

### Document Tone
- Defined as %Positive - %Negative

In [None]:
# Share of positive and of negative words among all cleaned 10-K words.
total_aapl_words = len(aapl_lemma_words)
pctpos = len(aapl_poswords) / total_aapl_words
pctneg = len(aapl_negwords) / total_aapl_words
# Tone = positive share minus negative share.
tone = pctpos - pctneg
tone

In [None]:
# Inspect which words from the 10-K were matched as negative.
aapl_negwords

### Calculate FOG Index

In [None]:
from nltk.corpus import cmudict
# Fetch the CMU pronouncing dictionary data before loading it.
nltk.download('cmudict')
# Load it as a plain dict keyed by word; each value is indexed below as a
# list of pronunciations (see numsyllables, which uses the first one).
d=cmudict.dict()

In [None]:
# Look up the dictionary entry for "economy".
d['economy']

In [None]:
# Look up the dictionary entry for "photograph".
d['photograph']

In [None]:
# Look up the dictionary entry for "finance".
d['finance']

In [None]:
# Look up a word that may not be in the dictionary.
# NOTE(review): presumably this raises KeyError to show what happens for
# missing words -- confirm against the actual dictionary contents.
d['vineet']

- Write a function to count the number of syllables per word using the stress markers from the CMU dictionary
- For now, just take the first pronunciation (if there are multiple)

In [None]:
def numsyllables(word, d):
    """Return the number of syllables in *word* according to dictionary *d*.

    Args:
        word: The word to look up (a string key of ``d``).
        d: A mapping from words to a list of pronunciations, where each
            pronunciation is a sequence of phoneme strings and phonemes that
            carry a stress marker end in a digit (e.g. ``'AH0'``).

    Returns:
        The count of phonemes ending in a digit in the FIRST pronunciation of
        the word, or -999 if the word is missing from ``d`` or has no
        pronunciation listed.
    """
    # Only lookup failures mean "unknown word"; any other exception is a real
    # error and is allowed to propagate instead of being silently swallowed.
    try:
        first_pronunciation = d[word][0]
    except (KeyError, IndexError, TypeError):
        return -999
    # Each stress-marked phoneme (trailing digit) corresponds to one syllable.
    # Slicing with [-1:] keeps an empty phoneme string from raising.
    return sum(1 for phoneme in first_pronunciation if phoneme[-1:].isdigit())
    

In [None]:
# Quick check of the syllable counter on a single word.
numsyllables('tomato', d)

- Calculate percent of complex words

In [None]:
def FOGindex(doctext, d):
    """Compute the FOG readability index of a document.

    FOG = 0.4 * (average number of words per sentence + percent of complex
    words), where a complex word is one with more than two syllables.

    Words are counted after lowercasing, removing stopwords and
    non-alphabetic tokens, and lemmatizing. Words that ``numsyllables``
    cannot find in ``d`` get a negative syllable count and are therefore
    never counted as complex.

    Args:
        doctext: The full document text as one string.
        d: Pronunciation dictionary passed through to ``numsyllables``.

    Returns:
        A tuple ``(FOG, totwords, numsentences, num_complex)``.
    """
    numsentences = len(nltk.tokenize.sent_tokenize(doctext))

    # Total number of (cleaned) words
    docwords = nltk.tokenize.word_tokenize(doctext.lower())
    ns_words = [word for word in docwords if (word not in stopwords) and word.isalpha()]
    lemma_words = [lemma.lemmatize(word) for word in ns_words]
    totwords = len(lemma_words)

    # Total number of complex words (more than two syllables)
    num_complex = sum(1 for word in lemma_words if numsyllables(word, d) > 2)

    # Guard against empty documents so the function returns instead of
    # raising ZeroDivisionError.
    words_per_sentence = totwords / numsentences if numsentences else 0.0
    # The formula uses the PERCENT of complex words (0-100), not the
    # fraction (0-1); without the factor of 100 this term is negligible.
    pct_complex = 100 * num_complex / totwords if totwords else 0.0

    FOG = .4 * (words_per_sentence + pct_complex)
    return FOG, totwords, numsentences, num_complex

In [None]:
# FOG index (plus word, sentence and complex-word counts) for the Apple 10-K text.
FOGindex(souptext,d)

In [None]:
# FOG index (plus word, sentence and complex-word counts) for the Amex news story.
FOGindex(amexnews,d)