# Analyzing Textual Data and Social Media

# Import nltk

In [None]:
import nltk

# Filtering out stopwords, names, and numbers


## Load English stopwords and print some of the words

In [None]:
# Collect NLTK's English stopword list into a set for fast membership tests.
sw = set()
sw.update(nltk.corpus.stopwords.words('english'))
# A set has no meaningful order, so this is just a sample of seven entries.
print("Stop words:", [*sw][:7])

## Load Gutenberg corpora and print some of the filenames

In [None]:
# Bind the Gutenberg corpus reader to the short alias used by the cells below.
from nltk.corpus import gutenberg as gb

# Show only the last five file ids to keep the output short.
print("Gutenberg files:\n", gb.fileids()[-5:])

## Extract sentences from milton-paradise.txt file

In [None]:
# Keep just the first two sentences of milton-paradise.txt as returned by
# the corpus reader.
paradise_sents = gb.sents("milton-paradise.txt")
text_sent = paradise_sents[:2]
print("Unfiltered:", text_sent)

## Filter out the stopwords, names, and numbers from the extracted sentences

In [None]:
for sentence in text_sent:
    # Lowercase each token before the stopword lookup.
    filtered = [token for token in sentence if token.lower() not in sw]
    print("Filtered:\n", filtered)

    # pos_tag yields (token, tag) pairs.
    tagged = nltk.pos_tag(filtered)
    print("Tagged:\n", tagged)

    # Drop tokens tagged NNP or CD (the "names, and numbers" of the section
    # heading) and keep everything else.
    words = [token for token, tag in tagged if tag not in ('NNP', 'CD')]

    print("Words:\n",words)

# Bag of words model

## Import scikit-learn

In [None]:
import sklearn as sk

## Load two documents from NLTK Gutenberg corpus

In [None]:
# Read both plays as raw text; they are the two documents vectorized below.
hamlet, macbeth = (
    gb.raw(fileid)
    for fileid in ("shakespeare-hamlet.txt", "shakespeare-macbeth.txt")
)

## Create the feature vector by omitting English stopwords

In [None]:
# Import the vectorizer explicitly: a bare ``import sklearn`` is not guaranteed
# to expose the ``feature_extraction`` subpackage as an attribute, so
# ``sk.feature_extraction.text`` can raise AttributeError.
from sklearn.feature_extraction.text import CountVectorizer

# Count token occurrences per document, skipping the vectorizer's built-in
# English stopword list.
cv = CountVectorizer(stop_words='english')
# fit_transform returns a sparse matrix with one row per input document
# (hamlet, macbeth); toarray() makes it dense for printing.
print("Feature vector:\n", cv.fit_transform([hamlet, macbeth]).toarray())

## Print a small selection of the features found

In [None]:
# ``get_feature_names()`` was removed in scikit-learn 1.2; its replacement
# ``get_feature_names_out()`` returns a NumPy array, converted here to a plain
# list so the output looks as before.
print("Features:\n", cv.get_feature_names_out()[:5].tolist())

# Analyzing word frequencies

In [None]:
import nltk
import string


gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

sw = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)

# Lowercase every token once, then drop anything found in the stopword set
# or in the set of punctuation characters.
filtered = [
    token
    for token in (w.lower() for w in words)
    if not (token in sw or token in punctuation)
]

# Single-word frequencies.
fd = nltk.FreqDist(filtered)
print("Words", list(fd)[:5])
print("Counts", list(fd.values())[:5])
print("Max", fd.max())
print("Count", fd['d'])

# Frequencies of adjacent word pairs in the filtered token stream.
fd = nltk.FreqDist(nltk.bigrams(filtered))
print("Bigrams", list(fd)[:5])
print("Counts", list(fd.values())[:5])
print("Bigram Max", fd.max())
print("Bigram count", fd[('let', 'vs')])

# Naive Bayes Classification

In [None]:
import nltk
import string
import random

# Lookup sets consulted by isStopword(): punctuation characters and the
# English stopword list.
punctuation = set(string.punctuation)
sw = set(nltk.corpus.stopwords.words('english'))

def word_features(word):
    """Return the feature dict for *word*: its length under the key ``'len'``."""
    features = {}
    features['len'] = len(word)
    return features

def isStopword(word):
    """Tell whether *word* is in the ``sw`` set or the ``punctuation`` set."""
    if word in sw:
        return True
    return word in punctuation
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Pair every lowercased token with its label: the boolean returned by
# isStopword() for that token.
labeled_words = [
    (token, isStopword(token)) for token in (w.lower() for w in words)
]
# Seed the generator so the shuffle, and with it the train/test split below,
# is the same on every run.
random.seed(42)
random.shuffle(labeled_words)
print(labeled_words[:5])

# Each example handed to the classifier is (feature dict, label).
featuresets = [
    (word_features(token), label) for (token, label) in labeled_words
]
# First 90% of the shuffled examples train the model, the rest test it.
cutoff = int(.9 * len(featuresets))
train_set, test_set = featuresets[:cutoff], featuresets[cutoff:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("'behold' class", classifier.classify(word_features('behold')))
print("'the' class", classifier.classify(word_features('the')))

print("Accuracy", nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
classifier.show_most_informative_features(5)

# Sentiment Analysis

In [None]:
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string

# Lookup sets consulted by isStopWord().
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# One (token list, category) pair per review file, gathered category by
# category.
labeled_docs = [
    (list(movie_reviews.words(file_id)), category)
    for category in movie_reviews.categories()
    for file_id in movie_reviews.fileids(category)
]
# Seed before shuffling so the document order is repeatable across runs.
random.seed(42)
random.shuffle(labeled_docs)

# All tokens of the whole corpus, used to build the vocabulary.
review_words = movie_reviews.words()
print("# Review Words", len(review_words))

def isStopWord(word):
    """Return True when *word* is a member of ``sw`` or of ``punctuation``."""
    return any(word in group for group in (sw, punctuation))

# Lowercase each token once and drop the ones isStopWord() rejects.
filtered = [
    token for token in (w.lower() for w in review_words)
    if not isStopWord(token)
]
print("# After filter", len(filtered))
words = FreqDist(filtered)
# Feature vocabulary size: 5% of the distinct words.
N = int(.05 * len(words))
# FreqDist is a Counter, so its keys come back in first-seen order rather than
# by frequency; slicing keys() would select an arbitrary 5% of the vocabulary.
# most_common(N) returns the N highest-count (word, count) pairs instead.
word_features = [word for word, _ in words.most_common(N)]

def doc_features(doc):
    """Build the ``'count (<word>)'`` feature dict for one document.

    Tokens of *doc* for which ``isStopWord`` is true are ignored. Every entry
    of the module-level ``word_features`` gets a key; a word that does not
    occur in *doc* gets a count of 0.
    """
    counts = FreqDist(token for token in doc if not isStopWord(token))
    return {'count (%s)' % term: counts.get(term, 0) for term in word_features}

# One (feature dict, category) pair per labeled document.
featuresets = [(doc_features(doc), category) for (doc, category) in labeled_docs]
# Hold out the first 200 documents for testing and train on the remainder.
train_set, test_set = featuresets[200:], featuresets[:200]
classifier = NaiveBayesClassifier.train(train_set)
print("Accuracy", accuracy(classifier, test_set))

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
classifier.show_most_informative_features()

# Creating Word Clouds

In [None]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
import string

# Lookup sets consulted by isStopWord().
sw, punctuation = set(stopwords.words('english')), set(string.punctuation)

def isStopWord(word):
    """Return True unless *word* is absent from both ``sw`` and ``punctuation``."""
    return not (word not in sw and word not in punctuation)
review_words = movie_reviews.words()
# Lowercase each token once and drop the ones isStopWord() rejects.
filtered = [
    token for token in (w.lower() for w in review_words)
    if not isStopWord(token)
]

words = FreqDist(filtered)
# Number of tags to print: 1% of the distinct words.
N = int(.01 * len(words))
# FreqDist is a Counter, so keys() is in first-seen order rather than by
# frequency; most_common(N) returns the N highest-count (word, count) pairs,
# i.e. the words that should dominate a word cloud.
tags = [word for word, _ in words.most_common(N)]

for tag in tags:
    print(tag, ':', words[tag])

In [None]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import pandas as pd
import numpy as np
import string

# Lookup sets consulted by isStopWord().
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
# Names are lowercased because isStopWord() is called with lowercased tokens.
all_names = {name.lower() for name in names.words()}

def isStopWord(word):
    """Return True when *word* should be dropped from the reviews.

    A word is dropped if it is in ``sw`` or ``punctuation``, if it is not
    purely alphabetic, or if it is in ``all_names``.
    """
    if word in sw or word in punctuation:
        return True
    if not word.isalpha():
        return True
    return word in all_names

review_words = movie_reviews.words()
# Lowercase each token once and drop the ones isStopWord() rejects.
filtered = [
    token for token in (w.lower() for w in review_words)
    if not isStopWord(token)
]

words = FreqDist(filtered)

# One space-joined string per review, keeping only accepted tokens that occur
# more than once in the whole corpus.
texts = []

for fid in movie_reviews.fileids():
    texts.append(" ".join(
        token
        for token in (w.lower() for w in movie_reviews.words(fid))
        if not isStopWord(token) and words[token] > 1
    ))

vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(texts)
# Column sums: the total TF-IDF weight of each term over all reviews.
sums = np.array(matrix.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and lists the terms in the same column order as `sums`.
ranks = list(zip(vectorizer.get_feature_names_out(), sums))

df = pd.DataFrame(ranks, columns=["term", "tfidf"])
# Ascending sort, so head() shows the lowest-weighted terms and tail() the
# highest-weighted ones.
df = df.sort_values(['tfidf'])
print(df.head())

# Keep the top 1% of terms by summed TF-IDF weight.
N = int(.01 * len(df))
df = df.tail(N)

for term, tfidf in zip(df["term"].values, df["tfidf"].values):
    print(term, ":", tfidf)

# Social Network Analysis

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
# List every public name in networkx that ends in "graph".
print([s for s in dir(nx) if s.endswith('graph')])
G = nx.davis_southern_women_graph()
# On NetworkX 2 and later, nx.degree(G) is a DegreeView of (node, degree)
# pairs with no .values() method; going through dict() works there and also
# with the plain dict returned by older releases.
plt.hist(list(dict(nx.degree(G)).values()))
plt.show()

In [None]:
plt.figure(figsize=(8,8))
pos = nx.spring_layout(G)
# Draw the nodes at the same positions used for the labels. Without `pos`,
# nx.draw computes a separate layout of its own and the labels end up
# detached from their nodes.
nx.draw(G, pos, node_size=10)
nx.draw_networkx_labels(G, pos)
plt.show()

In [None]:
plt.figure(figsize=(8,8))
# spring_layout() has no node_size parameter (passing it raises TypeError);
# node_size is a drawing option, so it is given to nx.draw instead.
pos = nx.spring_layout(G)
# Share `pos` between nodes and labels so the labels sit on their nodes.
nx.draw(G, pos, node_size=2)
nx.draw_networkx_labels(G, pos)
plt.show()