# PART 2

### Importing required libraries for part 2

In [None]:
import os
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tag import StanfordPOSTagger
from wordcloud import WordCloud
from pymongo import MongoClient

### Getting texts from MongoDB 

In [None]:
# Create a MongoDB client for the local server and select the
# `elespectador` collection of the `news` database.
client = MongoClient(host="localhost", port=27017)
db = client["news"]
collection = db["elespectador"]

In [None]:
# Gather the title and body of every stored document into one flat list.
# Only the two text fields are requested (no `_id`), and the cursor is
# iterated directly instead of being copied into a throwaway list first.
text = []
for news in collection.find({}, {"title": 1, "full_text": 1, "_id": 0}):
    for field in ("title", "full_text"):
        value = news.get(field)
        # Skip missing or non-string fields so that an incomplete document
        # neither raises KeyError here nor breaks the later " ".join(text).
        if isinstance(value, str):
            text.append(value)

In [None]:
# Merge every collected string into one space-separated corpus string.
text = " ".join(text)

In [None]:
# Bare expression: displays the corpus string as the notebook cell output.
text

### Lowercasing

In [None]:
# Lowercase the whole corpus so the word counts below are case-insensitive.
text = text.lower()

### Tokenization

In [None]:
# Split the corpus into sentences. The rest of this notebook treats the
# corpus as Spanish (Spanish stop words and tagger model), so the Spanish
# sentence model is requested explicitly; sent_tokenize() otherwise falls
# back to its English default.
sentences = sent_tokenize(text, language="spanish")
print(sentences)

In [None]:
# Split the corpus into word and punctuation tokens. word_tokenize() first
# splits sentences internally; ask for the Spanish model instead of the
# English default, matching the Spanish resources used later on.
words = word_tokenize(text, language="spanish")
print(words)

### Removing punctuation

In [None]:
# Re-tokenize the corpus keeping only runs of word characters (\w+),
# which leaves punctuation out of the token list.
tokenizer = nltk.RegexpTokenizer(pattern=r'\w+')
words = tokenizer.tokenize(text)
# Bare expression: displays the token list as the cell output.
words

### Word distribution - frequency distribution

In [None]:
# Build the frequency distribution of the tokens in `words` and print it.
frec_dist = FreqDist(words)
print(frec_dist)

In [None]:
# Show the 5 most common words of the current distribution (cell output).
frec_dist.most_common(5)

In [None]:
# Visualization of word distribution
# Visualization of word distribution: bar chart of the n most frequent words.
n = 26
# Query the ranking once and reuse it for both the labels and the bar
# heights instead of calling most_common(n) twice.
top_words = frec_dist.most_common(n)
plt.figure(figsize=(30, 10))
plt.bar([word for word, _ in top_words], [count for _, count in top_words])

In [None]:
# Word cloud of the corpus text: at most 100 words, font size capped at 50,
# drawn on a white background.
cloud_builder = WordCloud(background_color="white", max_words=100, max_font_size=50)
wordcloud = cloud_builder.generate(text)

# Show the rendered cloud without axes.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Stopwords

In [None]:
# Load NLTK's Spanish stop-word list and keep it as a set, which is what
# the membership test in the filtering step uses.
spanish_stopwords = stopwords.words("spanish")
stop_words = set(spanish_stopwords)
print(stop_words)

In [None]:
# Keep only the tokens that do not appear in the stop-word set.
filtered_words = [token for token in words if token not in stop_words]

# Print the token list before and after filtering, with blank lines between.
print("All words:", words)
print("\n")
print("Substracting stopwords:", filtered_words)

In [None]:
# Rebuild the frequency distribution, this time from the stop-word-filtered
# token list, and print it.
frec_dist = FreqDist(filtered_words)
print(frec_dist)

In [None]:
# Show the 5 most common words of the current distribution (cell output).
frec_dist.most_common(5) 

In [None]:
# Bar chart of the n most frequent words in the current distribution.
n = 26
# Query the ranking once and reuse it for both the labels and the bar
# heights instead of calling most_common(n) twice.
top_words = frec_dist.most_common(n)
plt.figure(figsize=(30, 10))
plt.bar([word for word, _ in top_words], [count for _, count in top_words])

In [None]:
# Word cloud of the filtered tokens: they are re-joined into one
# space-separated string because generate() is fed text, as in the
# unfiltered cloud.
filtered_text = " ".join(filtered_words)
cloud_builder = WordCloud(background_color="white", max_words=100, max_font_size=50)
wordcloud = cloud_builder.generate(filtered_text)

# Show the rendered cloud without axes.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Part of Speech
https://nlp.stanford.edu/software/

In [None]:
# Relative paths to the Stanford POS tagger jar and to the Spanish (UD)
# tagger model that is passed to it.
jar, model = (
    "./pos-tagger/stanford-postagger/stanford-postagger-4.2.0.jar",
    "./pos-tagger/stanford-postagger/models/spanish-ud.tagger",
)

In [None]:
# Publish the location of the Java executable through the JAVAHOME
# environment variable (presumably read by NLTK's Stanford wrapper to find
# Java; confirm against the NLTK docs).
# A raw string keeps the Windows backslash literal. The previous plain string
# only worked because "\P" is not a recognised escape, which Python reports
# with a warning and intends to reject in the future. The value is unchanged.
java_path = r"C:\Program Files/Java/jdk1.8.0_271/bin/java"
os.environ["JAVAHOME"] = java_path

In [None]:
# Build the Stanford POS tagger from the model and jar paths, then tag the
# whole token list; the tagged result is displayed as the cell output.
pos_tagger = StanfordPOSTagger(model_filename=model, path_to_jar=jar, encoding="utf8")
pos_tagger.tag(words)