In [1]:
from newspaper import Article
from random import shuffle
import re
import matplotlib.pyplot as plt 
from collections import Counter

import numpy as np
import json
import glob

#gensim
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary  # Import the Dictionary class from Gensim

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  from .autonotebook import tqdm as notebook_tqdm


# Web Scraping

In [2]:
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [3]:
stopwords = stopwords.words("english")

In [4]:
def most_repeated_phrase_count(text):
    # Split the text into phrases (e.g., sentences)
    phrases = re.split(r'\.', text)

    # Remove leading and trailing spaces from each phrase
    phrases = [phrase.strip() for phrase in phrases if phrase.strip()]

    # Count the occurrences of each phrase using Counter
    phrase_counts = Counter(phrases)

    if not phrase_counts:
        return 0

    # Find the most common phrase and its count
    most_common_phrase, count = phrase_counts.most_common(1)[0]

    return count

def filter_scrape_data(text):
    if len(text) < 1500 or most_repeated_phrase_count(text) >= 3:
        return False
    return True
    
def filter_social(url): # Filters social platforms that can't be scraped
    social_starts_with = ["https://www.youtube.com", "https://youtu.be", "https://www.facebook.com", "https://twitter.com"]
    for y in social_starts_with:
        if(url.startswith(y)):
            return False
    return True

def scrapeData(url):
    try:
        if filter_social(url):
            article = Article(url)
            article.download()
            article.parse()
            
            page_text = (article.text).lower()
            page_text = page_text.strip().replace("  ","")
            page_text = "".join([s for s in page_text.splitlines(True) if s.strip("\r\n")])
            
        else:
            page_text = "PARERROR: SocialError"
    except:
        page_text = "PARERROR: ErrorCouldntParse"
    return page_text

## Raw Text Data

In [5]:
# Store data in json file
rawData = []
for url in open("./data/urls.csv"):
    scrapedData = scrapeData(url)
    if filter_scrape_data(scrapedData):
        rawData.append(scrapedData)
write_data("./data/rawData.json", rawData)

# Pre Processing

## Lemmatize Data

In [12]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "AV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    text_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        text_out.append(final)
    return text_out

filteredData = load_data("./data/rawData.json")
lemmatizedData = lemmatization(filteredData)
write_data("./data/lemmatizedData.json", lemmatizedData)

## Remove Stop Words

In [30]:
def preprocess_article(input_text):
    sentences_ted = []
    
    # Use regular expression to split the text into words
    sentences_ted = re.findall(r'\b\w+\b', input_text)
    sentences_ted = [token for token in sentences_ted if not token.isdigit()]


    # Load stopwords from a file into a set
    stoplist = set()
    with open('stopwords') as openfileobject: 
        for line in openfileobject:
            stoplist.add(line.strip())  # Use strip() to remove leading/trailing whitespace
    
    cleaned_text = " ".join(word for word in sentences_ted if word not in stoplist)

    return cleaned_text

lemmatizedData = load_data("./data/lemmatizedData.json")
filteredData = [preprocess_article(x) for x in lemmatizedData]
write_data("./data/filteredData.json", filteredData)

## Split Data to Words

In [31]:
def gen_words(texts):
    final = [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]
    return final
filteredData = load_data("./data/filteredData.json")
tokenizedData = gen_words(filteredData)
write_data("./data/tokenizedData.json", tokenizedData)

## Vectorize Data

In [41]:
tokenizedData = load_data("./data/tokenizedData.json")
id2word = corpora.Dictionary(tokenizedData)
id2word.filter_extremes(no_below=0.1, no_above=0.9)

corpus = [id2word.doc2bow(text) for text in tokenizedData]
write_data("./data/vectorizedData.json", corpus)

In [42]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 2), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 3), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 3), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 4), (52, 1), (53, 1), (54, 1), (55, 2), (56, 1), (57, 2), (58, 1), (59, 1), (60, 1), (61, 3), (62, 1), (63, 1), (64, 1), (65, 5), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 2), (88, 2), (89, 3), (90, 1), (91, 1), (92, 2), (93, 1), (94, 1), (95, 1), (96, 1), (97, 3), (98, 1), (99, 1), (100, 3), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 2), (108, 1), (109, 1), (110, 2)

# LDA Topic Modeling

In [45]:
def compute_coherence_values(dictionary, corpus, filteredData, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
        #model = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = num_topics, id2word=dictionary,random_state=100,passes=10)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, texts=filteredData, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
limit = 100; start=5; step=5;
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, filteredData, start=start, limit=limit, step=step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1952168685.py, line 8)

In [69]:
abs_coherence_values = [abs(x) for x in coherence_values]
best_result_index = abs_coherence_values.index(min(abs_coherence_values))
optimal_model = model_list[best_result_index]
# Select the model and print the topics
model_topics = optimal_model.show_topics(formatted=False)
print(f'''The {x[best_result_index]} topics gives the highest coherence score 
of {coherence_values[best_result_index]}''')

The 50 topics gives the highest coherence score 
of -1.5022958771375263
9


In [None]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
p

In [None]:
def topics_per_article(corpus, ldamodel):
    docs_per_topic = [[] for _ in range(len(corpus))]
    for doc_id, doc_bow in enumerate(corpus):
        doc_topics = ldamodel.get_document_topics(doc_bow)
        for topic_id, score in doc_topics:
            docs_per_topic[doc_id].append((topic_id, score))
    return docs_per_topic

rawData_filtered = load_data("rawData_filtered.json")
docs_per_topic = topics_per_article(corpus, optimal_model)


        

In [100]:
def topic_reviewer(corpus, ldamodel, text):
    topic_review = []
    urls = load_data("filtered_urls.json")
    
    for doc_id, doc_bow in enumerate(corpus):
        topic_data = []
        topic_data.append(urls[doc_id])
        topic_data.append(text[doc_id])

        for topic_id, score in ldamodel.get_document_topics(doc_bow):
            topic_words = ldamodel.print_topic(topic_id)
            topic_data.append((topic_words, str(score)))
        
        topic_review.append(topic_data)

    return topic_review
topic_review = topic_reviewer(corpus, optimal_model, rawData_filtered)
write_data("document_topics.json", topic_review)

In [33]:
error_count = 0
f = open("topic_data.txt", "w")
for count, url in enumerate(urls):
    try:
        doc = get_doc(url)  # Load the document
        article = preprocess_article(doc)  # Preprocess and convert to bag-of-words
        bow_doc = ([dictionary.doc2bow(sentence) for sentence in article])
        topic_distribution = optimal_model.get_document_topics(bow_doc)  # Get the topic distribution
        for topic in topic_distribution[0]:
            topic_words = optimal_model.show_topic(topic[0])
            f.write(str(topic_words))
            f.write("\n")
        f.write(str(url))
        f.write("\n")
        f.write(str(doc))
        f.write("\n\n\n")
    except:
        error_count +=1

print("errors: ", error_count)



errors:  85


In [34]:
for count, url in enumerate(urls):
    try:
        doc = get_doc(url)  # Load the document
        article = preprocess_article(doc)  # Preprocess and convert to bag-of-words
        bow_doc = ([dictionary.doc2bow(sentence) for sentence in article])
        topic_distribution = optimal_model.get_document_topics(bow_doc)  # Get the topic distribution
        for topic in topic_distribution:
            topic_words = optimal_model.show_topic(topic[0])
        print(topic_words)
        print(doc)
        print("\n\n\n")
    except:
        pass



In [None]:
def weighted_word_sentiment(word,weight, doc):
    sentiment = sentence_level_sentiment_of_word(word, weight, doc)
    weighted_sentiment = sentiment * weight
    return weighted_sentiment