In [9]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import gensim
import gensim.corpora as corpora
import nltk
import pyLDAvis
import pyLDAvis.gensim
import nltk
from nltk.corpus import wordnet, stopwords, words
from nltk.stem import WordNetLemmatizer

In [12]:
nltk.download('words')
stop_words = stopwords.words('english')

[nltk_data] Downloading package words to
[nltk_data]     /Users/thanawatthongpia/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [10]:
def preprocess_text(sentences):

    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    bigram = gensim.models.Phrases(sentences, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    
    data_words_nostops = [[word for word in doc if word not in stop_words] for doc in sentences]
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

    for sentence in data_words_bigrams:
        lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet.VERB) for token in sentence]
        lemmatized_sentences.append(lemmatized_tokens)
    return lemmatized_sentences

In [13]:
def import_text_to_list_of_lists(file_path, delimiter='\t'):
    result = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace and newlines
            sublist = line.split(delimiter)
            result.append(sublist)
    return result

In [11]:
def corpus_topic(corpus,num_topics=10):

    data_lemmatized = preprocess_text(corpus)
    id2word = corpora.Dictionary(data_lemmatized)  

    # Term Document Frequency 
    LDA_corpus = [id2word.doc2bow(text) for text in data_lemmatized]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=LDA_corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    
    # # Compute Perplexity
    # print('\nPerplexity: ', lda_model.log_perplexity(LDA_corpus))  
    # # a measure of how good the model is. lower the better.

    # # Compute Coherence Score
    # coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    # coherence_lda = coherence_model_lda.get_coherence()
    # print('Coherence Score: ', coherence_lda)
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, LDA_corpus, id2word)
    return vis

In [3]:
#Scraping links on news page with click load more button

driver = webdriver.Safari()
driver.get('https://scbtechx.io/news/')
wait_duration = 5

while True:
    try:
        # Find the button element by its class
        load_more_button = driver.find_element(By.CLASS_NAME, "wpr-load-more-btn")
        driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
        load_more_button.click()
        print("Clicked 'Load More' button")
        sleep(wait_duration)

    except ElementNotInteractableException:
        break

soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find("section", class_="wpr-grid elementor-clearfix grid-images-loaded")
a_tags = section.find_all("a")

keep_links = [link.get('href') for link in a_tags]
news_lisks = [link for link in set(keep_links) if link.startswith('https://scbtechx.io/') and 'https://scbtechx.io/category/' not in link]
driver.quit()

Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button


In [18]:
news_lisks[:10]

['https://scbtechx.io/news/pointxnews/news-scb-techx-pointx-launch/#respond',
 'https://scbtechx.io/news/pointxnews/enjoy-the-freedom-of-point-for-cash-redemption-with-the-pointx-new-year-giveaway/#respond',
 'https://scbtechx.io/news/others/news_scb_techx_internship_program_2023/',
 'https://scbtechx.io/news/others/news_scb_techx_internship_program_2023/#respond',
 'https://scbtechx.io/news/pointxnews/celebrate-the-festive-seasons-with-scan-pay-from-pointx/#respond',
 'https://scbtechx.io/news/leadershipnews/scb-techx-jonathan-sharp-cto/',
 'https://scbtechx.io/news/pointxnews/individuals-can-now-pay-their-income-tax-with-pointx-instead-of-cash/#respond',
 'https://scbtechx.io/news/others/track-interview-for-business-analyst-system-analyst/',
 'https://scbtechx.io/news/pointxnews/pointx-launches-extravagant-add-to-cart-at-x-store-campaign/#respond',
 'https://scbtechx.io/news/pointxnews/dear-coffee-lovers-dont-miss-pointxs-let-coffee-lovers-enjoy-coffee-every-day/']

In [17]:
techx_corpus = import_text_to_list_of_lists('src/Corpus_TechX.txt', delimiter='\t')
corpus_topic(preprocess_text(techx_corpus))


Perplexity:  -6.684810601212469
Coherence Score:  0.43309610510121493
