In [None]:
# import necessary libraries
import nltk
import os

# The NLTK data directory and the resources to download are hard-coded below

# Define where to store the NLTK data (within workspace for write permissions)
nltk_data_dir = os.path.expanduser("~/workspace/nltk_data")
# Re-running this cell must not keep appending the same search path
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download every NLTK resource the later cells rely on into the defined directory:
#   'punkt' / 'punkt_tab' -> word_tokenize (newer NLTK releases look up 'punkt_tab')
#   'stopwords'           -> stopwords.words('english')
#   'wordnet'             -> WordNetLemmatizer
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, download_dir=nltk_data_dir)



In [3]:
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import os 

# Create an array of token without any stopwords and lowercase
def createTokensWithoutStopwords(file_path):
    """Read a text file and return its lowercase word tokens without stopwords.

    The file is read as UTF-8, lowercased and tokenized with word_tokenize.
    Only purely alphabetic tokens are kept (numbers and punctuation are
    dropped), and tokens found in the English stopword list are removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    with open(file_path, mode='r', encoding='utf-8') as file:
        document = file.read()

    # Fetch the stopword list once and keep it as a set: evaluating
    # stopwords.words('english') inside the filter would rebuild the list for
    # every single token and search it linearly.
    english_stopwords = set(stopwords.words('english'))

    # Tokenize the lowercased text, keeping alphabetic non-stopword tokens only
    return [
        token
        for token in word_tokenize(document.lower())
        if token.isalpha() and token not in english_stopwords
    ]


# Counter object which is a dictionary with word and its count
def creatBagOfWord(tokenized_doc):
    """Build a bag-of-words from a tokenized document.

    Each token is passed through WordNetLemmatizer.lemmatize() (default
    arguments) and the resulting lemmas are counted.

    Args:
        tokenized_doc: Iterable of string tokens.

    Returns:
        collections.Counter: Mapping of lemma -> number of occurrences.
    """
    # A single lemmatizer instance is shared by all tokens of the document
    to_lemma = WordNetLemmatizer().lemmatize

    return Counter(map(to_lemma, tokenized_doc))


# Creating different text files into multi-dimensional array of tokens
def createTokensOfDifferentArticles(dir_path):
    """Tokenize every text file found directly inside a directory.

    Args:
        dir_path: Directory containing one text file per article.

    Returns:
        tuple[list[list[str]], list[str]]: ``(tokenOfArticles, filename)`` where
        ``tokenOfArticles[i]`` holds the tokens produced by
        createTokensWithoutStopwords() for the file named ``filename[i]``.
        Files are processed in sorted name order.
    """
    tokenOfArticles = []
    filename = []
    # os.listdir() returns entries in arbitrary order; sort for reproducible runs
    for entry in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, entry)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): they cannot
        # be opened as text files.
        if not os.path.isfile(file_path):
            continue
        tokenOfArticles.append(createTokensWithoutStopwords(file_path))
        filename.append(entry)
    return (tokenOfArticles, filename)



In [4]:
from gensim.corpora.dictionary import Dictionary
# Create a corpus, a bag of word with integer IDs.

# Giving words integer IDs
def createDictionary(article_tokens):
    """Build a gensim Dictionary from the tokenized articles.

    Args:
        article_tokens: List of token lists, one per article.

    Returns:
        The Dictionary constructed from ``article_tokens``.
    """
    return Dictionary(article_tokens)

# Create a corpus: one bag-of-words per article, built with the dictionary's doc2bow()
def createCorpus(dictionary_bow, article_tokens):
    """Convert every tokenized article into its bag-of-words representation.

    Args:
        dictionary_bow: Object providing ``doc2bow(tokens)`` (the dictionary
            built by createDictionary()).
        article_tokens: Iterable of token lists, one per article.

    Returns:
        list: ``dictionary_bow.doc2bow(tokens)`` for each article, in order.
    """
    bags = []
    for tokens in article_tokens:
        bags.append(dictionary_bow.doc2bow(tokens))
    return bags

# Getting information of words from corpus
# the words need to be provided in lowercase
def getWordInfoFromCorpus(corpus_data, dictionary_data, word, filenames):
    """Print how often ``word`` occurs in each document of the corpus.

    Args:
        corpus_data: Iterable of documents, each a list of
            ``(word_id, count)`` pairs.
        dictionary_data: Object whose ``token2id`` mapping translates a word
            into its integer ID.
        word: The word to look up; it is matched as given, so pass it in
            lowercase.
        filenames: Sequence of file names, parallel to ``corpus_data``.

    Returns:
        None. One line is printed per document containing the word, or a
        single message when the word is unknown or appears nowhere.
    """
    word_id = dictionary_data.token2id.get(word)
    if word_id is None:
        print(f"'{word}' doesn't exist in the dictionary.")
        return

    found = False
    for index, bow in enumerate(corpus_data):
        # Turn the (word_id, count) pairs into a dict to look up this word's count
        count = dict(bow).get(word_id)
        if count:
            print(f"'{word}' appears {count} times in this '{filenames[index]}'.")
            found = True

    if not found:
        print(f"'{word}' doesn't appear in any document.")

In [5]:
# Build the term frequency-inverse document frequency (TF-IDF) model
# Determine the most important words in each document
# The shared word across the documents should be down-weighted 
from gensim.models.tfidfmodel import TfidfModel

# Returns a TfidfModel object; index it with a document from the corpus (tfidf_data[doc]) to get that document's weights
def create_Tfidf_data(corpus_data):
    """Build a TfidfModel from the bag-of-words corpus.

    Args:
        corpus_data: The corpus produced by createCorpus().

    Returns:
        The TfidfModel constructed from ``corpus_data``.
    """
    return TfidfModel(corpus_data)

# Extract the top words of the articles
def extractTopic(tfidf_corpus, dictionary_data, corpus_data, top_n=5):
    """Return the highest-weighted words of every document.

    Args:
        tfidf_corpus: Object that, indexed with a document from
            ``corpus_data``, yields ``(word_id, weight)`` pairs (the model
            returned by create_Tfidf_data()).
        dictionary_data: Mapping from word ID back to the word.
        corpus_data: Iterable of bag-of-words documents.
        top_n: How many words to keep per document (default 5).

    Returns:
        list[list[str]]: For each document, up to ``top_n`` words ordered from
        highest to lowest weight.
    """
    topicArticle = []
    for bow in corpus_data:
        # Rank this document's (word_id, weight) pairs by weight, highest first
        ranked = sorted(tfidf_corpus[bow], key=lambda pair: pair[1], reverse=True)
        # Keep only the top_n entries and translate their IDs back to words
        topicArticle.append([dictionary_data[word_id] for word_id, _ in ranked[:top_n]])

    return topicArticle

# Describe the topic of the particular articles
def topicIdentification(extracted_topic, articleNames):
    """Print a one-line topic summary for every article.

    Args:
        extracted_topic: List of word lists, parallel to ``articleNames``
            (the result of extractTopic()).
        articleNames: Names of the articles, in the same order.

    Returns:
        None. One line is printed per article.
    """
    # enumerate() already advances the index; no manual increment is needed
    for index, article in enumerate(articleNames):
        # Turn the extracted topic words into a single space-separated string
        topics = " ".join(extracted_topic[index])
        print(f"'{topics}' are the topic of {article}.")
    


In [None]:
# Self-Exploration how TFIDF's math works

import math
from collections import defaultdict
# Building own Tfidf calculation 
# 1. Compute Term Frequency (TF): TF(t, d) = (count of term t in document d) / (total number of terms in document d)
# 2. Compute Document Frequency (DF): DF(t) = number of documents containing term t
# 3. Compute Inverse Document Frequency (IDF): IDF(t) = log(total number of documents / (1 + DF(t))). The "1 +" ensures there is no division by zero.

def computeTfidf(corpus_data):
    """Compute TF-IDF weights by hand for a bag-of-words corpus.

    For every term of every document:
        TF    = count of the term in the document / total count in the document
        IDF   = log(number of documents / (1 + documents containing the term))
        TFIDF = TF * IDF

    Args:
        corpus_data: Sequence of documents, each a list of
            ``(word_id, count)`` pairs.

    Returns:
        list[list[tuple]]: One list per document holding
        ``(word_id, tfidf)`` pairs in the document's original order.
    """
    n_documents = len(corpus_data)

    # Document frequency: in how many documents does each word ID occur?
    doc_freq = defaultdict(int)
    for bow in corpus_data:
        for term_id, _ in bow:
            doc_freq[term_id] += 1

    weighted_corpus = []
    for bow in corpus_data:
        # Total number of term occurrences in this document
        doc_length = sum(n for _, n in bow)
        weighted_corpus.append([
            (term_id, (n / doc_length) * math.log(n_documents / (1 + doc_freq[term_id])))
            for term_id, n in bow
        ])

    return weighted_corpus
        
# `corpus_data` is only created by the pipeline cell further down, so on a fresh
# kernel (Restart & Run All) this call would raise NameError. Fall back to a
# tiny hand-made corpus of (word_id, count) pairs when it is not defined yet.
_demo_corpus = globals().get("corpus_data", [[(0, 2), (1, 1)], [(0, 1), (2, 3)]])
computeTfidf(_demo_corpus)

In [6]:
# Tokenize every article in the data directory.
# Returns a tuple of two parallel lists: the token list of each article and the matching file names
token_articles, filename = createTokensOfDifferentArticles("../data/Custom_Articles")

# Build the dictionary from the tokens, then the bag-of-words corpus from both
dictionary_data = createDictionary(token_articles)
corpus_data = createCorpus(dictionary_data, token_articles)


# Optional check: uncomment to print how often a single (lowercase) word occurs per article
#getWordInfoFromCorpus(corpus_data, dictionary_data, 'deepseek', filename)

# Build the TF-IDF model from the corpus
tfidf_data = create_Tfidf_data(corpus_data)

# Extract the top-weighted words of each article
topicOfArticle = extractTopic(tfidf_data, dictionary_data, corpus_data)

# Print the extracted topic words next to each article's file name
topicIdentification(topicOfArticle, filename)

'college quincy massachusetts community university' are the topic of QuincyCollege_data.txt.
'applause thank america nation ever' are the topic of WH_inauguration_data.txt.
'q leavitt karoline trump room' are the topic of WH_briefingStatement.txt.
