# IS01081790 & IS01081770

In [1]:
# For text preprocessing 
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer, PorterStemmer
 
# For topic modeling 
from gensim import corpora 
from gensim.models import LdaModel 
import pandas as pd 
 
# Download NLTK Resources 
nltk.download('stopwords') 
nltk.download('punkt') 
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raihanah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Raihanah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Raihanah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read the data
data = pd.read_csv('news_dataset.csv')

In [3]:
# Use only the 'text' column and drop rows with null values
documents = data['text'].dropna().tolist()

In [5]:
# # Initialize stop words, lemmatizer, and stemmer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize text and convert to lowercase
    tokens = [token for token in tokens if token.isalnum() and not token.isdigit()]  # Remove non-alphanumeric tokens and numbers
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    tokens = [stemmer.stem(token) for token in tokens]  # Apply stemming
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Apply lemmatization
    return tokens


# Preprocess each document in the list
preprocessed_documents = [preprocess_text(doc) for doc in documents]

In [6]:
# Create a Gensim Dictionary object from the preprocessed documents 
dictionary = corpora.Dictionary(preprocessed_documents)  
 
# Filter out tokens that appear in less than 15 documents or more than 50% of the documents 
dictionary.filter_extremes(no_below=15, no_above=0.5) 
 
# Convert each preprocessed document into a bag-of-words representation using the dictionary 
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]  

In [7]:
# Run LDA 
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)  # Train an LDA model on the corpus with 2 topics using Gensim's LdaModel class



In [8]:
# empty list to store dominant topic labels for each document 
article_labels = [] 
 
# iterate over each processed document 
for i, doc in enumerate(preprocessed_documents): 
    # for each document, convert to bag-of-words representation 
    bow = dictionary.doc2bow(doc) 
    # get list of topic probabilities 
    topics = lda_model.get_document_topics(bow) 
    # determine topic with highest probability 
    dominant_topic = max(topics, key=lambda x: x[1])[0] 
    # append to the list 
    article_labels.append(dominant_topic) 

In [9]:
# Create DataFrame 
df_result = pd.DataFrame({"Article": documents, "Topic": article_labels}) 
 
# Print the DataFrame 
print("Table with Articles and Topic:") 
print(df_result) 
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      1
1      I recently posted an article asking what kind ...      1
2      \nIt depends on your priorities.  A lot of peo...      1
3      an excellent automatic can be found in the sub...      1
4      : Ford and his automobile.  I need information...      1
...                                                  ...    ...
11091  Secrecy in Clipper Chip\n\nThe serial number o...      0
11092  Hi !\n\nI am interested in the source of FEAL ...      1
11093  The actual algorithm is classified, however, t...      0
11094  \n\tThis appears to be generic calling upon th...      3
11095  \nProbably keep quiet and take it, lest they g...      2

[11096 rows x 2 columns]



In [12]:
# Print top terms for each topic 
for topic_id in range(lda_model.num_topics): 
    print(f"Top terms for Topic #{topic_id}:") 
    top_terms = lda_model.show_topic(topic_id, topn=10) 
    print([term[0] for term in top_terms]) 
    print()


Top terms for Topic #0:
['president', 'new', 'team', 'year', 'american', 'program', 'game', 'national', 'administration', 'university']

Top terms for Topic #1:
['key', 'use', 'file', 'system', 'chip', 'encryption', 'one', 'window', 'x', 'program']

Top terms for Topic #2:
['would', 'one', 'get', 'know', 'think', 'like', 'time', 'good', 'going', 'could']

Top terms for Topic #3:
['x', 'q', 'max', 'g', 'r', 'p', 'n', 'db', 'c', 'k']

Top terms for Topic #4:
['people', 'would', 'one', 'god', 'government', 'law', 'right', 'say', 'u', 'armenian']



In [13]:
# Print the top terms for each topic with weight 
print("Top Terms for Each Topic:") 
for idx, topic in lda_model.print_topics(): 
    print(f"Topic {idx}:") 
    terms = [term.strip() for term in topic.split("+")] 
    for term in terms: 
        weight, word = term.split("*") 
        print(f"- {word.strip()} (weight: {weight.strip()})") 
        print()

Top Terms for Each Topic:
Topic 0:
- "president" (weight: 0.011)

- "new" (weight: 0.008)

- "team" (weight: 0.007)

- "year" (weight: 0.007)

- "american" (weight: 0.006)

- "program" (weight: 0.005)

- "game" (weight: 0.005)

- "national" (weight: 0.005)

- "administration" (weight: 0.005)

- "university" (weight: 0.005)

Topic 1:
- "key" (weight: 0.013)

- "use" (weight: 0.010)

- "file" (weight: 0.009)

- "system" (weight: 0.009)

- "chip" (weight: 0.007)

- "encryption" (weight: 0.006)

- "one" (weight: 0.006)

- "window" (weight: 0.006)

- "x" (weight: 0.006)

- "program" (weight: 0.005)

Topic 2:
- "would" (weight: 0.014)

- "one" (weight: 0.011)

- "get" (weight: 0.010)

- "know" (weight: 0.009)

- "think" (weight: 0.009)

- "like" (weight: 0.009)

- "time" (weight: 0.007)

- "good" (weight: 0.006)

- "going" (weight: 0.006)

- "could" (weight: 0.006)

Topic 3:
- "x" (weight: 0.093)

- "q" (weight: 0.091)

- "max" (weight: 0.080)

- "g" (weight: 0.051)

- "r" (weight: 0.051)

-