# Group Members: 
# 1. Ayman Fikry bin Asmajuda (IS01081779)
# 2. Muhammad Khairin Asnawi bin Rosli (IS01082068) 

# 1.0 Read the Data

In [1]:
import pandas as pd

In [2]:
# Load the dataset
data = pd.read_csv('news_dataset.csv', nrows=1000)

# Preview the head of the data
data.head()

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043


In [3]:
# Select Score and Text column only
data = data[['text']]

# 2.0 Perform Text Pre-processing 

In [4]:
# Find the number of duplicated rows
duplicate_text = data.duplicated()
print(duplicate_text.sum())

41


In [5]:
# Remove the duplicated rows except the first one
data = data.drop_duplicates(keep='first')
data.shape

(959, 1)

In [6]:
# Check any rows available
print(data) 

                                                  text
0    I was wondering if anyone out there could enli...
1    I recently posted an article asking what kind ...
2    \nIt depends on your priorities.  A lot of peo...
3    an excellent automatic can be found in the sub...
4    : Ford and his automobile.  I need information...
..                                                 ...
995  Hi there,\n\nwhen I run Disk First Aid on my e...
996  \nI agree completely, but there was only a ref...
997  \n\n\n\n\n\n\nAlso, has anyone heard any rumor...
998  \n\nSince I repost this message again for the ...
999  Hi!\n\nI am looking for ftp sites (where there...

[959 rows x 1 columns]


## 2.1 Cleaning & Standardization

In [7]:
import re

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags
    cleaned_text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    cleaned_text = re.sub(r'http\S+', '', cleaned_text)
    
    # Remove special characters and digits
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
    
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    
    return cleaned_text.strip()

# Apply the clean_text function to the 'Text' column
data['text'] = data['text'].apply(clean_text)

# View the cleaned text data
data.head()


Unnamed: 0,text
0,i was wondering if anyone out there could enli...
1,i recently posted an article asking what kind ...
2,it depends on your priorities a lot of people ...
3,an excellent automatic can be found in the sub...
4,ford and his automobile i need information on ...


## 2.2 Remove null values

In [8]:
# Check for missing values
data.isnull().sum()

text    0
dtype: int64

In [9]:
# Select relevant columns for sentiment analysis
data = data[['text']]

## 2.3 Tokenization & Stopword Removal

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

# Tokenize the text into individual words
data['Tokens'] = data['text'].apply(tokenize_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2.4 Lemmatization 

In [11]:
# Download the WordNet lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize the tokens
data['Tokens'] = data['Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [13]:
# Join the tokens back into sentences
data['Preprocessed_Text'] = data['Tokens'].apply(lambda x: ' '.join(x))

# Save the preprocessed data to a new CSV file
data.to_csv('processed.csv', index=False)

# Preview the preprocessed data
print(data)

                                                  text  \
0    i was wondering if anyone out there could enli...   
1    i recently posted an article asking what kind ...   
2    it depends on your priorities a lot of people ...   
3    an excellent automatic can be found in the sub...   
4    ford and his automobile i need information on ...   
..                                                 ...   
995  hi there when i run disk first aid on my exter...   
996  i agree completely but there was only a refund...   
997  also has anyone heard any rumors that the new ...   
998  since i repost this message again for the seco...   
999  hi i am looking for ftp sites where there are ...   

                                                Tokens  \
0    [wondering, anyone, could, enlighten, car, saw...   
1    [recently, posted, article, asking, kind, rate...   
2    [depends, priority, lot, people, put, higher, ...   
3    [excellent, automatic, found, subaru, legacy, ...   
4    [ford, a

# 3.0 Perform LDA using Gensim

In [14]:
# For topic modeling
from gensim import corpora
from gensim.models import LdaModel

In [15]:
# Split the preprocessed text into a list of lists of tokens
preprocessed_documents = [doc.split() for doc in data['Preprocessed_Text']]

# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)

# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

# Train an LDA model on the corpus
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)

# Display the topics found by the LDA model
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.011*"mac" + 0.007*"bit" + 0.007*"apple" + 0.007*"mb" + 0.007*"card"')
(1, '0.011*"car" + 0.007*"oil" + 0.006*"problem" + 0.006*"get" + 0.006*"would"')
(2, '0.016*"car" + 0.010*"would" + 0.009*"like" + 0.008*"one" + 0.006*"good"')
(3, '0.019*"car" + 0.005*"would" + 0.005*"one" + 0.005*"get" + 0.005*"year"')
(4, '0.008*"drive" + 0.008*"car" + 0.005*"get" + 0.005*"know" + 0.005*"send"')


In [16]:
# empty list to store dominant topic labels for each document
article_labels = []

# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
    # for each document, convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # append to the list
    article_labels.append(dominant_topic)

# Create DataFrame
df_result = pd.DataFrame({"Article": data['Preprocessed_Text'], "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                               Article  Topic
0    wondering anyone could enlighten car saw day d...      2
1    recently posted article asking kind rate singl...      3
2    depends priority lot people put higher priorit...      2
3    excellent automatic found subaru legacy switch...      2
4    ford automobile need information whether ford ...      3
..                                                 ...    ...
995  hi run disk first aid external hard drive quan...      2
996  agree completely refund people bought gc quadr...      0
997  also anyone heard rumor new dock one cpu bette...      0
998  since repost message second time hope hear fol...      4
999  hi looking ftp site freeware shareware mac hel...      0

[959 rows x 2 columns]



In [17]:
# Print the top terms for each topic
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "mac" (weight: 0.011)
- "bit" (weight: 0.007)
- "apple" (weight: 0.007)
- "mb" (weight: 0.007)
- "card" (weight: 0.007)
- "one" (weight: 0.006)
- "use" (weight: 0.005)
- "system" (weight: 0.005)
- "problem" (weight: 0.005)
- "machine" (weight: 0.005)

Topic 1:
- "car" (weight: 0.011)
- "oil" (weight: 0.007)
- "problem" (weight: 0.006)
- "get" (weight: 0.006)
- "would" (weight: 0.006)
- "tire" (weight: 0.005)
- "brake" (weight: 0.004)
- "one" (weight: 0.004)
- "engine" (weight: 0.004)
- "dont" (weight: 0.003)

Topic 2:
- "car" (weight: 0.016)
- "would" (weight: 0.010)
- "like" (weight: 0.009)
- "one" (weight: 0.008)
- "good" (weight: 0.006)
- "dont" (weight: 0.005)
- "get" (weight: 0.005)
- "time" (weight: 0.005)
- "speed" (weight: 0.004)
- "new" (weight: 0.004)

Topic 3:
- "car" (weight: 0.019)
- "would" (weight: 0.005)
- "one" (weight: 0.005)
- "get" (weight: 0.005)
- "year" (weight: 0.005)
- "dealer" (weight: 0.004)
- "also" (weight: 0.004)
- "muc

# 4.0 Evaluate LDA using Coherence Score

In [18]:
# import library for Coherence Score

from gensim.models.coherencemodel import CoherenceModel

In [19]:
# Calculate the coherence score for the LDA model

coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

In [20]:
# Display the score

print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.3560


# 5.0 Interpret the result
### First and foremost, topic 0 appears to be more focus on general conversations on computers, where the weight of terms like "mac" seems to be the highest which indicate the importance of the term for this topic.
### In addition to that, topic 1, 2 and 3 emphasize towards vehicles, where the weight of terms like "car" are relatively high, which suggest a strong association for the three topics.
### The final topic, topic 4, seems to be related around vehicles and driving, where the weight of terms like "drive" and "car" are particularly high, indicating their significance in this topic.
### There is an obvious repitition of terms as the term "car" appears often in practically most of the topics, suggesting that the corpus is mostly concerned with conversations concerning vehicles particularly cars. This might imply that the dataset mostly relates to automobiles and associated problems.
### On a similar note, Topic 0 is unique in that it appears to be concentrated on technology and Apple products. This suggests that a portion of the conversations on a separate topic may be included in the dataset.
### The results shows a topic coherence score of 0.356 which indicates a moderate level of coherence. This implies that although the topics determined by the LDA model make sense in certain ways, there are still some improvements or enhancements to be made. 