In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.express as px

In [4]:
import os
import pandas as pd

# Define paths to training and testing directories
train_path = r'C:\Users\Dell\Desktop\UT Austin Summer\Intro to ML\STA 380\ReutersC50\C50train'
test_path = r'C:\Users\Dell\Desktop\UT Austin Summer\Intro to ML\STA 380\ReutersC50\C50test'

# Function to extract data from the given directory
def load_documents(directory, label_tag):
    document_list = []
    author_dirs = sorted(os.listdir(directory))
    
    for author in author_dirs:
        if not author.startswith('.'):
            doc_files = sorted(os.listdir(os.path.join(directory, author)))
            for doc in doc_files:
                doc_path = os.path.join(directory, author, doc)
                with open(doc_path, 'r') as file:
                    for content in file:
                        content = content.strip()
                        document_list.append([content, author, doc, label_tag])
    
    return document_list

# Load training and testing data
training_data = load_documents(train_path, label_tag='train')
testing_data = load_documents(test_path, label_tag='test')

# Convert the data to DataFrames
train_df = pd.DataFrame(training_data, columns=['Text_Content', 'Author', 'File', 'Set_Label'])
test_df = pd.DataFrame(testing_data, columns=['Text_Content', 'Author', 'File', 'Set_Label'])

# Combine train and test data if needed
combined_data = pd.concat([train_df, test_df])


In [5]:
combined_data

Unnamed: 0,Text_Content,Author,File,Set_Label
0,The Internet may be overflowing with new techn...,AaronPressman,106247newsML.txt,train
1,The National Consumers League said Wednesday t...,AaronPressman,106247newsML.txt,train
2,"The league, a non-profit consumer advocacy gro...",AaronPressman,106247newsML.txt,train
3,"The site, which collects reports directly from...",AaronPressman,106247newsML.txt,train
4,"""Consumers who suspect a scam on the Internet ...",AaronPressman,106247newsML.txt,train
...,...,...,...,...
44210,The suspension followed the magazine's publica...,WilliamKazer,58312newsML.txt,test
44211,Western diplomats and Chinese analysts have de...,WilliamKazer,58312newsML.txt,test
44212,The president of the outspoken Beijing Youth D...,WilliamKazer,58312newsML.txt,test
44213,"The clampdown has been fuelled by a call for ""...",WilliamKazer,58312newsML.txt,test


In [8]:
# Extract the training text data from the DataFrame
train_text_data = list(combined_data.loc[combined_data['Set_Label'] == 'train']['Text_Content'])

# Initialize the CountVectorizer with a maximum of 1000 features and English stop words
text_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
document_term_matrix = text_vectorizer.fit_transform(train_text_data)

# Define the number of topics for LDA
n_topics = 25
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(document_term_matrix)

# Initialize a dictionary to store topics and their top words
topic_word_mapping = {}

# Extract feature names and identify top words for each topic
vocab = text_vectorizer.get_feature_names_out()
for index, topic_distribution in enumerate(lda_model.components_):
    print(f"Topic {index + 1}:")
    top_indices = topic_distribution.argsort()[:-10 - 1:-1]
    top_terms = [vocab[i] for i in top_indices]
    print(top_terms)
    topic_word_mapping[index] = [(index + 1), top_terms]


Topic 1:
['united', 'states', 'trade', 'said', 'drug', 'china', 'ban', 'department', 'colombia', 'congress']
Topic 2:
['hong', 'kong', 'china', 'said', 'tung', 'chinese', 'people', 'territory', 'rule', 'says']
Topic 3:
['internet', 'corp', 'new', 'computer', 'said', 'software', 'technology', 'microsoft', 'network', 'services']
Topic 4:
['said', 'financial', 'chairman', 'president', 'statement', 'company', 'vice', 'board', 'right', 'street']
Topic 5:
['amp', 'local', 'long', 'market', 'competition', 'service', 'phone', 'cable', 'rules', 'companies']
Topic 6:
['told', 'reuters', 'director', 'interview', 'reporters', 'quality', 'telephone', 'areas', 'conference', 'managing']
Topic 7:
['china', 'said', 'beijing', 'chinese', 'official', 'taiwan', 'officials', 'economic', 'communist', 'state']
Topic 8:
['news', 'said', 'early', 'fund', '1997', 'joint', 'year', 'venture', 'start', '1998']
Topic 9:
['000', 'tonnes', 'said', 'saying', '100', 'cocoa', 'year', 'copper', '500', 'figures']
Topic 10

In [42]:
# Function to predict the most likely topic for a new document
def determine_topic(document):
    # Convert the document into a list
    doc_as_list = [document]
    
    # Vectorize the new document using the same vectorizer
    doc_vectorized = text_vectorizer.transform(doc_as_list)
    
    # Get the topic distribution for the new document
    doc_topic_distribution = lda_model.transform(doc_vectorized)
    
    # Determine the topic with the highest probability
    for prob_distribution in doc_topic_distribution:
        highest_probability = max(prob_distribution)
        for idx, probability in enumerate(prob_distribution):
            if probability == highest_probability:
                assigned_topic = topic_word_mapping[idx][0]
    return assigned_topic

# Filter the test set from the combined data
test_set = combined_data.loc[combined_data['Set_Label'] == 'test']

# Group the test set by Author and File and combine text content
test_set_grouped = test_set.groupby(['Author', 'File']).agg(
    Combined_Text = pd.NamedAgg(column='Text_Content', aggfunc=''.join)
)

# Reset the index of the grouped DataFrame
test_set_grouped = test_set_grouped.reset_index(drop=False)

# Apply the topic prediction function to the combined text
test_set_grouped['Predicted_Topic'] = test_set_grouped['Combined_Text'].apply(lambda text: determine_topic(text))

# Display the test set with predicted topics
test_set_grouped


Unnamed: 0,Author,File,Combined_Text,Predicted_Topic
0,AaronPressman,421829newsML.txt,U.S. Senators on Tuesday sharply criticized a ...,17
1,AaronPressman,424074newsML.txt,Two members of Congress criticised the Federal...,11
2,AaronPressman,42764newsML.txt,Commuters stuck in traffic on the Leesburg Pik...,3
3,AaronPressman,43033newsML.txt,A broad coalition of corporations went to Capi...,3
4,AaronPressman,433558newsML.txt,"On the Internet, where new products come and g...",3
...,...,...,...,...
2495,WilliamKazer,504283newsML.txt,China has scored new successes in its fight ag...,10
2496,WilliamKazer,504526newsML.txt,China has scored new successes in its fight ag...,10
2497,WilliamKazer,51502newsML.txt,China is on target with plans to to promote 10...,22
2498,WilliamKazer,522090newsML.txt,China may need to adjust the mix of its treasu...,16


In [43]:
df_freq = pd.DataFrame(test_set_grouped['Predicted_Topic'].value_counts()).reset_index(drop=False)
df_freq.value_counts()

Predicted_Topic  count
1                42       1
14               166      1
24               95       1
23               137      1
22               105      1
21               72       1
20               126      1
19               93       1
18               212      1
17               150      1
16               27       1
15               77       1
13               116      1
2                159      1
12               48       1
11               48       1
10               73       1
9                82       1
8                17       1
7                255      1
6                1        1
5                75       1
4                29       1
3                214      1
25               81       1
Name: count, dtype: int64

In [45]:
df_freq = pd.DataFrame(test_set_grouped['Predicted_Topic'].value_counts()).reset_index(drop=False)
df_freq = df_freq.rename(columns={'index':'Predicted_Topic', 'count':'Value_Count'})
df_freq = df_freq.astype({'Predicted_Topic': object})
df_freq = df_freq.sort_values('Value_Count', ascending=False)

fig = px.bar(df_freq, x='Predicted_Topic', y="Value_Count", title='Test Set Frequency Count')
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = df_freq['Predicted_Topic'],
        ticktext = df_freq['Predicted_Topic'],
        title = "Identified Topic"
    ),
    yaxis_title="Frequency",
    title='Test Set Frequency Count'
)

fig.show()

In [46]:
# Group the test set by 'Author' and 'Predicted_Topic' and count the frequency of each topic per author
author_topic_df = test_set_grouped.groupby(['Author', 'Predicted_Topic']).agg(
    topic_count=pd.NamedAgg(column='Predicted_Topic', aggfunc='count')
)

# Reset the index to make 'Author' and 'Predicted_Topic' regular columns
author_topic_df = author_topic_df.reset_index(drop=False)

# Sort by 'Author' and 'topic_count', then keep the most frequent topic for each author
author_topic_df_sorted = author_topic_df.sort_values(['Author', 'topic_count']).drop_duplicates('Author', keep='last')

# Reset the index again after sorting
author_topic_df_sorted = author_topic_df_sorted.reset_index(drop=True)

# Create a bar chart to show the most frequent topic per author
bar_chart = px.bar(author_topic_df_sorted, x="Author", y="Predicted_Topic", title='Most Frequent Topic Per Author')
bar_chart.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=author_topic_df_sorted['Author'],
        ticktext=author_topic_df_sorted['Author'],
        title="Authors"
    ),
    yaxis_title="Most Frequent Topic",
    title='Most Frequent Topic Per Author'
)

# Display the bar chart
bar_chart.show()
