In [None]:
import pandas as pd
import gensim
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.decomposition import LatentDirichletAllocation

# Load the cleaned abstracts from the resources folder.
data = pd.read_csv('../resources/cleaned_processdata.csv')
# Blank out missing abstracts before casting: astype(str) on its own turns NaN
# into the literal string "nan", which would then be tokenised as a real word.
data['Processed_Abstracts'] = data['Processed_Abstracts'].fillna('').astype(str)

In [None]:
# Build the three gensim inputs from the processed abstracts:
#   texts      - one whitespace-split token list per abstract
#   dictionary - token <-> integer id mapping built over all token lists
#   corpus     - bag-of-words representation of every token list
texts = [abstract.split() for abstract in data['Processed_Abstracts'].tolist()]
dictionary = Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

Find optimal number of topics

In [None]:
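# NOTE: this coherence search is disabled (the whole cell is commented out);
# the next cell hard-codes optimal_num_topics instead of computing it here.
# import matplotlib.pyplot as plt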

# # Function to compute coherence for different topic numbers
# def compute_coherence_values(dictionary, corpus, texts, start, stop, step):
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, stop, step):
#         model = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5)
#         model_list.append(model)
#         coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherence_model.get_coherence())
#     return model_list, coherence_values

# # Define range for topics
# start, stop, step = 2, 12, 2  # Adjust range for quicker runtime

# # Compute coherence scores
# model_list, coherence_values = compute_coherence_values(dictionary, corpus, texts, start, stop, step)

# # Plot coherence values
# x = range(start, stop, step)
# plt.plot(x, coherence_values)
# plt.xlabel("Number of Topics")
# plt.ylabel("Coherence Score")
# plt.title("Optimal Number of Topics")
# plt.show()

# # Find the optimal number of topics
# optimal_num_topics = x[coherence_values.index(max(coherence_values))]
# optimal_num_topics


In [None]:
# Hard-coded topic count; presumably picked from the commented-out coherence
# search in the previous cell -- TODO confirm.
optimal_num_topics = 8

# Fixed seed for the LDA fit. Without it the model is initialised randomly and
# the topics (and every label/CSV derived from them) change on each run.
LDA_RANDOM_STATE = 42

# Fit the optimal LDA model
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=optimal_num_topics,
    id2word=dictionary,
    passes=10,
    random_state=LDA_RANDOM_STATE,
)

# Get top words for each topic: show_topic returns (word, weight) pairs, and
# only the 10 highest-ranked words are kept, in topic-id order.
top_words_per_topic = [
    [word for word, _ in lda_model.show_topic(topic_id, topn=10)]
    for topic_id in range(optimal_num_topics)
]

Automate topic naming

In [None]:
import os

import requests

# The Typhoon API key is read from the TYPHOON_API_KEY environment variable so
# the secret never lives in the notebook or its version history. The key that
# used to be hard-coded here should be treated as leaked and revoked.
API_KEY = os.environ.get("TYPHOON_API_KEY")
if not API_KEY:
    # Fail here with a clear message instead of sending unauthenticated requests.
    raise RuntimeError(
        "TYPHOON_API_KEY is not set; export it before running this notebook."
    )
API_URL = "https://api.opentyphoon.ai/v1/chat/completions"

# Function to generate topic labels using Typhoon API
def generate_topic_label_typhoon(topic_keywords, timeout=30):
    """Ask the Typhoon chat-completions API for a short label for one topic.

    Args:
        topic_keywords: Iterable of keyword strings for a single topic; they are
            joined with ", " and embedded in the prompt.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped text of the first completion choice, or None when the
        request fails, the API answers with a non-200 status, or the response
        body does not have the expected shape. Failures are reported with
        print() rather than raised, so a single bad topic does not abort the
        labelling of the others.
    """
    # Refined prompt for descriptive and concise topic labels
    prompt = (
        "Based on the following keywords, create a concise, descriptive label that summarizes the main topic: "
        f"{', '.join(topic_keywords)}."
    )

    # Construct the payload
    payload = {
        "model": "typhoon-v1.5x-70b-instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 50,
        "temperature": 0.6,
        "top_p": 0.95,
        "repetition_penalty": 1.05,
        "stream": False
    }

    # Send the request to the Typhoon API. The timeout keeps a stalled
    # connection from blocking the notebook forever.
    try:
        response = requests.post(
            API_URL,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {API_KEY}"
            },
            json=payload,
            timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: request failed, {exc}")
        return None

    # Check for successful response
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    # A 200 response can still carry a body that is not JSON or lacks the
    # expected choices/message/content structure; treat that as a failure too.
    try:
        return response.json()["choices"][0]["message"]["content"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print(f"Error: unexpected response format, {exc!r}")
        return None

# Request one label per topic, in the same order as top_words_per_topic
topic_labels = list(map(generate_topic_label_typhoon, top_words_per_topic))

# Show every topic index next to its label, with the label in double quotes
for idx, label in zip(range(len(topic_labels)), topic_labels):
    print(f"Topic {idx}: \"{label}\"")




Save topic_name

In [None]:
# Strip double quotes from the generated labels. generate_topic_label_typhoon
# returns None when a request fails, so non-string entries are passed through
# untouched (they become empty cells in the CSV) instead of raising
# AttributeError on .replace().
topic_labels = [
    label.replace('"', '') if isinstance(label, str) else label
    for label in topic_labels
]

# Save to CSV: one row per topic, in topic-id order
df = pd.DataFrame(topic_labels)
df.to_csv('../resources/topic_name.csv', index=False, encoding='utf-8')
# The message names the file that was actually written (topic_name.csv).
print(f"Data saved to 'topic_name.csv'. Fetched {len(df)} results.")

Save top_words_per_topic

In [None]:
# One row per topic, one column per ranked top word.
df = pd.DataFrame(top_words_per_topic)
# Save to CSV; skip the write when there are no topics so an empty file does
# not silently replace a previous result.
if not df.empty:
    df.to_csv('../resources/cleaned_topicdata.csv', index=False, encoding='utf-8')
    print(f"Data saved to 'cleaned_topicdata.csv'. Saved {len(df)} topics.")
else:
    print("No topic words to save. Check that the LDA model cell ran and produced topics.")
