In [None]:
import gensim
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gsdmm import MovieGroupProcess

# Read the cleaned tweet export and keep only the rows that have text to model.
df = pd.read_excel(
    '/Users/raphael/Desktop/Twitter Data/MergedFile_s_t_cleaned_copy.xlsx'
).dropna(subset='new_text_farm')

# Bag-of-words counts. With these settings, terms found in more than 90% of the
# documents (max_df) or in fewer than 5 documents (min_df) are left out of the
# vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9)

# Learn the vocabulary, count the terms, and expand the sparse result into a
# dense NumPy array (one row per document, one column per vocabulary term).
tf = vectorizer.fit_transform(df['new_text_farm']).toarray()

In [None]:
# Apply the NMF model to short-text (tweet) topic modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Number of topics to extract, and the seed that makes the factorization
# repeatable. Without a fixed seed the initialisation is not pinned, so topic
# numbers/contents may change from one run to the next and every later
# reference to "Topic k" becomes unstable.
N_TOPICS = 23
NMF_RANDOM_STATE = 42

# Re-applying the filter is a no-op when the rows were already dropped; it keeps
# this cell usable on its own.
df = df.dropna(subset = 'new_text_farm')

# Create a TfidfVectorizer object to convert the text data to a matrix of TF-IDF
# features. From here on `vectorizer` refers to this TF-IDF vectorizer.
vectorizer = TfidfVectorizer(max_df=0.9, min_df=5)

# Fit and transform the text data into TF-IDF features
tfidf = vectorizer.fit_transform(df['new_text_farm'])

# Create an NMF object with N_TOPICS topics and a fixed seed
nmf = NMF(n_components=N_TOPICS, random_state=NMF_RANDOM_STATE)

# Fit the NMF model to the TF-IDF features
nmf.fit(tfidf)

# Vocabulary terms, positioned to match the columns of `tfidf` / `nmf.components_`
feature_names = vectorizer.get_feature_names_out()

# Summarise every topic by its highest-weighted vocabulary terms
def display_topics(model, feature_names, no_top_words):
    """Build a table of the strongest terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``; it is iterated row by row, one
            row of term weights per topic.
        feature_names: Sequence giving the term for each position of a
            ``components_`` row.
        no_top_words: How many of the highest-weighted terms to keep per topic.

    Returns:
        pandas.DataFrame with two columns per topic, ``"Topic <k> words"`` and
        ``"Topic <k> weights"``, ordered from the heaviest term downwards.
        Weights are strings formatted to one decimal place.
    """
    columns = {}
    for topic_no, weights in enumerate(model.components_):
        # Positions of the heaviest terms, largest weight first; computed once
        # and shared by the words column and the weights column.
        strongest = weights.argsort()[::-1][:no_top_words]
        columns["Topic %d words" % topic_no] = [
            f"{feature_names[pos]}" for pos in strongest
        ]
        columns["Topic %d weights" % topic_no] = [
            f"{weights[pos]:.1f}" for pos in strongest
        ]
    return pd.DataFrame(columns)

# Number of highest-weighted terms listed for each topic in the overview table.
no_top_words = 12
topic_df = display_topics(
    model=nmf,
    feature_names=feature_names,
    no_top_words=no_top_words,
)
display(topic_df)


In [None]:
# Find the representative sentence of each topic

# NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data being
# installed in the environment — confirm before a fresh run.
import nltk
from nltk.tokenize import sent_tokenize

# Document-topic weights: one row per document, one column per topic
doc_topic_probs = nmf.transform(tfidf)

# Topic-term weights of the fitted model
topic_word_probs = nmf.components_

# Strongest topic of every document
topic_index = np.argmax(doc_topic_probs, axis=1)

# One "Topic k: <sentence>" entry per topic, in topic order
representative = []
# Candidate sentences of the topic currently being processed
topic_sentences = []

# Loop through each topic
for topic_idx in range(nmf.n_components):
    # Rebuild the candidate pool from scratch for every topic. If the list were
    # carried over, sentences from documents assigned to earlier topics would
    # compete here, and the "No sentence available" branch could never be
    # reached once any earlier topic had contributed a sentence.
    topic_sentences = []

    # Visit only the documents whose strongest topic is this one.
    for doc_pos in np.flatnonzero(topic_index == topic_idx):
        # Tokenize the document into sentences and add them to the pool
        topic_sentences.extend(sent_tokenize(df['new_text_farm'].iloc[doc_pos]))

    # Get the sentence with the highest weight for this topic
    if topic_sentences:
        # Score each sentence on its own with the already fitted vectorizer and model
        topic_sentence_probs = nmf.transform(vectorizer.transform(topic_sentences))
        most_representative_sentence_idx = np.argmax(topic_sentence_probs[:, topic_idx])
        most_representative_sentence = topic_sentences[most_representative_sentence_idx]
        representative.append(f"Topic {topic_idx}: {most_representative_sentence}")
    else:
        # No document has this topic as its strongest one
        representative.append(f"Topic {topic_idx}: No sentence available")

SentenceList = pd.DataFrame(representative)
print(SentenceList)

In [None]:
# Create Document — Topic Matrix

# Minimum share the strongest topic must hold in a post before that topic is
# reported as the post's dominant topic.
DOMINANCE_THRESHOLD = 0.35

output = nmf.transform(tfidf)
# column names
topicnames = ['Topic' + str(i) for i in range(nmf.n_components)]
# index names
docnames = ['Post' + str(i) for i in range(len(df['new_text_farm']))]

# Normalize every row to sum to 1 in a single vectorized step instead of a
# per-row Python loop. Posts whose topic weights are all zero have no defined
# shares: 0/0 yields NaN for them, and the NumPy warning is silenced on purpose.
row_sums = output.sum(axis=1, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
    normalized = output / row_sums
has_topic = row_sums[:, 0] > 0

# Make the pandas dataframe
df_document_topic = pd.DataFrame(normalized, columns=topicnames, index=docnames)

# Topic positions of every post ordered by descending share. The stable sort
# resolves ties in favour of the lowest topic index, so column 0 agrees with
# np.argmax and column 1 is always a different topic (the runner-up).
# Working on the NumPy array avoids integer-key lookups on a label-indexed
# Series, which pandas has deprecated.
topic_order = np.argsort(-normalized, axis=1, kind='stable')

# Get dominant topic for each document; NaN when the strongest share is below
# the threshold or the post has no topic weight at all (NaN compares False).
top_share = df_document_topic.max(axis=1)
dominant_topic = np.where(top_share >= DOMINANCE_THRESHOLD, topic_order[:, 0], np.nan)

# Second-strongest topic of each document; -1 marks posts without any topic
# weight (the same sentinel the previous row-wise argsort produced for them).
dominant_topic1 = pd.Series(np.where(has_topic, topic_order[:, 1], -1), index=docnames)


df_document_topic['dominant_topic'] = dominant_topic
df_document_topic['dominant_topic1'] = dominant_topic1

print(df_document_topic.head())