# Imports

In [None]:
from pprint import pprint
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim.matutils import corpus2csc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import re
import string

# import pyLDAvis.gensim
# import pickle
# import pyLDAvis


# Fetch the NLTK resources used later in this notebook.
nltk.download('punkt')      # tokenizer models, needed by nltk.word_tokenize
nltk.download('wordnet')    # lexical database, needed by WordNetLemmatizer
nltk.download('stopwords')  # stopword lists, needed by stopwords.words('english')

# Data Importing

In [None]:
# Load the corpus into a DataFrame (path is specific to the runtime's /content directory).
df = pd.read_csv('/content/FinalCorpus2.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Generated_Name,text
0,ATCM45_ip001_e.docx,report by the cep observer to the xxxvii scar ...
1,ATCM45_ip002_e.docx,report by the ccamlr observer to the forty fif...
2,ATCM45_ip003_rev1_e.docx,report by the united kingdom as depositary gov...
3,ATCM45_ip004_e.docx,report of the depositary government of the ant...
4,ATCM45_ip005_e.docx,republic of belarus in the systemof the antarc...


In [None]:
# Report column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44907 entries, 0 to 44906
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Generated_Name  44907 non-null  object
 1   text            44907 non-null  object
dtypes: object(2)
memory usage: 701.8+ KB


# Data Cleaning

In [None]:
# Module-level stopword list read by remove_stopwords().
# NOTE(review): despite the name, this holds only the NLTK English stopwords;
# no project-specific words are added here.
extended_stopwords = stopwords.words('english')

In [None]:
# Requires the NLTK resources downloaded in the imports cell (punkt, wordnet, stopwords).
def preprocess_text(text: str) -> str:
    """
    Normalise a raw document into a space-separated string of lemmas.

    Steps, in order: delete every character that is not an ASCII letter or
    whitespace, lowercase, tokenize with NLTK, discard stopwords and
    single-character tokens, lemmatize with WordNet, and re-join with spaces.

    Parameters:
    text (str): The input text to be preprocessed.

    Returns:
    str: The preprocessed text.
    """
    # Non-letters are deleted rather than replaced by a space, so a hyphenated
    # word such as "well-known" collapses to "wellknown".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # Use a set so each token is checked in O(1) instead of scanning a list.
    # SW_list (extra stopwords) is expected to be defined elsewhere in the
    # notebook; it is not visible in this cell.
    stopword_set = set(stopwords.words('english')).union(SW_list)

    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if len(token) > 1 and token not in stopword_set
    ]
    return ' '.join(lemmas)


In [None]:
# Store the cleaned, lemmatized text in a new column next to the raw 'text' column.
df['Text_preprocessed'] = df['text'].apply(preprocess_text)

# Topic Modelling

## Scikit Learn LDA

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


def assign_topics_to_dataframe_LDA_V1(df, text_column, n_topics=50, n_top_words=20):
    """
    Fit a scikit-learn LDA model on a text column and add per-topic scores to the DataFrame.

    The documents are vectorised with a CountVectorizer (``min_df=0.10``,
    ``max_df=0.85``), an online LDA model is fitted, one ``"Topic {i} Score"``
    column per topic is added to ``df`` in place, and the top words of every
    topic are printed.

    :param df: Input pandas DataFrame (modified in place)
    :param text_column: Name of the column containing text data
    :param n_topics: The number of topics to extract (default is 50)
    :param n_top_words: The number of top words printed for each topic (default is 20)

    :return: The same DataFrame, with the document-topic score columns added
    """
    # SW_list (extra stopwords) is expected to be defined elsewhere in the
    # notebook; it is not visible in this cell.
    stop_words = stopwords.words('english')
    stop_words.extend(SW_list)

    count_vectorizer = CountVectorizer(max_df=0.85, min_df=0.10, stop_words=stop_words)
    dtm = count_vectorizer.fit_transform(df[text_column])

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method='online',
        learning_offset=50.,
        random_state=1,
        verbose=1,
    ).fit(dtm)

    # For each topic, take the indices of the n_top_words largest weights
    # (argsort ascending, then a reversed slice from the end).
    feature_names = count_vectorizer.get_feature_names_out()
    topic_dict = {}
    for topic_idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        topic_dict[topic_idx] = ', '.join(feature_names[i] for i in top_indices)

    # Infer the document-topic matrix once; it is the only output that is used.
    topic_scores = lda.transform(dtm)
    for i in range(n_topics):
        df[f"Topic {i} Score"] = topic_scores[:, i]

    for topic_idx, topic_words in topic_dict.items():
        print(f"Topic #{topic_idx}: {topic_words}")

    return df
# Work on a copy: the function adds its score columns in place, and the Gensim
# section further down still needs the 'text' and 'Text_preprocessed' columns
# on the original ``df``. The CSV written here is unchanged.
sklearn_df = assign_topics_to_dataframe_LDA_V1(df.copy(), 'Text_preprocessed')
sklearn_df = sklearn_df.drop(columns=['Text_preprocessed', 'text'])
sklearn_df.to_csv('out_Scikit_50_stopwordsExtended_TopicLoadingScores.csv')

## Gensim Multicore LDA

In [None]:
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
# IPython shell escape: downloads the small English spaCy model (notebook-only syntax).
!python -m spacy download en_core_web_sm
import spacy
from gensim.models import CoherenceModel
# Module-level spaCy pipeline read by lemmatization(); the parser and NER
# components are disabled, and only token.lemma_ and token.pos_ are read there.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [None]:
def remove_stopwords(texts):
    """
    Return ``texts`` with every stopword removed from each document.

    Each document is stringified and re-tokenized with gensim's
    ``simple_preprocess`` before filtering against the module-level
    ``extended_stopwords``.

    :param texts: Iterable of documents (token lists or strings)
    :return: List of token lists, one per document
    """
    # Build the lookup once per call so each token is an O(1) set test
    # instead of a scan of the stopword list.
    stopword_set = set(extended_stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Lemmatize tokenized documents, keeping only tokens with an allowed part of speech.

    Each token list is joined with spaces, run through the module-level spaCy
    pipeline ``nlp``, and reduced to the lemmas of the tokens whose ``pos_``
    is in ``allowed_postags``.

    :param texts: Iterable of token lists
    :param allowed_postags: Part-of-speech tags to keep
    :return: List of lemma lists, one per input document
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]
def sent_to_words(sentences):
    """
    Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted to ``str`` first and tokenized with
    ``deacc=True``; one token list is yielded per input item.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw in sentences:
        tokens = tokenize(str(raw), deacc=True)
        yield tokens
def assign_topics_to_dataframe_LDA_Optimized(df, text_column, n_topics=30, n_top_words=20):
    """
    Fit a Gensim LdaMulticore model on a text column and add per-topic scores to the DataFrame.

    Pipeline: tokenize -> learn bigram phrases -> remove stopwords -> merge
    bigrams -> spaCy lemmatization (nouns, adjectives, verbs, adverbs) ->
    bag-of-words corpus -> LDA. The topics are printed, then one
    ``Topic_{i}_Score`` column per topic is added to ``df`` in place.

    :param df: Input pandas DataFrame (modified in place)
    :param text_column: Name of the column containing text data
    :param n_topics: The number of topics to extract (default is 30)
    :param n_top_words: The number of top words printed for each topic (default is 20)

    :return: The same DataFrame, with the document-topic score columns added
    """
    data = df[text_column].values.tolist()
    data_words = list(sent_to_words(data))

    # The phrase model is learned on the full token lists and then applied to
    # the stopword-filtered ones.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

    # lemmatization() reads the module-level spaCy pipeline ``nlp``, so no
    # pipeline is loaded here.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    id2word = corpora.Dictionary(data_lemmatized)
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    lda = gensim.models.LdaMulticore(corpus=corpus,
                                     id2word=id2word,
                                     num_topics=n_topics,
                                     random_state=42,
                                     alpha=0.1,
                                     eta='auto',
                                     chunksize=100,
                                     passes=10,
                                     per_word_topics=True)

    # Report the fitted topics.
    pprint(lda.print_topics())
    print('\n\n\n\n')
    topics = lda.show_topics(num_topics=n_topics, num_words=n_top_words, log=False, formatted=False)
    for topic_id, topic in topics:
        print(f"Topic: {topic_id}")
        print(f"Words: {[word for word, _ in topic]}")

    # One inference pass per document. Rows start at zero so that any topic
    # missing from the returned list keeps a score of 0.
    topic_scores_matrix = []
    for bow in corpus:
        row = [0] * n_topics
        for topic, score in lda.get_document_topics(bow, minimum_probability=0):
            row[topic] = score
        topic_scores_matrix.append(row)

    # Add topic scores to the dataframe
    for i in range(n_topics):
        df.loc[:, f'Topic_{i}_Score'] = [row[i] for row in topic_scores_matrix]

    return df
df = assign_topics_to_dataframe_LDA_Optimized(df, 'text')
# The Gensim pipeline never reads 'Text_preprocessed', so it may be absent
# (e.g. after reloading the CSV). errors='ignore' keeps a missing column from
# aborting the export after the long training run.
df = df.drop(columns=['Text_preprocessed', 'text'], errors='ignore')
df.to_csv('out_Gensim_30.csv')

# Reformatting the outputs

In [None]:
import csv

# Step 1: Parse "topic: item, item, ..." lines into {topic: [items]}.
data_dict = {}
with open('/content/Topics.txt', 'r') as file:
    for line_number, line in enumerate(file, start=1):
        line_content = line.strip()
        if not line_content:
            continue  # ignore empty lines

        # Split on the first colon only, so items may themselves contain colons.
        topic, separator, data = line_content.partition(":")
        if not separator:
            raise ValueError(
                f"/content/Topics.txt line {line_number} has no ':' separator: {line_content!r}"
            )

        # A later line with the same topic name replaces the earlier one.
        data_dict[topic.strip()] = [item.strip() for item in data.split(',')]

if not data_dict:
    raise ValueError("/content/Topics.txt contains no topic lines")

# Step 2: Write one row per topic and one numbered column per item position.
# The frame is built directly rather than written column-wise and re-read:
# parsing the file back with pandas would turn items such as "null" or "nan"
# into missing values.
max_rows = max(len(items) for items in data_dict.values())
padded_rows = [items + [None] * (max_rows - len(items)) for items in data_dict.values()]
df_topics = pd.DataFrame(padded_rows, index=list(data_dict.keys()), columns=range(max_rows))
df_topics.to_csv('/content/Topics.csv')