In [None]:
# Environment setup: install / upgrade the third-party packages this notebook imports.
# No versions are pinned, so every run installs whatever release is current at that time.
!pip install bertopic
!pip install FuzzyTM
!pip install --upgrade tensorflow
!pip install --upgrade umap-learn
# NOTE: bertopic was already installed by the first line of this cell; this line only upgrades it.
!pip install --upgrade bertopic
!pip install --upgrade gensim
!pip install --upgrade spacy
!pip install numpy



# NLP - Assignment 2

## Imports

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bertopic import BERTopic
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from FuzzyTM import FLSA_W
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import gensim
from gensim import corpora

## Preprocess Data

In [None]:
# Location of the raw news CSV, relative to the notebook's working directory.
path = "./us_equities_news_dataset.csv"

# Load the news dataset from `path` so the file location is defined in exactly one place.
news_dataset = pd.read_csv(path)
news_dataset.head()

In [None]:
# Filter dataset to only include articles with 'Nvidia' in the content
# (case-insensitive; rows with missing content are excluded via na=False).
# .copy() makes the result an independent frame, so later column assignments do not
# write into a slice of news_dataset (avoids pandas' SettingWithCopyWarning).
nvidia_dataset = news_dataset[news_dataset['content'].str.contains('Nvidia', case=False, na=False)].copy()

In [None]:
# Concatenate title and content columns.
# A missing title is treated as an empty string: NaN + str would otherwise turn the whole
# combined text into NaN, and the tokenizer applied to 'content' later expects a string.
# assign() builds a new frame instead of writing into the filtered slice in place.
nvidia_dataset = nvidia_dataset.assign(
    content=nvidia_dataset['title'].fillna('') + ' ' + nvidia_dataset['content']
)

In [None]:
# Make sure the NLTK resources used below are available on a fresh environment:
# the tokenizer models (word_tokenize), the stopword list and WordNet (lemmatizer).
# Without them the first use raises LookupError. quiet=True keeps the cell output clean,
# and a resource that is already installed is not fetched again.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

# Shared helpers used by preprocess_text.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text,
                    remove_punctuation=True,
                    remove_stopwords=True,
                    lemmatize=False,
                    stem=False,
                    remove_short_words=False,
                    remove_rare_words=False,
                    remove_numbers=True,
                    min_word_length=2):
    """
    Tokenize a text and apply the selected clean-up steps.

    Steps run in this fixed order: tokenize, lowercase, strip digits, strip
    non-alphabetic characters, drop empty tokens, drop stopwords, lemmatize,
    stem, drop short words.

    Parameters:
    - text: The text to preprocess (a string).
    - remove_punctuation: Whether to strip every character outside a-z / A-Z from each token.
      Note that this also strips digits.
    - remove_stopwords: Whether to drop tokens found in the module-level `stop_words` set.
    - lemmatize: Whether to pass each token through the module-level `lemmatizer`.
    - stem: Whether to pass each token through the module-level `stemmer`.
    - remove_short_words: Whether to drop tokens shorter than `min_word_length`.
    - remove_rare_words: Accepted for interface compatibility; this function does not use it.
    - remove_numbers: Whether to strip digit runs from each token.
    - min_word_length: Minimum token length kept when `remove_short_words` is True.

    Returns:
    - List of preprocessed tokens. Tokens that become empty after stripping
      (for example "," or "2020") are never included.
    """

    # Tokenization, then lowercasing of every token
    tokens = [token.lower() for token in word_tokenize(text)]

    # Remove numbers if specified
    if remove_numbers:
        tokens = [re.sub(r'\d+', '', token) for token in tokens]

    # Remove non-alphabetic characters (punctuation)
    if remove_punctuation:
        tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]

    # Stripping can reduce a token to '' (pure punctuation or pure digits).
    # Such empty strings carry no meaning and would otherwise end up in the
    # vocabulary of every downstream model, so they are dropped here.
    tokens = [token for token in tokens if token]

    # Remove stopwords
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    if lemmatize:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Stemming (alternative to lemmatization)
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    # Remove short words
    if remove_short_words:
        tokens = [token for token in tokens if len(token) >= min_word_length]

    return tokens

In [None]:
def apply_preprocessing(nvidia_dataset, version='v1'):
    """
    Apply one of four preprocessing levels to the dataset.

    The input frame is left untouched: the tokens are written to a copy, so
    calling this function once per version yields independent DataFrames
    instead of several names pointing at one frame whose column is overwritten
    by the last call.

    Parameters:
    - nvidia_dataset: DataFrame of Nvidia articles with a 'content' column.
    - version: The version of preprocessing to apply ('v1', 'v2', 'v3', or 'v4').

    Returns:
    - A new DataFrame with the original columns plus 'preprocessed_content',
      holding the token list produced by preprocess_text for each row.

    Raises:
    - ValueError: if `version` is not one of the four supported names.
    """

    # Flags handed to preprocess_text for each version.
    version_options = {
        # Basic tokenization and lowercasing
        'v1': dict(remove_punctuation=False, remove_stopwords=False,
                   lemmatize=False, remove_numbers=False, stem=False,
                   remove_short_words=False),
        # Remove punctuation and stopwords, but no lemmatization/stemming.
        # remove_numbers stays False, yet digits are still stripped because the
        # punctuation step removes every non-alphabetic character.
        'v2': dict(remove_punctuation=True, remove_stopwords=True,
                   lemmatize=False, remove_numbers=False, stem=False,
                   remove_short_words=False),
        # Advanced preprocessing with stemming, number removal, short words removal
        'v3': dict(remove_punctuation=True, remove_stopwords=True,
                   lemmatize=False, stem=True, remove_numbers=True,
                   remove_short_words=True, min_word_length=2),
        # Full preprocessing with lemmatization instead of stemming
        'v4': dict(remove_punctuation=True, remove_stopwords=True,
                   lemmatize=True, remove_numbers=True, stem=False,
                   remove_short_words=True, min_word_length=2),
    }

    if version not in version_options:
        raise ValueError("Invalid preprocessing version. Choose from 'v1', 'v2', 'v3', or 'v4'.")

    options = version_options[version]

    # Work on a copy so the caller's frame (and any previously returned version) is not modified.
    processed = nvidia_dataset.copy()
    processed['preprocessed_content'] = processed['content'].apply(
        lambda text: preprocess_text(text, **options)
    )

    # Return the DataFrame with original and preprocessed content
    return processed

In [None]:
# Give every version its own copy of the data so that each train_data_vN holds its own
# 'preprocessed_content' column and none of them shares (or overwrites) another's frame.
train_data_v1 = apply_preprocessing(nvidia_dataset.copy(), version='v1')  # Basic preprocessing
train_data_v2 = apply_preprocessing(nvidia_dataset.copy(), version='v2')  # Intermediate preprocessing
train_data_v3 = apply_preprocessing(nvidia_dataset.copy(), version='v3')  # Full preprocessing with stemming
train_data_v4 = apply_preprocessing(nvidia_dataset.copy(), version='v4')  # Full preprocessing with lemmatizer

train_data_v1.head()

In [None]:
# train_data_head = train_data_v1.head()

# # Specify the filename for the Excel file
# output_file = 'train_data_v1_head.xlsx'

# # Save to Excel
# train_data_head.to_excel(output_file, index=False)

## Helper functions

### LDA

In [None]:
def compute_coherence_values_lda(train_data_list, dictionary, max_topics=20):
    """
    Compute coherence values for different numbers of topics in LDA model.

    One LDA model is trained for every topic count from 2 to `max_topics`
    (inclusive) and scored with the 'c_v' coherence measure.

    Parameters:
    - train_data_list: List of tokenized documents.
    - dictionary: Gensim dictionary.
    - max_topics: Maximum number of topics to test (must be at least 2).

    Returns:
    - coherence_values: List of coherence values, one per tested topic count,
      in increasing order of topic count.
    - optimal_num_topics: The topic count with the highest coherence
      (the smallest such count if several tie).

    Raises:
    - ValueError: if `max_topics` is smaller than 2, since no model could be tested.
    """

    # Fail early with a clear message instead of an opaque max() error on an empty list.
    if max_topics < 2:
        raise ValueError(f"max_topics must be at least 2, got {max_topics}.")

    # Create Document-Term Matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in train_data_list]

    topic_counts = list(range(2, max_topics + 1))
    coherence_values = []

    # Iterate over different numbers of topics
    for num_topics in topic_counts:
        lda_model = gensim.models.LdaModel(
            doc_term_matrix,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=42,  # fixed seed so repeated runs train the same models
            passes=10,
            alpha='auto'
        )

        # Compute coherence score
        coherence_model_lda = CoherenceModel(model=lda_model, texts=train_data_list, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model_lda.get_coherence())

    # Find the optimal number of topics (maximum coherence); index() picks the first maximum.
    optimal_num_topics = topic_counts[coherence_values.index(max(coherence_values))]

    return coherence_values, optimal_num_topics

In [None]:
def plot_coherence_scores_lda(coherence_values, max_topics):
    """
    Draw the coherence score of each tested topic count as a line chart.

    The x axis runs over the topic counts 2..max_topics, so `coherence_values`
    is expected to hold one score per count in that range.

    Parameters:
    - coherence_values: List of coherence values.
    - max_topics: Maximum number of topics tested.
    """
    topic_counts = range(2, max_topics + 1)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(topic_counts, coherence_values, marker='o')
    ax.set_title('Coherence Scores vs. Number of Topics')
    ax.set_xlabel('Number of Topics')
    ax.set_ylabel('Coherence Score')
    # One tick per tested topic count
    ax.set_xticks(topic_counts)
    ax.grid()
    plt.show()

### BERTopic

In [None]:
def print_bertopic_topics(topic_model):
    """
    Print one line per topic with the topic's words, comma-separated.

    Parameters:
    - topic_model: the trained BERTopic model; its get_topics() must return a
      mapping from topic number to a sequence of entries whose first item is the word.
    """
    for topic_id, word_entries in topic_model.get_topics().items():
        # Topic -1 is skipped: it is typically the noise/outlier topic in BERTopic
        if topic_id != -1:
            joined_words = ', '.join(entry[0] for entry in word_entries)
            print(f"Topic {topic_id}: {joined_words}")

## Train Topic Models

In [None]:
def train_topic_model(train_data, model_type='LDA', max_topics=20):
    """
    Train a topic model on the given training data.

    Parameters:
    - train_data: DataFrame with a 'preprocessed_content' column holding one
      list of tokens per document.
    - model_type: str, the type of model to train ('LDA', 'FLSA-W', 'BERTopic')
    - max_topics: int, the largest topic count tried when searching for the best
      LDA model. Only the LDA branch uses it: FLSA-W is always built with 10
      topics and BERTopic is created with its default settings.

    Returns:
    - model: the trained model
    - topics: what the model reports after training, which differs per type:
        * 'LDA': the result of print_topics(num_words=10) on the final model
        * 'FLSA-W': the result of show_topics(representation='words')
        * 'BERTopic': the first value returned by fit_transform (presumably the
          topic assigned to each document; confirm against the BERTopic docs)

    Raises:
    - ValueError: if `model_type` is not one of the supported names.
    """

    # Reject unknown model types before doing any work on the data.
    if model_type not in ('LDA', 'FLSA-W', 'BERTopic'):
        raise ValueError("Invalid model_type. Choose from 'LDA', 'FLSA-W', 'BERTopic'.")

    # One token list per document
    tokenized_docs = train_data['preprocessed_content'].tolist()

    if model_type == 'LDA':
        # Create Gensim dictionary
        dictionary = corpora.Dictionary(tokenized_docs)

        # Compute coherence values and find optimal number of topics
        coherence_values, optimal_num_topics = compute_coherence_values_lda(tokenized_docs, dictionary, max_topics)

        # Plot coherence scores
        plot_coherence_scores_lda(coherence_values, max_topics)

        # Create Document-Term Matrix
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]

        # Train LDA model with optimal number of topics
        lda_model = gensim.models.LdaModel(
            doc_term_matrix,
            num_topics=optimal_num_topics,
            id2word=dictionary,
            random_state=42,
            passes=10,
            alpha='auto'
        )

        # Get topics (top words in each topic)
        topics = lda_model.print_topics(num_words=10)

        return lda_model, topics

    if model_type == 'FLSA-W':
        # Tokenized input is passed directly to FLSA-W
        flsa_w_model = FLSA_W(
            input_file=tokenized_docs,
            num_topics=10,
            num_words=10
        )

        # Train the FLSA-W model; the returned matrices are not needed here
        flsa_w_model.get_matrices()

        # Get topics as words
        topics = flsa_w_model.show_topics(representation='words')

        return flsa_w_model, topics

    # Remaining case: BERTopic, which is fed whole strings, so the tokens of each
    # document are joined back together (only done for this model type).
    documents = [' '.join(tokens) for tokens in tokenized_docs]

    topic_model = BERTopic()
    topics, _ = topic_model.fit_transform(documents)

    return topic_model, topics

In [None]:
def train_on_versions(data_versions, model_type, max_topics=20):
    """
    Train a topic model on different versions of the data.

    Parameters:
    - data_versions: list of DataFrames, each containing a separate version of preprocessed data.
    - model_type: str, the type of model to train ('LDA', 'FLSA-W', 'BERTopic')
    - max_topics: int, forwarded unchanged to train_topic_model for every version

    Returns:
    - models: list of trained models, in the same order as `data_versions`
    - topics: list of topics generated by the models, in the same order
    """

    models = []
    topics = []

    # Train model on different versions
    for position, data_v in enumerate(data_versions, start=1):
        # Report the position of the version rather than the data itself:
        # interpolating the DataFrame would dump its whole repr into the output.
        print(f"Training model on version: {position}")
        model, topic = train_topic_model(data_v, model_type=model_type, max_topics=max_topics)
        print(f"Finished training model on version: {position}")
        models.append(model)
        topics.append(topic)

    return models, topics

## Evaluation

### Qualitative

In [None]:
def inspect_model(topic_model):
    """
    Gather the pieces of a trained topic model used for qualitative inspection.

    Parameters:
    - topic_model: the trained topic model. It must provide get_topics(),
      get_topic_embeddings(), get_vocabulary() and get_vocabulary_size().

    Returns:
    - A 4-tuple (topics, topic embeddings, vocabulary, vocabulary size), each
      being the value returned by the corresponding method above.
    """
    topics = topic_model.get_topics()
    topic_embeddings = topic_model.get_topic_embeddings()
    vocabulary = topic_model.get_vocabulary()
    vocabulary_size = topic_model.get_vocabulary_size()
    return topics, topic_embeddings, vocabulary, vocabulary_size

### Quantitative

In [None]:
def evaluate_model(topic_model):
    """
    Collect the quantitative quality scores of a trained topic model.

    Parameters:
    - topic_model: the trained topic model. It must provide get_coherence_score(),
      get_diversity_score() and get_interpretability_score().

    Returns:
    - A 3-tuple (coherence score, diversity score, interpretability score), each
      being the value returned by the corresponding method above.
    """
    coherence = topic_model.get_coherence_score()
    diversity = topic_model.get_diversity_score()
    interpretability = topic_model.get_interpretability_score()
    return coherence, diversity, interpretability

## Iteration 1

### LDA

In [None]:
# Fit LDA on the v1 data: train_topic_model scores every topic count from 2 to max_topics,
# plots the coherence curve and returns a model trained with the best-scoring count.
LDA_model, LDA_topics = train_topic_model(train_data_v1, model_type='LDA', max_topics=20)

# Print topics (the entries come from the model's print_topics(num_words=10))
for topic in LDA_topics:
    print(topic)

### FLSA-W

In [None]:
# Train FLSA-W model on the first version of the preprocessed dataset
flsa_w_model, flsa_w_topics = train_topic_model(train_data_v1, model_type='FLSA-W', max_topics=10)

# Print topics
for topic in flsa_w_topics:
    print(topic)

In [None]:
# # Print the generated FLSA-W topics
# print("Initial FLSA-W Topics:")
# print_topics_lda(flsa_w_topics)

In [None]:
def compute_coherence_flsa_w(train_data, max_topics=10):
    """
    Score FLSA-W models built with an increasing number of topics.

    One model is trained for every topic count from 3 to `max_topics`
    (inclusive); its coherence score is printed and recorded.

    Parameters:
        - train_data: object whose 'preprocessed_content' entry yields the
          tokenized documents (one token list per document).
        - max_topics: int, the maximum number of topics to test.
    Returns:
        - topic_nums: list of topic numbers used for testing.
        - coherence_scores: list of coherence scores for each number of topics.
    """

    documents = list(train_data['preprocessed_content'])

    # (topic count, coherence score) pairs in the order they were computed
    results = []

    for topic_count in range(3, max_topics + 1):
        candidate = FLSA_W(
            input_file=documents,
            num_topics=topic_count,
            num_words=10
        )

        # Train the model before asking it for a score
        candidate.get_matrices()

        score = candidate.get_coherence_score()
        print(f"Number of topics: {topic_count}, Coherence score: {score}")

        results.append((topic_count, score))

    # Split the pairs into the two parallel lists callers expect
    topic_nums = [topic_count for topic_count, _ in results]
    coherence_scores = [score for _, score in results]

    return topic_nums, coherence_scores


def plot_elbow_curve(topic_nums, coherence_scores):
    """
    Plot coherence score against number of topics for the FLSA-W runs.

    Parameters:
    - topic_nums: list of number of topics tested.
    - coherence_scores: list of coherence scores for each number of topics.
    """
    _, ax = plt.subplots(figsize=(8, 5))
    ax.plot(topic_nums, coherence_scores, marker='o')
    ax.set_title('FLSA-W Elbow Plot: Number of Topics vs Coherence Score')
    ax.set_xlabel('Number of Topics')
    ax.set_ylabel('Coherence Score')
    ax.grid(True)
    plt.show()

In [None]:
# Train the model and compute coherence scores
topic_nums, coherence_scores = compute_coherence_flsa_w(train_data_v1, max_topics=15)

# Plot the elbow curve
plot_elbow_curve(topic_nums, coherence_scores)

### BERTopic

In [None]:
# Train BERTopic model on the first version of the preprocessed dataset
bertopic_model, bertopic_topics = train_topic_model(train_data_v1, model_type='BERTopic')

# Print the generated BERTopic topics
print("BERTopic Topics:")
print_bertopic_topics(bertopic_model)

## Iteration 2

### BERTopic

In [None]:
# Train BERTopic model on the first version of the preprocessed dataset
bertopic_model, bertopic_topics = train_topic_model(train_data_v2, model_type='BERTopic')

# Print the generated BERTopic topics
print("BERTopic Topics:")
print_bertopic_topics(bertopic_model)

## Iteration 3

### BERTopic

In [None]:
# Train BERTopic model on the first version of the preprocessed dataset
bertopic_model, bertopic_topics = train_topic_model(train_data_v3, model_type='BERTopic')

# Print the generated BERTopic topics
print("BERTopic Topics:")
print_bertopic_topics(bertopic_model)

## Iteration 4

### BERTopic

In [None]:
# Train BERTopic model on the first version of the preprocessed dataset
bertopic_model, bertopic_topics = train_topic_model(train_data_v4, model_type='BERTopic')

# Print the generated BERTopic topics
print("BERTopic Topics:")
print_bertopic_topics(bertopic_model)

### Topic Similarity Matrix

In [None]:
# Heatmap for whichever model the most recently executed BERTopic cell bound to
# bertopic_model; when the notebook is run top to bottom that is the train_data_v4 model.
bertopic_model.visualize_heatmap()

## Output of Final Topic Model