# NLP - Assignment 2

## Imports

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bertopic import BERTopic
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from FuzzyTM import FLSA_W

## Preprocess Data

In [3]:
path = "./us_equities_news_dataset.csv"

# Load the news dataset
news_dataset = pd.read_csv("./us_equities_news_dataset.csv")
news_dataset.head()

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096


In [4]:
# Filter dataset to only include articles with 'Nvidia' in the content
nvidia_dataset = news_dataset[news_dataset['content'].str.contains('Nvidia', case=False, na=False)]


In [5]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text, 
                    remove_punctuation=True, 
                    remove_stopwords=True, 
                    lemmatize=False, 
                    stem=False, 
                    remove_short_words=False, 
                    remove_rare_words=False, 
                    remove_numbers=True, 
                    min_word_length=2):
    """
    Advanced preprocessing function that applies different levels of text processing.
    
    Parameters:
    - text: The text to preprocess.
    - remove_punctuation: Whether to remove punctuation from the text.
    - remove_stopwords: Whether to remove common stopwords.
    - lemmatize: Whether to apply lemmatization to reduce words to their root form.
    - stem: Whether to apply stemming to reduce words to their base form.
    - remove_short_words: Whether to remove short words from the text.
    - remove_rare_words: Whether to remove rare words based on the dataset distribution.
    - remove_numbers: Whether to remove numbers from the text.
    - min_word_length: The minimum length of words to keep in the text.
    
    Returns:
    - Preprocessed text as tokens.
    """
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove URLs and HTML tags
    tokens = [re.sub(r'http\S+|www\S+|<.*?>', '', token) for token in tokens]
    
    # Remove numbers if specified
    if remove_numbers:
        tokens = [re.sub(r'\d+', '', token) for token in tokens]
    
    # Remove non-alphabetic characters (punctuation)
    if remove_punctuation:
        tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]
    
    # Remove stopwords
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    if lemmatize:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Stemming (alternative to lemmatization)
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    
    # Remove short words
    if remove_short_words:
        tokens = [token for token in tokens if len(token) >= min_word_length]
    
    return tokens

In [6]:
def apply_preprocessing(nvidia_dataset, version='v1'):
    """
    Apply different levels of preprocessing to the dataset.
    
    Parameters:
    - nvidia_dataset: The Nvidia articles with a 'content' column.
    - version: The version of preprocessing to apply ('v1', 'v2', 'v3', or 'v4').
    
    Returns:
    - DataFrame with the original content and the preprocessed content in 'preprocessed_content' column.
    """
    
    if version == 'v1':
        # Basic tokenization and lowercasing
        nvidia_dataset['preprocessed_content'] = nvidia_dataset['content'].apply(lambda x: preprocess_text(
            x, remove_punctuation=False, remove_stopwords=False, 
            lemmatize=False, remove_numbers=False, stem=False, 
            remove_short_words=False
        ))
    
    elif version == 'v2':
        # Remove punctuation, stopwords, and numbers, but no lemmatization/stemming
        nvidia_dataset['preprocessed_content'] = nvidia_dataset['content'].apply(lambda x: preprocess_text(
            x, remove_punctuation=True, remove_stopwords=True, 
            lemmatize=False, remove_numbers=True, stem=False, 
            remove_short_words=False
        ))
    
    elif version == 'v3':
        # Advanced preprocessing with stemming, number removal, short words removal
        nvidia_dataset['preprocessed_content'] = nvidia_dataset['content'].apply(lambda x: preprocess_text(
            x, remove_punctuation=True, remove_stopwords=True, 
            lemmatize=False, stem=True, remove_numbers=True, 
            remove_short_words=True, min_word_length=2
        ))
    
    elif version == 'v4':
        # Full preprocessing with lemmatization instead of stemming, number removal, and short words removal
        nvidia_dataset['preprocessed_content'] = nvidia_dataset['content'].apply(lambda x: preprocess_text(
            x, remove_punctuation=True, remove_stopwords=True, 
            lemmatize=True, remove_numbers=True, stem=False, 
            remove_short_words=True, min_word_length=2
        ))
    
    else:
        raise ValueError("Invalid preprocessing version. Choose from 'v1', 'v2', 'v3', or 'v4'.")
    
    # Return the DataFrame with original and preprocessed content
    return nvidia_dataset


In [7]:
train_data_v1 = apply_preprocessing(nvidia_dataset, version='v1')  # Basic preprocessing
train_data_v2 = apply_preprocessing(nvidia_dataset, version='v2')  # Intermediate preprocessing
train_data_v3 = apply_preprocessing(nvidia_dataset, version='v3')  # Full preprocessing with stemming
train_data_v4 = apply_preprocessing(nvidia_dataset, version='v4')  # Full preprocessing with lemmatizer

In [8]:
train_data_v1.head()

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,preprocessed_content
24,221539,NIO,A Central Bank War Just Started And Its Good F...,opinion,ECB Effects\nThe move in the euro was huge fa...,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,"[ecb, effect, move, euro, huge, falling, pip, ..."
32,221547,NIO,6 Stocks To Watch Nivida Could Be Falling,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,"[stock, watch, march, trading, session, stock,..."
57,221572,NIO,Stocks Dow Drops Nearly 400 Points as Apple ...,news,Investing com A rout in Apple and Facebook ...,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,"[investing, com, rout, apple, facebook, nasdaq..."
78,221593,UBER,The Zacks Analyst Blog Highlights Advanced Mi...,opinion,For Immediate ReleaseChicago IL January 13 ...,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,"[immediate, releasechicago, il, january, zacks..."
82,221597,UBER,The Best Of CES 2020 Revised,opinion,With 4 500 companies bringing their innovation...,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,"[company, bringing, innovation, ce, jan, get, ..."


In [9]:
# train_data_head = train_data_v1.head()

# # Specify the filename for the Excel file
# output_file = 'train_data_v1_head.xlsx'

# # Save to Excel
# train_data_head.to_excel(output_file, index=False)

## Train Topic Models

In [17]:
def train_topic_model(train_data, model_type='LDA', num_topics=10):
    """
    Train a topic model on the given training data.

    Parameters:
    - train_data: list of str, the text to train the model on
    - model_type: str, the type of model to train ('LDA', 'FLSA-W', 'BERTopic')
    - num_topics: int, the number of topics to generate

    Returns:
    - model: the trained model
    - topics: the topics generated by the model
    """

    train_data_list = [' '.join(tokens) for tokens in train_data['preprocessed_content']]

    if model_type == 'LDA':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        data_vectorized = vectorizer.fit_transform(train_data_list)
        
        # Train LDA model
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda_model.fit(data_vectorized)
        
        # Get topics (top words in each topic)
        def get_lda_topics(model, vectorizer, n_top_words=10):
            feature_names = vectorizer.get_feature_names_out()
            topics = []
            for topic_idx, topic in enumerate(model.components_):
                top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
                topics.append(top_words)
            return topics
        
        topics = get_lda_topics(lda_model, vectorizer)
        
        return lda_model, topics
    
    elif model_type == 'FLSA-W':
        # Replace with actual FLSA-W training code
        flsa_w_model = None
        topics = None
        
        return flsa_w_model, topics
    
    elif model_type == 'BERTopic':
        # Train BERTopic model
        topic_model = BERTopic(nr_topics=num_topics)
        topics, _ = topic_model.fit_transform(train_data_list)
        
        return topic_model, topics
    
    else:
        raise ValueError("Invalid model_type. Choose from 'LDA', 'FLSA-W', 'BERTopic'.")

## Iteration 1

In [12]:
def print_topics_lda(topics):
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {', '.join(topic)}")

In [18]:
def print_topics_flsaw(topics):
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {', '.join(topic)}")

In [14]:
def print_bertopic_topics(topic_model):
    """
    Print topics generated by BERTopic.
    
    Parameters:
    - topic_model: the trained BERTopic model
    """
    topics = topic_model.get_topics()
    for topic_num, words in topics.items():
        # Ignore the '-1' topic, which is typically noise in BERTopic
        if topic_num == -1:
            continue
        print(f"Topic {topic_num}: {', '.join([word[0] for word in words])}")


### LDA

In [67]:
# Train LDA  model on the first version of the preprocessed dataset
LDA_model, LDA_topics = train_topic_model(train_data_v1, model_type='LDA', num_topics=10)

In [68]:
print("Initial LDA Topics:")
print_topics_lda(LDA_topics)

Initial LDA Topics:
Topic 1: company, apple, nasdaq, technology, google, market, new, amazon, year, service
Topic 2: market, week, year, earnings, index, rate, investor, stock, price, month
Topic 3: year, quarter, company, zacks, revenue, earnings, million, estimate, stock, growth
Topic 4: driving, car, company, vehicle, self, technology, autonomous, said, bitcoin, nasdaq
Topic 5: etf, fund, semiconductor, zacks, technology, nasdaq, index, read, portfolio, company
Topic 6: stock, zacks, earnings, year, investment, company, growth, industry, market, investor
Topic 7: nasdaq, nyse, stock, trade, share, rose, percent, fell, point, china
Topic 8: nasdaq, stock, day, nyse, trading, nvda, week, market, support, short
Topic 9: stock, market, year, like, investor, time, new, week, share, think
Topic 10: amd, intel, gaming, chip, year, market, analyst, company, revenue, nasdaq


### FLSA-W

In [None]:
# Train FLSA-W model on the first version of the preprocessed dataset

# Print the generated FLSA-W topics


### BERTopic

In [69]:
# Train BERTopic model on the first version of the preprocessed dataset
# num_topics is maximum number of topics rather than a fixed number of topics
bertopic_model, bertopic_topics = train_topic_model(train_data_v1, model_type='BERTopic', num_topics=10)


In [54]:
# Print the generated BERTopic topics
print("BERTopic Topics:")
print_bertopic_topics(bertopic_model)

BERTopic Topics:
Topic 0: year, company, zacks, quarter, stock, revenue, earnings, nasdaq, million, growth
Topic 1: nasdaq, stock, nyse, week, market, trade, index, day, inc, year
Topic 2: nvidia, year, gaming, revenue, company, quarter, share, nvda, billion, nasdaq
Topic 3: driving, car, vehicle, self, autonomous, technology, company, ai, system, nvidia
Topic 4: traded, nvidia, day, seven, lowest, gmt, highest, exchange, volume, session
Topic 5: earnings, growth, estimate, zacks, stock, eps, period, company, investment, quarter
Topic 6: huawei, said, qualcomm, china, company, chinese, technology, patent, commission, chip
Topic 7: facebook, user, ad, company, platform, video, zacks, nasdaq, rank, twitter
Topic 8: rallied, climbed, soared, australian, gained, italy, market, dollar, step, towards


## Iteration 2

## Iteration 3

## Iteration ....

## Evaluation

### Qualitative

### Quantitative

## Output of Final Topic Model