In [None]:
!pip install pyLDAvis

In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import pyLDAvis.gensim
from gensim import corpora
import matplotlib as plt
from wordcloud import WordCloud

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Define directory path
directory = "D:/Fintech_lab/"
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [5]:
# Making the list of each ticker_year

tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    preprocessed_texts = []
    
    for year in range(2016, 2024):
        for date_format in ["0930", "1231"]:
            filename = os.path.join(directory, f"{ticker}/{year}{date_format}.txt")
            try:
                with open(filename, 'r', encoding='utf-8') as file:
                    text = file.read()
                    
                preprocessed_text = preprocess_text(text)
                preprocessed_texts.append(preprocessed_text)
                list_name = f"{ticker}_{year}"
                globals()[list_name] = preprocessed_text
                break
            except FileNotFoundError:
                continue
    
    # Optionally, If we want to preserve the original file
    # with open(f"{ticker}_preprocessed.txt", 'w', encoding='utf-8') as file:
    #     for text in preprocessed_texts:
    #         file.write(' '.join(text))
    #         file.write('\n')


In [6]:
# Iterate through tickers
tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    for year in range(2016, 2024):
        list_name = f"{ticker}_{year}"
        if list_name in globals():
            print(list_name)
    print('\n')


AXP_2016
AXP_2017
AXP_2018
AXP_2019
AXP_2020
AXP_2021
AXP_2022
AXP_2023


MA_2016
MA_2017
MA_2018
MA_2019
MA_2020
MA_2021
MA_2022
MA_2023


V_2016
V_2017
V_2018
V_2019
V_2020
V_2021
V_2022
V_2023




In [7]:
# Function to apply LDA to a list of preprocessed texts
def apply_lda(texts, num_topics=30):
    # Create a dictionary
    id2word = corpora.Dictionary(texts)
    # Create a corpus
    corpus = [id2word.doc2bow(text) for text in texts]
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha="auto"
    )
    return lda_model, corpus, id2word
    

In [8]:
# Dictionary to store visualizations
lda_visualizations = {}

# Iterate through tickers
tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    for year in range(2016, 2024):
        list_name = f"{ticker}_{year}"
        
        if list_name in globals():
            lda_model, corpus, id2word = apply_lda([globals()[list_name]])
            vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
            # Store the visualization
            lda_visualizations[list_name] = vis

# Now you can display a visualization later using its ticker and year
# For example, to display the visualization for AXP in 2016:
# pyLDAvis.display(lda_visualizations["AXP_2016"])


In [9]:
pyLDAvis.display(lda_visualizations['AXP_2018'])

In [10]:
# Dictionary to store topics for each ticker and year
lda_topics = {}

# Iterate through tickers
tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    # Initialize a dictionary to store topics for the ticker
    ticker_topics = {}
    for year in range(2016, 2024):
        list_name = f"{ticker}_{year}"
        if list_name in globals():
            lda_model, _, _ = apply_lda([globals()[list_name]])
            # Extract topics from the LDA model
            topics = lda_model.show_topics(num_topics=-1, formatted=False)
            # Store topics for the year
            ticker_topics[year] = topics
    # Store topics for the ticker
    lda_topics[ticker] = ticker_topics

# Now lda_topics dictionary contains topics for each ticker and year
# You can access topics using lda_topics["AXP"][2016], for example


In [11]:
lda_topics['MA'][2019]

[(0,
  [('company', 0.0015087503),
   ('financial', 0.0012208965),
   ('customer', 0.0011386237),
   ('mastercard', 0.0010835003),
   ('2019', 0.0009841486),
   ('may', 0.00096861843),
   ('8217', 0.0009680614),
   ('statement', 0.0009272289),
   ('payment', 0.0009028267),
   ('business', 0.0008987617)]),
 (1,
  [('8217', 0.0014493787),
   ('payment', 0.0013270004),
   ('2019', 0.0013170295),
   ('company', 0.0012932552),
   ('mastercard', 0.0011956897),
   ('customer', 0.0011816636),
   ('business', 0.0010731687),
   ('financial', 0.0010410933),
   ('consolidated', 0.0009908692),
   ('part', 0.0009209711)]),
 (2,
  [('financial', 0.0010395726),
   ('company', 0.001025295),
   ('8217', 0.0009710993),
   ('mastercard', 0.0009587424),
   ('statement', 0.00094292517),
   ('business', 0.0008983128),
   ('customer', 0.00087298936),
   ('could', 0.00083568844),
   ('part', 0.0008356078),
   ('2019', 0.0008330256)]),
 (3,
  [('company', 0.0017093968),
   ('2019', 0.0015519771),
   ('financial

In [None]:
# Iterate through tickers
tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    # Accumulate all text data for the ticker
    all_text_data = []
    for year in range(2016, 2024):
        list_name = f"{ticker}_{year}"
        if list_name in globals():
            all_text_data.extend(globals()[list_name])
    
    # Apply LDA to the accumulated text data
    if all_text_data:
        # Ensure each document is a list of tokenized words
        tokenized_texts = [text.split() for text in all_text_data]
        lda_model, corpus, id2word = apply_lda(tokenized_texts)
        vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="tsne", perplexity=50)
        # Store the visualization
        lda_visualizations[ticker] = vis

# Now you can display a visualization later using its ticker
# For example, to display the visualization for AXP:
# pyLDAvis.display(lda_visualizations["AXP"])

In [None]:
pyLDAvis.display(lda_visualizations['AXP'])

In [None]:
# Dictionary to store LDA topics for each ticker
lda_topics = {}

# Iterate through tickers
tickers = ["AXP", "MA", "V"]
for ticker in tickers:
    # Accumulate all text data for the ticker
    all_text_data = []
    for year in range(2016, 2024):
        list_name = f"{ticker}_{year}"
        if list_name in globals():
            all_text_data.extend(globals()[list_name])
    
    # Apply LDA to the accumulated text data
    if all_text_data:
        # Ensure each document is a list of tokenized words
        tokenized_texts = [text.split() for text in all_text_data]
        lda_model, _, _ = apply_lda(tokenized_texts)
        # Extract topics from the LDA model
        topics = lda_model.show_topics(num_topics=-1, formatted=False)
        # Store topics for the ticker
        lda_topics[ticker] = topics

# Now lda_topics dictionary contains topics for each ticker
# You can access topics using lda_topics["AXP"], for example


In [None]:
lda_topics["MA"]