In [1]:
import os
import pandas as pd
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rushi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# ticker and directory
base_directory = "D:/Fintech_lab"

tickers = ["AXP", "V", "MA"]
years = range(2016, 2024)

In [4]:
# Define start and end markers for risk and management sections for each ticker
markers = {
    "AXP": {
        "risk": [
            {"start": "ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK", "end": "ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA"},
            {"start": "ITEM 7A.", "end": "ITEM 8."}
        ],
        "management": [
            {"start": "ITEM 7. MANAGEMENT&#8217;S", "end": "ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK"},
            {"start": "ITEM 7. ", "end": "ITEM 7A."}   # {"start": "ITEM 7. MANAGEMENT&#146;S", "end": "ITEM 7A."}
        ]
    },
    "V": {
        "risk": [
            {"start": "ITEM 7A. Quantitative and Qualitative Disclosures about Market Risk", "end": "ITEM 8. Financial Statements and Supplementary Data"},
            {'start': "ITEM 7A.", "end" : "ITEM 8."}
        ],
        "management": [
            {"start": "ITEM 7. Management&#8217;s", "end": "ITEM 7A. Quantitative and Qualitative Disclosures about Market Risk"},
            {'start': "ITEM 7.", "end" : "ITEM 7A."}
        ]
    },
    "MA": {
        "risk": [
            {"start": "ITEM 7A. &#32;&#32;", "end": "ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA"},
            {"start": "Item 7A. Quantitative and qualitative disclosures about market risk", "end": "ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA"}
        ],
        "management": [
            {"start": "ITEM 7. MANAGEMENT&#8217", "end": "ITEM 7A. &#32;&#32;"},
            {"start": "Item 7. Management&#8217;s", "end" : "Item 7A. Quantitative and qualitative disclosures about market risk"}
        ]
    }
}

In [5]:
# Function to extract risk-related sections from text
def extract_risk_sections(text, ticker):
    for section_marker in markers[ticker]["risk"]:
        risk_sections = extract_section(text, section_marker["start"], section_marker["end"])
        if risk_sections:
            return risk_sections
    return ""

# Function to extract management-related sections from text
def extract_management_sections(text, ticker):
    for section_marker in markers[ticker]["management"]:
        management_sections = extract_section(text, section_marker["start"], section_marker["end"])
        if management_sections:
            return management_sections
    return ""

In [6]:
# Initialize NLTK's WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Define stopwords set and punctuation set
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

# Function to extract sections between start and end markers
def extract_section(text, start_marker, end_marker):
    start_index = text.find(start_marker)
    end_index = text.find(end_marker, start_index)
    if start_index != -1 and end_index != -1:
        return text[start_index:end_index]
    else:
        return ""

In [7]:
# Create a dictionary to store risk-related information
risk_data = defaultdict(list)

# Loop through tickers and years
for ticker in tickers:
    for year in years:
        # Read text file for 1231
        file_path_1231 = os.path.join(base_directory, f"{ticker}/{year}1231.txt")
        if os.path.exists(file_path_1231):
            with open(file_path_1231, "r", encoding="utf-8") as file:
                text = file.read()
        else:
            # Read text file for 0930
            file_path_0930 = os.path.join(base_directory, f"{ticker}/{year}0930.txt")
            if os.path.exists(file_path_0930):
                with open(file_path_0930, "r", encoding="utf-8") as file:
                    text = file.read()
            else:
                continue
        
        risk_sections = extract_risk_sections(text, ticker)
        management_sections = extract_management_sections(text, ticker)

        preprocessed_risk_sections = preprocess_text(risk_sections)
        preprocessed_management_sections = preprocess_text(management_sections)
        
        # Append data to risk_data dictionary
        risk_data["Ticker"].append(ticker)
        risk_data["Year"].append(year)
        risk_data["Risk Sections"].append(preprocessed_risk_sections)
        risk_data["Management Sections"].append(preprocessed_management_sections)


In [8]:
# Convert risk data to DataFrame
risk_df = pd.DataFrame(risk_data)

# Compare risk and management sections across companies
risk_comparison = risk_df.pivot_table(index='Year', columns='Ticker', values=['Risk Sections', 'Management Sections'], aggfunc='first')
risk_comparison

Unnamed: 0_level_0,Management Sections,Management Sections,Management Sections,Risk Sections,Risk Sections,Risk Sections
Ticker,AXP,MA,V,AXP,MA,V
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2016,item 7. management 146 discussion analysis fin...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a 32 32 quantitative qualitative disclos...,item 7a quantitative qualitative disclosure ma...
2017,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a 32 32 quantitative qualitative disclos...,item 7a quantitative qualitative disclosure ma...
2018,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a 32 32 quantitative qualitative disclos...,item 7a quantitative qualitative disclosure ma...
2019,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...
2020,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...
2021,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...
2022,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...
2023,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7. management 8217 discussion analysis fi...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...,item 7a quantitative qualitative disclosure ma...


In [9]:
# Extracting from the dataframe
ticker = "V"    # MA, AXP, and V
Year = 2017     # from 2016 to 2023
section = ['Risk Sections', 'Management Sections']

text = risk_comparison.at[Year, (section[1], ticker)]
print(f"{section[1]} for {ticker} in {Year}:", text)

Management Sections for V in 2017: item 7. management 8217 discussion analysis financial condition result operation table_end management 8217 discussion analysis provides review result operation financial condition liquidity capital resource visa inc. subsidiary 8220 visa 8221 8220 8221 `` u '' 8220 8221 8220 company 8221 historical basis outline factor affected recent earnings well factor may affect future earnings following discussion analysis read conjunction consolidated financial statement related note included item 8 report overview visa global payment technology company enables fast secure reliable electronic payment across 200 country territory facilitate global commerce transfer value information among global network consumer merchant financial institution business strategic partner government entity advanced transaction processing network visanet enables authorization clearing settlement payment transaction allows u provide financial institution merchant client wide range pro