In [262]:
!pip install nltk gensim openai



In [263]:
import pandas as pd
import time
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords
from groq import Groq
import os
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oskarroeske/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



# EDA

In [264]:
df_reports = pd.read_csv("preprocessed_reports.csv")
df_paragraphs = pd.read_csv("preprocessed_paragraphs.csv")

In [265]:
df_paragraphs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12573 entries, 0 to 12572
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    12573 non-null  int64 
 1   filename      12573 non-null  object
 2   document_id   12573 non-null  int64 
 3   paragraph_id  12573 non-null  int64 
 4   paragraph     12573 non-null  object
dtypes: int64(3), object(2)
memory usage: 491.3+ KB


In [266]:
df_paragraphs

Unnamed: 0.1,Unnamed: 0,filename,document_id,paragraph_id,paragraph
0,0,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,1,"Last night, ahead of the 2014 CES, NVDA hosted..."
1,1,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,2,GameStream works hand in hand with the PC gami...
2,2,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,3,NVDA sought to bridge the gap between PC and m...
3,3,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,4,Recognizing the shift to advanced automobile f...
4,4,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,5,NVDA has spent heavily on its mobile applicati...
...,...,...,...,...,...
12568,12568,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,1,Even though available at quite at a lag with t...
12569,12569,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,2,"Coming back to the October data, International..."
12570,12570,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,3,International shipments Apple till October hav...
12571,12571,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,4,5G mix accounted for 80% of total shipments in...


In [267]:
def clean_text_again(text):
    text = strip_tags(text)
    text = strip_multiple_whitespaces(text)  # Normalize whitespaces
    return text

In [268]:
df_paragraphs["paragraph"] = df_paragraphs["paragraph"].apply(clean_text_again)

# LLM Argument Mining

In [269]:
list_of_arguments = [
    "Growth",
    "Price/Earnings Ratio",
    "Earnings per Share",
    "Cash Flow",
    "Revenue",
    "Return On Equity",
    "Margins",
    "Cost Management",
    "Dividend Policy",
    "Investments",
    "Balance Sheet",
    "Long-term Growth",
    "Mergers and Acquisition",
    "Refranchising",
    "Sustainability",
    "Employees",
    "Research and Development",
    "Marketing",
    "Shares Repurchase",
    "Processes",
    "Leadership",
    "Innovation",
    "Product Characteristics",
    "Pricing Strategy",
    "Production",
    "Technology Trends",
    "Market Share",
    "Market Conditions",
    "Market Expansion",
    "Competition",
    "Global Presence",
    "Industry Outlook",
    "Regulations",
    "Partnerships and Collaborations",
    "Supply Chain",
    "Economic Conditions",
    "Demand",
    "Customer"
]


In [270]:
len(list_of_arguments)

38

In [271]:
from openai import OpenAI
import json

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

def find_arguments(text):
    
    amount_of_categories = len(list_of_arguments)

    categories_string = ', '.join(f'"{cat}"' for cat in list_of_arguments)

    system_content = f"""You will analyze a paragraph from a financial analyst report to identify justifications for the companies stock recommendation. Provide a minimal JSON output summarizing the arguments with the existing categories and sentiment. Take two seconds to handle the task.
 Guidelines:
    1. Argument Identification:
    - Extract only statements that serve as justifications for the stock recommendation.
    - Ignore brand names, target prices, recommendations, or valuation as arguments.
    - If no argument is found return an empty JSON
    2. Category Assignment:
    - Read the existing categories very carefully.
    - **If {amount_of_categories} is 38 or more:**
        - **Assign argument to the best-fitting existing category, even if imperfect!**
        - Do NOT create a new category.
    - **If {amount_of_categories} < 38, you may only add a new category if it is absolutely necessary and cannot fit into an existing category. If you create a new category, it must be short and generic (max 2 words). **
    - Violating the category limit or adding more categories than allowed is not permitted.
    3. Sentiment Classification:
    - Label each argument’s sentiment as 'positive', 'negative', or 'neutral' depending on the context it is used (financial perspective).
    4. Output:
    - Return JSON (Structure) {{'Category 1': 'negative', 'Category 2': 'positive'}}
    5. Example:{{\n    'Market Share': 'positive',\n    'Revenue': "positive'\n}}.
    Important: If you fail to comply with these category limits, your response is invalid. Do not produce more categories than allowed under any circumstances.
   """
    user_content = f"""
        Existing Categories:[{categories_string}]
        Text: {text}
        Amount of Categories: {amount_of_categories} (MAXIMUM!: 38)

        Remember:
        - If there are already 38 or more categories, you must assign an existing one.
       """
    #print(user_content)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {
                "role": "system", 
                "content":system_content
            },
            {
                "role": "user", 
                "content":user_content
            }
        ],
        temperature=0,
        response_format={
        "type": "json_object"
        }
    )
    print(response)
    return response.choices[0].message.content

In [272]:
def extract_arguments_and_sentiment(commentary):
    global list_of_arguments  # Ensure we can modify the global categories list
    
    # Call the find_arguments function with the selected API key
    result_json = find_arguments(commentary)   
    
    try:
        # Parse the JSON response
        result_dict = json.loads(result_json)
        
        # Check for new categories and add them to the global list
        for category in result_dict.keys():
            if category not in list_of_arguments:
                list_of_arguments.append(category)  # Dynamically update the categories list
        
        time.sleep(1)
        return result_dict  # Return the result
    except json.JSONDecodeError:
        print("Failed to decode JSON response.")
        return None

In [273]:
df_paragraphs = df_paragraphs[50:150]

In [274]:
df_paragraphs

Unnamed: 0.1,Unnamed: 0,filename,document_id,paragraph_id,paragraph
50,50,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,11,We have made minor adjustments to our fiscal 2...
51,51,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,12,We have made initial projections for PGs fisca...
52,52,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,13,PGs dividends have risen for 57 consecutive ye...
53,53,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,14,PG shares have underperformed some common benc...
54,54,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,15,"Given PGs earnings history, we believe a price..."
...,...,...,...,...,...
145,145,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,25,AAPLs operating results could be affected badl...
146,146,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,26,Inventory and Other Asset Risk: Product obsole...
147,147,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,27,Key Components: AAPL procures its key componen...
148,148,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,28,Dependence on Third Parties Outside the US: AA...


In [275]:
df_paragraphs["provided_arguments"] = df_paragraphs["paragraph"].apply(extract_arguments_and_sentiment)

ChatCompletion(id='chatcmpl-AgTebqzsWJXb61wQnfOdN4Wwyr7P8', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "Growth": "positive",\n    "Margins": "positive",\n    "Earnings per Share": "positive",\n    "Shares Repurchase": "positive"\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1734687129, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=37, prompt_tokens=682, total_tokens=719, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))
ChatCompletion(id='chatcmpl-AgTedxehwVCWLl2e6piFS9gP4r82U', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "Growth": "positive",\n   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraphs["provided_arguments"] = df_paragraphs["paragraph"].apply(extract_arguments_and_sentiment)


In [276]:
len(list_of_arguments)

41

In [277]:
list_of_arguments

['Growth',
 'Price/Earnings Ratio',
 'Earnings per Share',
 'Cash Flow',
 'Revenue',
 'Return On Equity',
 'Margins',
 'Cost Management',
 'Dividend Policy',
 'Investments',
 'Balance Sheet',
 'Long-term Growth',
 'Mergers and Acquisition',
 'Refranchising',
 'Sustainability',
 'Employees',
 'Research and Development',
 'Marketing',
 'Shares Repurchase',
 'Processes',
 'Leadership',
 'Innovation',
 'Product Characteristics',
 'Pricing Strategy',
 'Production',
 'Technology Trends',
 'Market Share',
 'Market Conditions',
 'Market Expansion',
 'Competition',
 'Global Presence',
 'Industry Outlook',
 'Regulations',
 'Partnerships and Collaborations',
 'Supply Chain',
 'Economic Conditions',
 'Demand',
 'Customer',
 'Valuation',
 'Legal Issues',
 'Risk Management']

In [278]:
list_of_arguments

['Growth',
 'Price/Earnings Ratio',
 'Earnings per Share',
 'Cash Flow',
 'Revenue',
 'Return On Equity',
 'Margins',
 'Cost Management',
 'Dividend Policy',
 'Investments',
 'Balance Sheet',
 'Long-term Growth',
 'Mergers and Acquisition',
 'Refranchising',
 'Sustainability',
 'Employees',
 'Research and Development',
 'Marketing',
 'Shares Repurchase',
 'Processes',
 'Leadership',
 'Innovation',
 'Product Characteristics',
 'Pricing Strategy',
 'Production',
 'Technology Trends',
 'Market Share',
 'Market Conditions',
 'Market Expansion',
 'Competition',
 'Global Presence',
 'Industry Outlook',
 'Regulations',
 'Partnerships and Collaborations',
 'Supply Chain',
 'Economic Conditions',
 'Demand',
 'Customer',
 'Valuation',
 'Legal Issues',
 'Risk Management']

In [279]:
df_paragraphs

Unnamed: 0.1,Unnamed: 0,filename,document_id,paragraph_id,paragraph,provided_arguments
50,50,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,11,We have made minor adjustments to our fiscal 2...,"{'Growth': 'positive', 'Margins': 'positive', ..."
51,51,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,12,We have made initial projections for PGs fisca...,"{'Growth': 'positive', 'Margins': 'positive', ..."
52,52,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,13,PGs dividends have risen for 57 consecutive ye...,{'Dividend Policy': 'positive'}
53,53,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,14,PG shares have underperformed some common benc...,"{'Growth': 'negative', 'Earnings per Share': '..."
54,54,20140124_Hilliard_Lyons_PG_Report_received_in_...,574,15,"Given PGs earnings history, we believe a price...",{'Price/Earnings Ratio': 'neutral'}
...,...,...,...,...,...,...
145,145,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,25,AAPLs operating results could be affected badl...,"{'Competition': 'negative', 'Market Conditions..."
146,146,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,26,Inventory and Other Asset Risk: Product obsole...,{'Risk Management': 'negative'}
147,147,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,27,Key Components: AAPL procures its key componen...,{'Supply Chain': 'negative'}
148,148,20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf,362,28,Dependence on Third Parties Outside the US: AA...,{'Supply Chain': 'negative'}


In [280]:
df_test_document_merge = df_paragraphs

In [281]:
# Merge arguments back to one document and get rid of paragraph_id and column
#df_test_document_merge.to_csv("report_arguments_3.5_v3.csv")

In [282]:
import pandas as pd
import ast

# Initialize a list to store the exploded rows
rows = []

# Iterate through the DataFrame and expand the dictionary into rows
for index, row in df_paragraphs.iterrows():
    provided_args = row["provided_arguments"]
    # Convert string to dictionary if needed
    if isinstance(provided_args, str):
        try:
            provided_args = ast.literal_eval(provided_args)  # Convert string to dictionary
        except (ValueError, SyntaxError):
            print(f"Invalid format for provided_args at index {index}: {provided_args}")
            continue  # Skip invalid entries

    # Ensure it's a dictionary
    if isinstance(provided_args, dict):
        #print(f"Processed provided_args: {provided_args}")
        for argument, sentiment in provided_args.items():
            rows.append({
                "ID": row["document_id"],
                "Argument": argument,
                "Sentiment": sentiment
            })

# Create a new DataFrame from the exploded rows
df_result = pd.DataFrame(rows)

# Display the resulting DataFrame
print(df_result)

      ID            Argument Sentiment
0    574              Growth  positive
1    574             Margins  positive
2    574  Earnings per Share  positive
3    574   Shares Repurchase  positive
4    574              Growth  positive
..   ...                 ...       ...
166  362   Market Conditions  negative
167  362     Risk Management  negative
168  362        Supply Chain  negative
169  362        Supply Chain  negative
170  406             Margins  negative

[171 rows x 3 columns]


In [283]:
#df_result.to_csv("gpt-3.5._v2.csv")

In [284]:
#df_result_testing = pd.read_csv("gpt-4o-mini-results.csv")

In [285]:
#df_result_testing.drop(columns=["Unnamed: 0.1","Unnamed: 0"])

In [286]:
import pandas as pd
import json
from collections import Counter, defaultdict

# Initialize a dictionary to hold counts for each key and sentiment
sentiment_counts = defaultdict(lambda: Counter({'positive': 0, 'neutral': 0, 'negative': 0}))

# Loop through each entry in the provided_arguments column
for entry in df_paragraphs["provided_arguments"]:
    # Ensure the entry is a dictionary
    if isinstance(entry, str):
        try:
            # Convert string to dictionary if it's in JSON-like format
            arguments_dict = json.loads(entry.replace("'", '"'))  # Replace single quotes with double quotes for JSON
        except json.JSONDecodeError:
            continue  # Skip any rows that can't be parsed as dictionaries
    elif isinstance(entry, dict):
        arguments_dict = entry
    else:
        continue  # Skip if entry is not a dictionary or parseable string

    # Update counts for each key and sentiment
    for key, sentiment in arguments_dict.items():
        sentiment_counts[key][sentiment] += 1

# Convert the result to a DataFrame for easier viewing
sentiment_overview_df = pd.DataFrame(sentiment_counts).T
sentiment_overview_df.columns = ["positive", "neutral", "negative"]
sentiment_overview_df.index.name = "Argument"

# Display the overview DataFrame
sentiment_overview_df.reset_index(inplace=True)
sentiment_overview_df.sort_values(by=["positive","negative","neutral"],ascending=False)
#sentiment_overview_df["positive"] = sentiment_overview_df["positive"].apply(lambda x: x.int())


Unnamed: 0,Argument,positive,neutral,negative
18,Revenue,18,6,4
0,Growth,13,0,5
2,Earnings per Share,8,7,3
7,Market Share,7,1,1
1,Margins,5,4,4
21,Cash Flow,5,0,0
3,Shares Repurchase,4,0,0
19,Technology Trends,3,0,2
6,Investments,3,1,1
20,Balance Sheet,2,4,0


In [311]:
sentiment_overview_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588 entries, 0 to 1587
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Argument  1588 non-null   object 
 1   positive  1588 non-null   float64
 2   neutral   1588 non-null   float64
 3   negative  1588 non-null   float64
 4   Error     2 non-null      float64
dtypes: float64(4), object(1)
memory usage: 62.2+ KB


In [313]:
sentiment_overview_df = sentiment_overview_df.drop(columns="Error")

In [None]:
#sentiment_overview_df.to_csv("sentiment_overview_3.5-turbo_v2.csv")

from openai import OpenAI
import json

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

def find_arguments(text):
    
    amount_of_categories = len(list_of_arguments)

    categories_string = ', '.join(f'"{cat}"' for cat in list_of_arguments)

    system_content = f"""You will analyze a paragraph from a financial analyst report to identify justifications for the companies stock recommendation. Provide a minimal JSON output summarizing the arguments with the existing categories and sentiment.
 Guidelines:
    1. Argument Identification:
    - Extract only statements that serve as justifications for the stock recommendation.
    - Ignore brand names, target prices, recommendations, or valuation as arguments.
    - If no argument is found return an empty JSON
    2. Category Assignment:
    - Read the existing categories very carefully.
    - **If {amount_of_categories} is 38 or more:**
        - **Assign argument to an existing category, even if imperfect!**
        - Do NOT create a new category.
    - **If {amount_of_categories} is less than 38:**
        - Assign identified arguments to existing categories as much as possible.
        - If the argument could fit into a broader category instead of creating a new one, prefer the broader category (e.g., 'Ad Revenue'/'Mobile Revenue' should map to 'Revenue'.
        ** If you create a new Category, it must be generic and short (maximum 2 words).**
    3. Sentiment Classification:
    - Label each argument’s sentiment as 'positive', 'negative', or 'neutral' depending on the context it is used (financial perspective).
    4. Output:
    - Return a JSON (Structure: {{'Category 1': 'negative', 'Category 2': 'positive'}}).
    5. Example:
    - {{'Revenue':'positive','Dividend Policy':'negative'}}.
    Important: If you fail to comply with these category limits, your response is invalid. 
                Do not produce more categories than allowed under any circumstances.
   """
    user_content = f"""
        Existing Categories:[{categories_string}]
        Text: {text}
        Amount of Categories: {amount_of_categories} (MAXIMUM!: 38)
       """
    #print(user_content)
    response = client.chat.completions.create(
        model="gpt-4o-2024-11-20",
        messages=[
            {
                "role": "system", 
                "content":[
                    {
                        "type":"text",
                        "text":system_content
                    }
                ] 
            },
            {
                "role": "user", 
                "content":[
                    {
                        "type":"text",
                        "text":user_content
                    }
                ] 
            }
        ],
        temperature=0,
        response_format={
        "type": "json_object"
        }
    )
    #print(response)
    return response.choices[0].message.content