In [660]:
!pip install nltk gensim openai



In [674]:
import pandas as pd
import time
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords
from groq import Groq
import os
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oskarroeske/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# EDA

In [686]:
df_reports = pd.read_csv("preprocessed_reports.csv")
df_paragraphs = pd.read_csv("preprocessed_paragraphs.csv")

In [687]:
df_paragraphs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    916 non-null    int64 
 1   filename      916 non-null    object
 2   document_id   916 non-null    int64 
 3   paragraph_id  916 non-null    int64 
 4   paragraph     916 non-null    object
dtypes: int64(3), object(2)
memory usage: 35.9+ KB


In [688]:
def clean_text_again(text):
    text = strip_tags(text)
    text = strip_multiple_whitespaces(text)  # Normalize whitespaces
    return text

In [678]:
df_paragraphs["paragraph"] = df_paragraphs["paragraph"].apply(clean_text_again)

# LLM Argument Mining

In [679]:
list_of_arguments = [
    "Growth",
    "Price/Earnings Ratio",
    "Earnings per Share",
    "Cash Flow",
    "Revenue",
    "Return On Equity",
    "Margins",
    "Cost Management",
    "Dividend Policy",
    "Investments",
    "Balance Sheet",
    "Long-term Growth",
    "Mergers and Acquisition",
    "Refranchising",
    "Sustainability",
    "Employees",
    "Research and Development",
    "Marketing",
    "Shares Repurchase",
    "Processes",
    "Leadership",
    "Innovation",
    "Product Characteristics",
    "Pricing Strategy",
    "Production",
    "Technology Trends",
    "Market Share",
    "Market Conditions",
    "Market Expansion",
    "Competition",
    "Global Presence",
    "Industry Outlook",
    "Regulations",
    "Partnerships and Collaborations",
    "Supply Chain",
    "Economic Conditions",
    "Demand",
    "Customers"
]


In [680]:
len(list_of_arguments)

38

In [681]:
from openai import OpenAI
import json

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

def find_arguments(text):
    
    amount_of_categories = len(list_of_arguments)

    categories_string = ', '.join(f'"{cat}"' for cat in list_of_arguments)

    system_content = f"""You will analyze a paragraph from a financial analyst report to identify and classify justifications for the companies stock recommendation. Provide a minimal JSON output summarizing the arguments with categories and sentiment.
 Guidelines:
    1. Argument Identification:
    - Extract only statements that serve as justifications for the stock recommendation.
    - Ignore brand names, target prices, recommendations, or valuation as arguments.
    - If no argument is found return an empty JSON
    2. Category Assignment:
    - Read the existing categories carefully.
    - **If {amount_of_categories} is 40 or more:**
        - Do NOT create a new category.
        - Assign argument to an existing category, even if imperfect
    - **If {amount_of_categories} is less than 40:**
        - Assign identified arguments to existing categories as much as possible.
        - If the argument could fit into a broader category instead of creating a new one, prefer the broader category (e.g., 'Ad Revenue'/'Mobile Revenue' should map to 'Revenue').
        - Create a new category ONLY if no broader category is existing
        - A new Category MUST be generic and short (MAXIMUM 2 words) and distinct from existing ones.
    3. Sentiment Classification:
    - Label each argument’s sentiment as 'positive', 'negative', or 'neutral' depending on the context it is used (financial perspective).
    4. Output:
    - Return a JSON (Structure: {{'Category 1': 'negative', 'Category 2': 'positive'}}).
    5. Examples:
     Example 1 {{'Revenue':'positive','Dividend Policy':'negative'}}.
     Example 2 {{'Investments':'neutral','Leadership':'positive'}}.
   """
    user_content = f"""
        Existing Categories:[{categories_string}]
        Text: {text}
        Amount of Categories: {amount_of_categories} (Maximum: 40)
       """
    print(user_content)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system", 
                "content":[
                    {
                        "type":"text",
                        "text":system_content
                    }
                ] 
            },
            {
                "role": "user", 
                "content":[
                    {
                        "type":"text",
                        "text":user_content
                    }
                ] 
            }
        ],
        temperature=0,
        response_format={
        "type": "json_object"
        }
    )
    #print(response)
    return response.choices[0].message.content

In [682]:
df_paragraphs=df_paragraphs[0:50]

In [683]:
df_paragraphs

Unnamed: 0.1,Unnamed: 0,filename,document_id,paragraph_id,paragraph
0,0,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,1,"Last night, ahead of the 2014 CES, NVDA hosted..."
1,1,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,2,GameStream works hand in hand with the PC gami...
2,2,20140116_Brean_Capital_SIG_SIG-_Some_Pyrite_Mi...,909,1,We are reducing our 4QFY14 and FY15 estimates ...
3,3,20140116_Brean_Capital_SIG_SIG-_Some_Pyrite_Mi...,909,2,Holiday November-December comp of 5% easily be...
4,4,20140116_Brean_Capital_URBN_URBN-_Ups_-_Downs-...,717,1,We are reiterating our Hold rating and estimat...
5,5,20140116_Brean_Capital_URBN_URBN-_Ups_-_Downs-...,717,2,Holiday comp of 3% was below the Streets expec...
6,6,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,1,"200 Park Ave., West Mezzanine New York, NY 101..."
7,7,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,2,Investors will need to consider the following ...
8,8,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,3,Much of online advertising is highly competiti...
9,9,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,4,SMEs have been the core . We believe they have...


In [684]:
def extract_arguments_and_sentiment(commentary):
    global list_of_arguments  # Ensure we can modify the global categories list
    
    # Call the find_arguments function with the selected API key
    result_json = find_arguments(commentary)   
    
    try:
        # Parse the JSON response
        result_dict = json.loads(result_json)
        
        # Check for new categories and add them to the global list
        for category in result_dict.keys():
            if category not in list_of_arguments:
                list_of_arguments.append(category)  # Dynamically update the categories list
        
        time.sleep(2)
        return result_dict  # Return the result
    except json.JSONDecodeError:
        print("Failed to decode JSON response.")
        return None


In [685]:
df_paragraphs["provided_arguments"] = df_paragraphs["paragraph"].apply(extract_arguments_and_sentiment)


        Existing Categories:["Growth", "Price/Earnings Ratio", "Earnings per Share", "Cash Flow", "Revenue", "Return On Equity", "Margins", "Cost Management", "Dividend Policy", "Investments", "Balance Sheet", "Long-term Growth", "Mergers and Acquisition", "Refranchising", "Sustainability", "Employees", "Research and Development", "Marketing", "Shares Repurchase", "Processes", "Leadership", "Innovation", "Product Characteristics", "Pricing Strategy", "Production", "Technology Trends", "Market Share", "Market Conditions", "Market Expansion", "Competition", "Global Presence", "Industry Outlook", "Regulations", "Partnerships and Collaborations", "Supply Chain", "Economic Conditions", "Demand", "Customers"]
        Text: Last night, ahead of the 2014 CES, NVDA hosted a press conference where CEO Jen-Hsen Huang took the cover off of several next-generation Consumer Semiconductors products and platforms. The press conference, thematically centered around Games, Chips and Cars, left little d

In [562]:
gpt_4o_mini_list_of_arguments

['Growth',
 'Price/Earnings Ratio',
 'Earnings per Share',
 'Cash Flow',
 'Revenue',
 'Return On Equity',
 'Margins',
 'Cost Management',
 'Dividend Policy',
 'Investments',
 'Balance Sheet',
 'Long-term Growth',
 'Mergers and Acquisition',
 'Refranchising',
 'Sustainability',
 'Employees',
 'Research and Development',
 'Marketing',
 'Shares Repurchase',
 'Processes',
 'Leadership',
 'Innovation',
 'Product Characteristics',
 'Pricing Strategy',
 'Production',
 'Technology Trends',
 'Market Share',
 'Market Conditions',
 'Market Expansion',
 'Competition',
 'Global Presence',
 'Industry Outlook',
 'Regulations',
 'Partnerships and Collaborations',
 'Supply Chain',
 'Economic Conditions',
 'Demand',
 'Customers',
 'User Engagement',
 'Ad Revenue',
 'Expenses',
 'Debt Management',
 'Share Buyback',
 'Risk',
 'Advertiser Strength',
 'Collaboration',
 'Quality',
 'Costs',
 'Pricing',
 'Profitability',
 'Risks',
 'Working Capital',
 'Inventory',
 'Prepayments',
 'Market Positioning',
 'Core

In [547]:
df_paragraphs

Unnamed: 0.1,Unnamed: 0,document_id,paragraph_id,paragraph,provided_arguments
0,0,1,1,: $150.00 We raise our estimates for 3Q16 as w...,"{'Revenue': 'positive', 'Growth': 'positive', ..."
1,1,1,2,Our $150 target price embeds a 10-year OIBDA g...,{}
2,2,1,3,"FB is the largest social network, with 1.65B u...","{'Market Share': 'positive', 'User Engagement'..."
3,3,1,4,Risks to our thesis and target price include a...,"{'Market Conditions': 'negative', 'Regulations..."
4,4,2,1,FB reported relatively strong 2Q resultsrevenu...,"{'Revenue': 'positive', 'Growth': 'negative', ..."
...,...,...,...,...,...
10566,10566,923,2,Investors will need to consider the following ...,{}
10567,10567,923,3,Much of online advertising is highly competiti...,"{'Competition': 'negative', 'Investments': 'po..."
10568,10568,923,4,SMEs have been the core . We believe they have...,"{'Revenue': 'positive', 'Market Conditions': '..."
10569,10569,923,5,A looming threat for all web publishers relate...,"{'Regulations': 'negative', 'Revenue': 'negati..."


In [553]:
import pandas as pd
import ast

# Initialize a list to store the exploded rows
rows = []

# Iterate through the DataFrame and expand the dictionary into rows
for index, row in df_paragraphs.iterrows():
    provided_args = row["provided_arguments"]
    # Convert string to dictionary if needed
    if isinstance(provided_args, str):
        try:
            provided_args = ast.literal_eval(provided_args)  # Convert string to dictionary
        except (ValueError, SyntaxError):
            print(f"Invalid format for provided_args at index {index}: {provided_args}")
            continue  # Skip invalid entries

    # Ensure it's a dictionary
    if isinstance(provided_args, dict):
        #print(f"Processed provided_args: {provided_args}")
        for argument, sentiment in provided_args.items():
            rows.append({
                "ID": row["document_id"],
                "Argument": argument,
                "Sentiment": sentiment
            })

# Create a new DataFrame from the exploded rows
df_result = pd.DataFrame(rows)

# Display the resulting DataFrame
print(df_result)

        ID           Argument Sentiment
0        1            Revenue  positive
1        1             Growth  positive
2        1        Investments  positive
3        1  Market Conditions  positive
4        1        Competition  positive
...    ...                ...       ...
24549  923            Revenue  positive
24550  923  Market Conditions  negative
24551  923        Regulations  negative
24552  923            Revenue  negative
24553  923        Regulations  negative

[24554 rows x 3 columns]


In [555]:
df_result

Unnamed: 0,ID,Argument,Sentiment
0,1,Revenue,positive
1,1,Growth,positive
2,1,Investments,positive
3,1,Market Conditions,positive
4,1,Competition,positive
...,...,...,...
24549,923,Revenue,positive
24550,923,Market Conditions,negative
24551,923,Regulations,negative
24552,923,Revenue,negative


In [None]:
import pandas as pd
import json
from collections import Counter, defaultdict

# Initialize a dictionary to hold counts for each key and sentiment
sentiment_counts = defaultdict(lambda: Counter({'positive': 0, 'neutral': 0, 'negative': 0}))

# Loop through each entry in the provided_arguments column
for entry in df_paragraphs["provided_arguments"]:
    # Ensure the entry is a dictionary
    if isinstance(entry, str):
        try:
            # Convert string to dictionary if it's in JSON-like format
            arguments_dict = json.loads(entry.replace("'", '"'))  # Replace single quotes with double quotes for JSON
        except json.JSONDecodeError:
            continue  # Skip any rows that can't be parsed as dictionaries
    elif isinstance(entry, dict):
        arguments_dict = entry
    else:
        continue  # Skip if entry is not a dictionary or parseable string

    # Update counts for each key and sentiment
    for key, sentiment in arguments_dict.items():
        sentiment_counts[key][sentiment] += 1

# Convert the result to a DataFrame for easier viewing
sentiment_overview_df = pd.DataFrame(sentiment_counts).T
sentiment_overview_df.columns = ["positive", "neutral", "negative"]
sentiment_overview_df.index.name = "Argument"

# Display the overview DataFrame
sentiment_overview_df.reset_index(inplace=True)
sentiment_overview_df.sort_values(by=["positive","negative","neutral"],ascending=False)


ValueError: Length mismatch: Expected axis has 1276 elements, new values have 3 elements