In [320]:
!pip install nltk gensim



In [321]:
import pandas as pd
import time
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords
from groq import Groq
import os
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
import json
from openai import OpenAI

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oskarroeske/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# EDA

In [322]:
df_reports = pd.read_csv("preprocessed_reports.csv")
df_paragraphs = pd.read_csv("preprocessed_paragraphs.csv")

In [323]:
df_paragraphs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10571 entries, 0 to 10570
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    10571 non-null  int64 
 1   document_id   10571 non-null  int64 
 2   paragraph_id  10571 non-null  int64 
 3   paragraph     10571 non-null  object
dtypes: int64(3), object(1)
memory usage: 330.5+ KB


In [324]:
def clean_text_again(text):
    text = strip_tags(text)
    text = strip_multiple_whitespaces(text)  # Normalize whitespaces
    return text

In [325]:
df_paragraphs["paragraph"] = df_paragraphs["paragraph"].apply(clean_text_again)

# LLM Argument Mining

- Provide List of pre-defined arguments

In [326]:
list_of_arguments = [
    "Growth",
    "Price/Earnings Ratio",
    "Earnings per Share",
    "Cash Flow",
    "Revenue",
    "Return On Equity",
    "Margins",
    "Cost Management",
    "Dividend Policy",
    "Investments",
    "Balance Sheet",
    "Long-term Growth",
    "Mergers and Acquisition",
    "Refranchising",
    "Sustainability",
    "Employees",
    "Research and Development",
    "Marketing",
    "Shares Repurchase",
    "Processes",
    "Leadership",
    "Innovation",
    "Product Characteristics",
    "Pricing Strategy",
    "Production",
    "Technology Trends",
    "Market Share",
    "Market Conditions",
    "Market Expansion",
    "Competition",
    "Global Presence",
    "Industry Outlook",
    "Regulations",
    "Partnerships and Collaborations",
    "Supply Chain",
    "Economic Conditions",
    "Demand",
    "Customers"
]


In [327]:
# Create client
load_dotenv()
api_key = os.getenv("LLAMA_API_KEY")

client = OpenAI(
    api_key = api_key,
    base_url = "https://api.llama-api.com"
    )


def find_arguments(text):
    categories_string = ', '.join(f'"{cat}"' for cat in list_of_arguments)

    system_content = f"""You will analyze a paragraph from a financial analyst report to identify and classify justifications for the companies stock recommendation. Provide only a minimal JSON output summarizing the arguments with categories and sentiment, don't send any other text!
 Guidelines:
    1. Argument Identification:
    - Extract only statements that serve as justifications for the stock recommendation.
    - Ignore brand names, target prices, recommendations, or valuation as arguments.
    - If no argument is found return ONLY an empty JSON
    2. Category Assignment:
    - Read the existing categories carefully.
    - Assign identified arguments to existing categories as much as possible.
    - If the argument could fit into a broader category instead of creating a new one, prefer the broader category (e.g., 'Ad Revenue'/'Mobile Revenue' should map to 'Revenue').
    - Only create a new category, if no similar or broader category is existing
    - A new Category has to be generic and short (MAXIMUM 2 words) and distinct from existing ones.
    3. Sentiment Classification:
    - Label each argument’s sentiment with the words 'positive', 'negative', or 'neutral' depending on the context it is used (financial perspective).
    4. Output:
    - Don't provide any text! 
    - Structure: {{'Category 1': 'negative', 'Category 2': 'positive'}})
    5. Output Examples:
     Example 1 {{'Revenue':'positive','Dividend Policy':'negative'}}.
     Example 2 {{'Investments':'neutral','Leadership':'positive'}}.
   """
    final_prompt = f"""
        Existing Categories:[{categories_string}]
        Text: {text}
       """

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_content,
            },
            {
                "role": "user",
                "content": final_prompt,
            }
        ],
        temperature=0,
        model="llama3.1-8b",
    )
    print(response)

    return response.choices[0].message.content

In [None]:
import json
from itertools import cycle


def extract_arguments_and_sentiment(commentary):
    global list_of_arguments  # Ensure we can modify the global categories list
    
    # Call the find_arguments function with the selected API key
    result_json = find_arguments(commentary)   
    
    try:
        # Parse the JSON response
        result_dict = json.loads(result_json)
        
        # Check for new categories and add them to the global list
        for category in result_dict.keys():
            if category not in list_of_arguments:
                list_of_arguments.append(category)  # Dynamically update the categories list
        
        time.sleep(2)
        return result_dict  # Return the result
    except json.JSONDecodeError:
        print("Failed to decode JSON response.")
        return None


In [329]:
df_paragraphs["provided_arguments"] = df_paragraphs["paragraph"].apply(extract_arguments_and_sentiment)

ChatCompletion(id=None, choices=[Choice(finish_reason='eos', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "Revenue": "positive",\n  "Growth": "positive",\n  "Market Conditions": "positive",\n  "Investments": "neutral"\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1734275318, model='llama3.1-8b', object=None, service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=1077, total_tokens=1141, completion_tokens_details=None, prompt_tokens_details=None))
ChatCompletion(id=None, choices=[Choice(finish_reason='eos', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "Growth": "positive"\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1734275326, model='llama3.1-8b', object=None, service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=35, prompt_tokens=612, total_tokens=647, completion

KeyboardInterrupt: 

In [None]:
import pandas as pd
import ast

# Initialize a list to store the exploded rows
rows = []

# Iterate through the DataFrame and expand the dictionary into rows
for index, row in df_paragraphs.iterrows():
    provided_args = row["provided_arguments"]
    # Convert string to dictionary if needed
    if isinstance(provided_args, str):
        try:
            provided_args = ast.literal_eval(provided_args)  # Convert string to dictionary
        except (ValueError, SyntaxError):
            print(f"Invalid format for provided_args at index {index}: {provided_args}")
            continue  # Skip invalid entries

    # Ensure it's a dictionary
    if isinstance(provided_args, dict):
        #print(f"Processed provided_args: {provided_args}")
        for argument, sentiment in provided_args.items():
            rows.append({
                "ID": row["document_id"],
                "Argument": argument,
                "Sentiment": sentiment
            })

# Create a new DataFrame from the exploded rows
df_result = pd.DataFrame(rows)

# Display the resulting DataFrame
print(df_result)

    ID                         Argument Sentiment
0    1                           Growth  positive
1    1             Price/Earnings Ratio   neutral
2    1                     Market Share  positive
3    1                Market Conditions  positive
4    1                Market Conditions  negative
5    1                      Competition  negative
6    1                      Regulations  negative
7    2                          Revenue  positive
8    2                       Ad Revenue  positive
9    2                           Growth  negative
10   2       Expenses as a % of Revenue  positive
11   2                          Revenue  positive
12   2                          Revenue  positive
13   2                           Growth  positive
14   2                Market Conditions  positive
15   2                       Leadership  positive
16   2                      Investments  positive
17   2                        Cash Flow  positive
18   2                        Employees  positive


In [None]:
import pandas as pd
import json
from collections import Counter, defaultdict

# Initialize a dictionary to hold counts for each key and sentiment
sentiment_counts = defaultdict(lambda: Counter({'positive': 0, 'neutral': 0, 'negative': 0}))

# Loop through each entry in the provided_arguments column
for entry in df_paragraphs["provided_arguments"]:
    # Ensure the entry is a dictionary
    if isinstance(entry, str):
        try:
            # Convert string to dictionary if it's in JSON-like format
            arguments_dict = json.loads(entry.replace("'", '"'))  # Replace single quotes with double quotes for JSON
        except json.JSONDecodeError:
            continue  # Skip any rows that can't be parsed as dictionaries
    elif isinstance(entry, dict):
        arguments_dict = entry
    else:
        continue  # Skip if entry is not a dictionary or parseable string

    # Update counts for each key and sentiment
    for key, sentiment in arguments_dict.items():
        sentiment_counts[key][sentiment] += 1

# Convert the result to a DataFrame for easier viewing
sentiment_overview_df = pd.DataFrame(sentiment_counts).T
sentiment_overview_df.columns = ["positive", "neutral", "negative"]
sentiment_overview_df.index.name = "Argument"

# Display the overview DataFrame
sentiment_overview_df.reset_index(inplace=True)
sentiment_overview_df.sort_values(by=["positive","negative","neutral"],ascending=False)


Unnamed: 0,Argument,positive,neutral,negative
6,Revenue,3,0,1
0,Growth,2,0,2
3,Market Conditions,2,0,2
7,Ad Revenue,2,0,0
8,Expenses as a % of Revenue,1,1,0
2,Market Share,1,0,0
9,Leadership,1,0,0
10,Investments,1,0,0
11,Cash Flow,1,0,0
12,Employees,1,0,0
