In [40]:
!pip install ace_tools spacy tabula https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m


In [41]:
# Packages
import os
import re
import pdfplumber
import numpy
import pandas as pd
import spacy
from datetime import datetime
from dateutil.parser import parse
import ast

In [42]:
# Load Relevant Files and models
nlp = spacy.load("en_core_web_sm")
provider_info = pd.read_csv('provider_info.csv')
company_info = pd.read_csv('company_info.csv')

company_info = company_info.drop_duplicates(subset='Ticker Symbol')

# Create a dictionary to map ticker symbols to company name and industry
ticker_map = company_info.set_index('Ticker Symbol')[['Company Name', 'Industry']].to_dict(orient='index')

In [43]:
"""# Only relevant to check font type and size for
import pdfplumber

def extract_words_with_formatting(page):

    # Extract words with their bounding boxes
    words = page.extract_words(extra_attrs=["fontname", "size"])

    formatted_words = []
    for word in words:
        formatted_words.append({
            "word": word["text"],
            "font": word.get("fontname", "Unknown"),
            "size": word.get("size", "Unknown"),
            "x0": word["x0"],
            "x1": word["x1"],
            "top": word["top"],
            "bottom": word["bottom"]
        })
    return formatted_words


# Example usage with pdfplumber
pdf_path = "/Users/oskarroeske/Desktop/Analyst_Reports/Production/20140116_Brean_Capital_SIG_SIG-_Some_Pyrite_Mixed_In_Among_the_Diamonds-_Tweaking.pdf"

with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        print(f"Page {page_number}:")
        formatted_words = extract_words_with_formatting(page)
        for word_info in formatted_words:
            print(
                f"Word: '{word_info['word']}', Font: {word_info['font']}, Size: {word_info['size']}, "
                f"Position: ({word_info['x0']}, {word_info['top']} - {word_info['x1']}, {word_info['bottom']})"
            )"""

'# Only relevant to check font type and size for\nimport pdfplumber\n\ndef extract_words_with_formatting(page):\n\n    # Extract words with their bounding boxes\n    words = page.extract_words(extra_attrs=["fontname", "size"])\n\n    formatted_words = []\n    for word in words:\n        formatted_words.append({\n            "word": word["text"],\n            "font": word.get("fontname", "Unknown"),\n            "size": word.get("size", "Unknown"),\n            "x0": word["x0"],\n            "x1": word["x1"],\n            "top": word["top"],\n            "bottom": word["bottom"]\n        })\n    return formatted_words\n\n\n# Example usage with pdfplumber\npdf_path = "/Users/oskarroeske/Desktop/Analyst_Reports/Production/20140116_Brean_Capital_SIG_SIG-_Some_Pyrite_Mixed_In_Among_the_Diamonds-_Tweaking.pdf"\n\nwith pdfplumber.open(pdf_path) as pdf:\n    for page_number, page in enumerate(pdf.pages, start=1):\n        print(f"Page {page_number}:")\n        formatted_words = extract_words_w

In [44]:
# define patterns per provider for target price, ending (disclosures), rating, font type and size

patterns = {
    "APP Securities Pty Ltd": {
        "price_patterns": {
            "primary": r"TARGET PRICE (NA|A\$(\d+(\.\d+)?))",
        },
        "rating_patterns": {
            "primary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        },
        "ending_patterns": [r"Analyst Certification"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?", "font_size": 8.04},
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?", "font_size": 8.25},
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?", "font_size": 9.00},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Alliance Global Partners": {
        "price_patterns": {
            "primary": r"Price Target (NA|\$(\d+(\.\d+)?))",
        },
        "rating_patterns": {
            "primary": r"\b(Buy|Hold|Sell|Overweight|Underperform)\b",
        },
        "ending_patterns": [r"Imporant Research Disclosures"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]ArialMT(-BoldMT)?", "font_size": 8.00},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Barclays": {
        "price_patterns": {
            "primary": r"Price Target USD (\d+(\.\d+)?)",
            "secondary": r"Price Target: USD (\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Stock Rating ([A-Za-z]+)",
            "secondary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|OVERPERFORM|UNDERPERFORM|UNDERWEIGHT|NEUTRAL)\b",
            "tertiary": r"\b(Buy|Hold|Sell|Overweight|Overperform|Underperform|Underweight|Neutral)\b",
        },
        "ending_patterns": [r"ANALYST\(S\) CERTIFICATION\(S\)",
                            r"Analyst\(s\) Certification\(s\)"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Expert Sans (Extra Bold)?(Regular)?(Regular,Bold)?", "font_size": 9.0},
            {"font_type": r"[A-Z]+[+]Expert Sans (Extra Bold)?(Regular)?", "font_size": 8.04},
            {"font_type": r"[A-Z]+[+]SourceSansPro(-Bold)?(-Regular)?", "font_size": 9.00},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 9.01},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.58},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.52},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 8.52},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 8.75},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.60},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.11},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 6.52},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.02},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 7.03},
        ]
    },
    "BGC Partners": {
        "price_patterns": {
            "primary": r"Price Target \(\$\) (\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"([A-Za-z]+) \(\w+,",
        },
        "ending_patterns": [r"Disclosures Appendix"],
        "font_patterns": [
            {"font_type": r"Tahoma(-Bold)?", "font_size": 7.92},
            {"font_type": r"Tahoma(-Bold)?", "font_size": 10.08},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Brean Capital": {
        "price_patterns": {
            "primary": r"PT: \$ (\d+(\.\d+)?)",
            "secondary": r"TP: \$(\d+(\.\d+)?)",

        },
        "rating_patterns": {
            "primary": r"\b(Buy|Hold|Sell|Overweight|Underperform)\b"
        },
        "ending_patterns": [r"Analyst Certification",r"Important Disclosures "],
        "font_patterns": [
            {"font_type": r"Tahoma(-Bold)?", "font_size": 7.92},
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?(-Italic)?", "font_size": 8.00},
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?(-Italic)?", "font_size": 9.00},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "BTIG": {
        "price_patterns": {
            "primary": r"\$(\d+(\.\d+)?) 12 month target ",
        },
        "rating_patterns": {
            "primary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        },
        "ending_patterns": [r"Appendix: Analyst Certification and Other Important Disclosures"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Corbel(,Bold)?(,-Italic)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]Corbel(,Bold)?(,-Italic)?", "font_size": 9.00},
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?(,Italic)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?(-Italic)?", "font_size": 10.0},
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?(,Italic)?", "font_size": 9.00},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Cascend Securities -Historical-": {
        "price_patterns": {
            "primary": r"Price target: \$(\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Rating: ([A-Za-z]+)",
        },
        "ending_patterns": [r"Disclosures: "],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Calibri(,Bold)?", "font_size": 12.0},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Deutsche Bank": {
        "price_patterns": {
            "primary": r"Price Target \(USD\) (\d+(\.\d+)?)",
            "secondary": r"Price target (\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"\b(Buy|Hold|Sell|Overweight|Underperform|Underweight|Neutral)\b",
            #"secondary": r"Rating ([A-Za-z]+)",
        },
        "ending_patterns": [r"Appendix 1"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]UniversDeutscheBank-Regular", "font_size": 9.0},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "FinTrust Investment Advisors": {
        "price_patterns": {
            "primary": r"Target Price: \$(\d+(,\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Fintrust Rating: ([A-Za-z]+)",
        },
        "ending_patterns": [r"Important Disclosures:"],
        "font_patterns": [
            {"font_type": r"Arial(-BoldMT)?", "font_size": 7.92},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Gilford Securities Inc": {
        "price_patterns": {
            "primary": r"\, \$(\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Rated: ([A-Za-z]+)",
            "secondary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
            "tertiary": r"\b(Buy|Hold|Sell|Overweight|Underperform|Underweight|Neutral)\b",
        },
        "ending_patterns": [r"ANALYST CERTIFICATION",r"REQUIRED DISCLOSURES"],
        "font_patterns": [
            {"font_type":  r"Arial(MT)?(-BoldMT)?", "font_size": 10.02},
            {"font_type":  r"Arial(MT)?(-BoldMT)?", "font_size": 10.01},
            {"font_type": "Not Available", "font_size": 10.00},
        ]
    },
    "Hilliard Lyons": {
        "price_patterns": {
            "primary": r"Price Target (NA|\$(\d+(\.\d+)?))",
        },
        "rating_patterns": {
            "primary": r"-- NYSE\s+[–\-—]+\s+([A-Za-z]+)\s+[–\-—]+",
            "secondary":r"NYSE\s+[–\-—]+\s+([A-Za-z\- ]+?)(?=\s*[-–—]\d)"
        },
        "ending_patterns": [r"Analyst Certification I,"],
        "font_patterns": [
            {"font_type": r"Verdana(-Bold)?(-Italic)?", "font_size": 9.00},
            {"font_type": r"Verdana(-Bold)?(-Italic)?", "font_size": 9.00},
            {"font_type":r"TimesNewRomanPS(-BoldMT)?", "font_size": 10.98},
            {"font_type":r"Times(-Bold)?", "font_size": 10.98},
            ]
    },
    "IBI Investment House": {
        "price_patterns": {
            "primary": r"Price target: \$(\d+(,\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Recommendation: ([A-Za-z]+)",
        },
        "ending_patterns": [r"Disclosures"],
        "font_patterns": [
            {"font_type": r"Tahoma(-Bold)?", "font_size": 7.92},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "IFS Securities": {
        "price_patterns": {
            "primary": r"Price Target: \$(\d+(,\d+)?)",
        },
        "rating_patterns": {
            "primary": r"(Buy|Hold|Sell|Overweight|Underperform|Outperform)",
        },
        "ending_patterns": [r"Important Investor Disclosures"],
        "font_patterns": [
            {"font_type": r"Arial(-BoldMT)?", "font_size": 11.04},
            {"font_type":r"TimesNewRomanPS(MT)?(-BoldMT)?", "font_size": 11.04},
        ]
    },
    "JP Morgan": {
        "price_patterns": {
            "primary": r"Price Target: \$(\d+(\.\d+)?)",
            "secondary": r"Price Target \([A-Za-z0-9\-]+\): \$(\d+(\.\d+)?)"
        },
        "rating_patterns": {
            "primary": r"\b(Buy|Hold|Sell|Overweight|Underperform|Underweight|Neutral)\b",
        },
        "ending_patterns": [r"Analyst Certification:","Important Disclosures"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]TimesNewRoman(,Bold)?(,Italic)?", "font_size": 9.60},
            {"font_type": r"[A-Z]+[+]TimesNewRoman(,Bold)?(,Italic)?", "font_size": 10.08},
            {"font_type": r"[A-Z]+[+]TimesNewRomanPS(MT)?(-BoldMT)?(-ItalicMT)?", "font_size": 11.04},
            {"font_type": r"[A-Z]+[+]TimesNewRomanPS(MT)?(-BoldMT)?(-ItalicMT)?", "font_size": 10.00},
            {"font_type": r"[A-Z]+[+]TimesNewRoman(,Bold)?(,Italic)?", "font_size": 10.56},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Mizuho Securities": {
        "price_patterns": {
            "primary": r"Price Target \$(\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"Rating ([A-Za-z]+)",
        },
        "ending_patterns": [r"IMPORTANT DISCLOSURES"],
        "font_patterns": [
            {"font_type": r"Tahoma(-Bold)?", "font_size": 7.92},
            {"font_type": r"Times-Roman(-Bold)?(-Italic)?(-BoldItalic)?", "font_size": 10.50},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Needham": {
        "price_patterns": {
            "primary": r"Price Target: \$(\d+(\.\d+)?)",
            "secondary": r"PRICE TARGET: \$(\d+(\.\d+)?)",
            "tertiary": r"Price Target \$(\d+(\.\d+)?)",

        },
        "rating_patterns": {
            "primary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
            "secondary": r"Rating (\w+)",
        },
        "ending_patterns": [r"Analyst Certification"
                            r"ANALYST CERTIFICATION"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Cambria(-Bold)?(-Regular)?", "font_size": 10.0},
            {"font_type": r"[A-Z]+[+]Cambria(-Bold)?(-Regular)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]KeplerStd(-Bold)?", "font_size": 9.0},
        ]
    },
    "Phillip Securities": {
        "price_patterns": {
            "primary": r"TARGET PRICE USD (\d+(\.\d+)?)",
            "secondary": r"U\s?S\s?D\s?(\d+(?:\s?\.\s?\d+)?)\s?T\s?A\s?R\s?G\s?E\s?T\s?P\s?R\s?I\s?C\s?E",
        },
        "rating_patterns": {
            "primary": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
            "secondary": r"\b(B U Y|H O L D|S E L L|O V E R W E I G H T|U N D E R P E R F O R M| N E U T R A L | O U T P E R F O R M)\b",
        },
        "ending_patterns": [r"Contact Information"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?", "font_size": 10.0},
            {"font_type": r"[A-Z]+[+]Calibri(-Bold)?", "font_size": 9.99},
        ]
    },
    "Pivotal Research Group": {
        "price_patterns": {
            "primary": r"Target Price: \$(\d+(,\d+)?)",
        },
        "rating_patterns": {
            "primary": r"RATING: ([A-Za-z]+)",
        },
        "ending_patterns": [r"Appendix: Important Disclosures"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]Helvetica(-Bold)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]Helvetica(-Bold)?", "font_size": 9.96},
            {"font_type": r"[A-Z]+[+]Helvetica(-Bold)?", "font_size": 9.00},
            {"font_type": r"[A-Z]+[+]Helvetica(-Bold)?", "font_size": 8.99},
            {"font_type": r"Arial(MT)?(-BoldMT)?", "font_size": 8.04},
            {"font_type": r"Arial(MT)?(-BoldMT)?", "font_size": 8.03},
            {"font_type": r"Arial(MT)?(-BoldMT)?", "font_size": 9.00},
            {"font_type": r"[A-Z]+[+]Arial", "font_size": 9.00},
        ]
    },
    "Spartan Capital Securities LLC": {
        "price_patterns": {
            "primary": r"T \$(\d+(\.\d+)?)",
        },
        "rating_patterns": {
            "primary": r"([A-Za-z]+)",
        },
        "ending_patterns": [r"Important Disclosures"],
        "font_patterns": [
            {"font_type": r"Tahoma(-Bold)?", "font_size": 7.92},
            {"font_type": "Not Available", "font_size": 10.0},
        ]
    },
    "Wells Fargo": {
        "price_patterns": {
            "primary": r"\/Price Target: \$(\d+(\.\d+)?)",
            "secondary": r"Price Target\/Prior: \$(\d+(\.\d+)?)",
            #"tertiary": r"\/\$(\d+(\.\d+)?)"
        },
        "rating_patterns": {
            "primary": r"\/\b(Buy|Hold|Sell|Overweight|Underperform|Underweight|Neutral)\b",
            "secondary": r"Rating (\w+)",
            "tertiary": r"([A-Za-z]+)/\$",
        },
        "ending_patterns": [r"Required Disclosures"],
        "font_patterns": [
            {"font_type": r"[A-Z]+[+]WellsFargoSans(-Light)?(,-SemiBold)?", "font_size": 9.00},
            {"font_type": r"[A-Z]+[+]Verdana(-Bold)?", "font_size": 8.04},
            {"font_type": r"[A-Z]+[+]Verdana(-Bold)?", "font_size": 9.88},
            {"font_type": r"[A-Z]+[+]DejaVuSans(-Bold)?", "font_size": 9.01}
        ]
    }
}



In [45]:
def check_validity(paragraph):
    # Parse the paragraph
    paragraph = re.sub(r"\s+", " ", paragraph).strip()
    doc = nlp(paragraph)
        
    for sent in doc.sents:
        has_verb = False
        has_subject = False
        
        for token in sent:
            # Check for a verb
            if token.pos_ in {"VERB", "AUX"}:
                has_verb = True
            # Check for a subject
            if token.dep_ in {"nsubj", "nsubjpass"}:
                has_subject = True
        
        # If both a verb and a subject are found, the sentence is valid
        if has_verb and has_subject:
            return True
        
        # At least one word with 5+ letters and all upper case -> probably a headline (will also be included)
        if re.search(r"[A-Z]{5,}", paragraph):
            return True
    return False

def filter_valid_paragraphs(paragraphs):
    valid_paragraphs = []

    for paragraph in paragraphs:
        if check_validity(paragraph):
            valid_paragraphs.append(paragraph)
    return valid_paragraphs

# Function to check if a word ends a sentence
def is_sentence_end(word):
    text = word["text"]
    next_text = word.get("next_text", "")

    # Sentences definitely end with ! or ?, for . has to be checked further
    if text.endswith(".") or text.endswith("!") or text.endswith("?"):
        # Ensure it's not part of a decimal number
        if next_text:
            if text.replace(".", "").isdigit() and next_text.isdigit():
                return False
            # Check if the next word starts with an uppercase letter (for ".")
            if text.endswith(".") and next_text and not next_text.istitle():
                return False
            return True
    return False

def extract_text_with_format(page, provider, page_number,url_date):

    if isinstance(url_date, datetime):
        url_date = url_date.date()

    date = None

    # Access patterns
    provider_patterns = patterns[provider]
    price_patterns = provider_patterns["price_patterns"]
    rating_patterns = provider_patterns["rating_patterns"]
    ending_patterns = provider_patterns["ending_patterns"]
    font_patterns = provider_patterns["font_patterns"]

    # Extract words with font and size details
    words = page.extract_words(extra_attrs=["fontname", "size"])

    # Handle empty words
    if not words:
        return [], False, None, None, date

    # Round text sizes to 2 decimal places -> for calculation
    for word in words:
        if "size" in word and word["size"] is not None:
            word["size"] = round(word["size"], 2)

    # Sort words by vertical and horizontal position
    words.sort(key=lambda w: (w["top"], w["x0"]))

    paragraphs = []
    current_paragraph = []
    current_top = None
    lookahead_buffer = []
    rating = None
    price = None
    extracted_date = None

    # Add next word context for sentence-ending logic
    for i in range(len(words) - 1):
        words[i]["next_text"] = words[i + 1]["text"]
    words[-1]["next_text"] = None 


    # Check all words of the document
    for word in words:
        
        # Build lookahead buffer
        lookahead_buffer.append(word["text"])
        if len(lookahead_buffer) > 35:
            lookahead_buffer.pop(0)
        buffer_text = " ".join(lookahead_buffer)

        # Extract rating or price (only for page 1) -> more efficient, because rating etc. not relevant
        if page_number == 1:
            # Extract rating
            if not rating:
                for pattern_key in ["primary", "secondary", "tertiary"]:
                    pattern = rating_patterns.get(pattern_key)
                    if pattern:
                        rating_match = re.search(pattern, buffer_text)
                        if rating_match:
                            rating = rating_match.group(1)
                            break

            # Extract price
            if not price:
                for pattern_key in ["primary", "secondary", "tertiary"]:
                    pattern = price_patterns.get(pattern_key)
                    if pattern:
                        price_match = re.search(pattern, buffer_text)
                        if price_match:
                            price = price_match.group(1)
                            break

            # Look for date entities in the text
            if len(buffer_text.split()) > 30:
                if date is None:
                    sequence = nlp(buffer_text)
                    for ent in sequence.ents:
                        if ent.label_ == "DATE":
                            try:
                                candidate_date = parse(ent.text, fuzzy=True).date()
                        
                                # Check if date is date is valid
                                if datetime.min.date() <= candidate_date <= datetime.max.date():
                                    extracted_date = candidate_date
                                    break  # Stop if date is found

                            except (ValueError, OverflowError):
                                continue  # Skip invalid dates
                    
                    # Calculate the difference
                    date_difference = 0
                    if extracted_date:
                        date_difference = (url_date - extracted_date).days
                        
                    # Check the conditions (should only be taken if within 10 days)
                    if extracted_date and (0 <= date_difference <= 10):
                        date = extracted_date
                        
        # Check for ending pattern
        for ending_pattern in ending_patterns:
            if re.search(ending_pattern, buffer_text):
                return filter_valid_paragraphs(paragraphs), True, rating, price, date
        
        # Match word against font patterns, include only if both font and type are matched
        is_font_matched = False
        for font_pattern in font_patterns:
            font_type = font_pattern["font_type"]
            font_size = font_pattern["font_size"]
            if re.match(font_type, word["fontname"]) and word["size"] == font_size:
                is_font_matched = True
                break

        if not is_font_matched:
            continue

        # Check if we need to start a new paragraph, vertical distance relevant
        if current_top is not None and abs(word["top"] - current_top) >= 13:
            if word['text'] and word['text'][0].islower():
                # add word to current paragraph
                current_paragraph.append(word["text"])
            else:
                #end paragraph and start a new one
                paragraphs.append(" ".join(current_paragraph))
                current_paragraph = [word["text"]]
        else:
            # Continue the current paragraph
            current_paragraph.append(word["text"])


        current_top = word["top"]

    #Add paragraph to list of paragraphs
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    return filter_valid_paragraphs(paragraphs), False, rating, price, date


def extract_metadata(filename, ticker_map):
    
    # Extract the date (first 8 digits in the filename)
    date_match = re.match(r"(\d{8})", filename)
    if not date_match:
        return None, None, None, None, None
    date_str = date_match.group(1)
    date = datetime.strptime(date_str, "%Y%m%d")

    # Look for the ticker in the filename
    for ticker in ticker_map.keys():
        ticker_pattern = f"_{ticker}_"  # Ensure ticker is surrounded by underscores
        if ticker_pattern in filename:
            # Extract the portion between date and ticker as the provider
            provider_section = filename.split(f"{date_str}_")[1].split(f"_{ticker}_")[0]
            provider = provider_section.replace('_', ' ')  # Replace underscores with spaces
            # Get company name and industry from the ticker_map
            company_name = ticker_map[ticker]['Company Name']
            industry = ticker_map[ticker]['Industry']
            return date, provider, ticker, company_name, industry

    # If no ticker is found, return None for ticker-related fields
    return date, None, None, None, None


In [46]:
# Define function to process all PDFs in the directory and store data in DataFrame
def extract_text_from_all_pdfs_to_dataframe(directory_path, ticker_map):
    data = []
    id_counter = 1  # Initialize an ID counter
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):  # Process only PDF files
            file_path = os.path.join(directory_path, filename)

            # Extract metadata from filename
            url_date, provider, ticker, company_name, industry = extract_metadata(filename,ticker_map=ticker_map)
            all_paragraphs = []  
            first_rating = None
            first_price = None
            first_date = None
            stop_extraction = False

            with pdfplumber.open(file_path) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    if stop_extraction:
                        break  # Exit the loop if stop_extraction is set
            
                    # Extract data from the current page
                    paragraphs, stop_extraction, rating, price,new_date = extract_text_with_format(page, provider=provider,page_number=page_number,url_date=url_date)
                    
                    # Append paragraphs from the current page
                    all_paragraphs.extend(paragraphs)

                    # Capture the first non-None rating and price
                    if rating is not None and first_rating is None:
                        first_rating = rating
                    if price is not None and first_price is None:
                        first_price = price

                    if new_date is not None and first_date is None:
                        first_date = new_date
                        
            if first_date is None:
                first_date = url_date.date()
                        
            # Add extracted data to the list
            data.append({
                "document_id": id_counter,  # Unique ID
                "filename": filename,
                "date": first_date,
                "provider": provider,
                "ticker": ticker,
                "company_name": company_name,
                "industry": industry,
                "paragraphs": all_paragraphs,
                "target_price":first_price.replace(" ","") if first_price else None,
                "rating": first_rating.lower().replace(" ","") if first_rating else None
                })
            id_counter += 1  # Increment the ID counter for the next row

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    return df

# Directory path
pdf_directory = "/Users/oskarroeske/Desktop/Analyst_Reports/Production"

# Run the function and store results in a DataFrame
df_reports = extract_text_from_all_pdfs_to_dataframe(pdf_directory, ticker_map)



# Cleaning of paragraphs

In [47]:
import re

def clean_paragraph(paragraph):
    # Remove email addresses
    paragraph = re.sub(r"\S+@\S+", "", paragraph)
    
    # Remove phone numbers
    paragraph = re.sub(r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}(?:\s?(?:ext|x|ext.)\s?\d{1,5})?\b", "", paragraph)
    
    # Remove URLs
    paragraph = re.sub(r"http\S+|www\S+", "", paragraph)
    
    # REmove special characters (keep relevant sign for financial documents)
    paragraph = re.sub(r"[^/\w\s,.&!?%$:-]", "", paragraph)
    
    # Remove multiple spaces or newlines
    paragraph = re.sub(r"\s+", " ", paragraph).strip()

    #Check Headers that are in line with the other texts
    paragraph = re.sub(r"\b(?:[A-Z]{2,}\s+){2,}[A-Z]{2,}\b", "", paragraph)
    
    # Remove short paragraphs (fewer than 6 words) -> probably not an actual paragraph (subjectively set)
    if len(paragraph.split()) < 6:
        return None
    
    return paragraph

# Clean paragaphs
df_reports['paragraphs'] = df_reports['paragraphs'].apply(
    lambda paragraphs: [clean_paragraph(p) for p in paragraphs if clean_paragraph(p)]
)

# Drop rows where the `paragraphs` column is empty after cleaning
df_reports = df_reports[df_reports['paragraphs'].str.len() > 0]

# Convert rating to string
df_reports["rating"] = df_reports["rating"].astype(str).apply(lambda x: x.lower() if x != "nan" else None)

# Get Date and Price for Dates

In [48]:
# Load already prepared data from yahoo finance
df_performance_data = pd.read_csv("performance_data.csv")

# Convert 'Date' column to datetime and set it as index (with timezone awareness)
df_performance_data['Date'] = pd.to_datetime(df_performance_data['Date'], utc=True)
df_performance_data = df_performance_data.set_index('Date')

In [49]:
df_saved_reports = df_reports

In [50]:
def get_stock_prices(ticker, start_date, end_date=None):
    try:
        # Ensure start_date and end_date are in the same timezone (UTC)
        start_date = pd.to_datetime(start_date, utc=True)
        if end_date is not None:
            end_date = pd.to_datetime(end_date, utc=True)
        else:
            end_date = start_date  # If no end_date, use start_date for a single day

        # Filter the data for the ticker and date range
        filtered_data = df_performance_data.loc[
            (df_performance_data.index >= start_date) & 
            (df_performance_data.index <= end_date), ticker]

        if not filtered_data.empty:
            # Return the maximum & minimum price within the filtered date range -> used for accuracy for buy/sell recommendations
            max_price = filtered_data.max()
            min_price = filtered_data.min()
            return round(float(max_price),2), round(float(min_price),2)
        else:
            # If no data available in the range, find the next available date, asof selects the next available date, after the start date
            next_available_data = df_performance_data[ticker].asof(start_date)
            if next_available_data is not None:
                return round(float(next_available_data),2), round(float(next_available_data),2)
            else:
                return round(float('nan')), round(float('nan'))  # Return NaN for missing data

    except KeyError:
        return float('nan'), float('nan')  # Return NaN for missing data
    except Exception as e:
        return float('nan'), float('nan')  # Return NaN for missing data

def calculate_prices(row):
    short_name = row['ticker']
    base_date = pd.to_datetime(row['date'], utc=True)

    # Calculate prices
    row['start price'] = get_stock_prices(short_name, base_date)[0]
    row['one day after'] = get_stock_prices(short_name, base_date + pd.DateOffset(days=1))[0]
    row['max price after 3 months'], row["min price after 3 months"] = get_stock_prices(
        short_name, base_date, base_date + pd.DateOffset(months=3)
    )
    row['max price after 6 months'], row['min price after 6 months'] = get_stock_prices(
        short_name, base_date + pd.DateOffset(months=3), base_date + pd.DateOffset(months=6)
    )
    row['max price after 9 months'], row['min price after 9 months'] = get_stock_prices(
        short_name, base_date + pd.DateOffset(months=6), base_date + pd.DateOffset(months=9)
    )
    row['max price after 12 months'], row['min price after 12 months'] = get_stock_prices(
        short_name, base_date + pd.DateOffset(months=9), base_date + pd.DateOffset(months=12)
    )
    return row

# Apply the function to each row
df_saved_reports = df_saved_reports.apply(calculate_prices, axis=1)


In [51]:
df_saved_reports.value_counts("rating")

rating
overweight      275
buy             219
hold            143
none            101
neutral          50
equal            36
underweight      33
v                16
underperform     11
weight           11
sell              7
unchanged         6
outperform        3
basis             2
long-termbuy      2
change            1
b                 1
Name: count, dtype: int64

In [52]:
# Manually updated some missing data and imported it back to the notebook for further steps
df_updated_reports = pd.read_csv("updated_target_prices_ratings.csv")
df_updated_reports['date'] = pd.to_datetime(df_updated_reports['date'], format="%Y-%m-%d")
df_updated_reports['date'] = df_updated_reports['date'].dt.date

df_updated_reports["paragraphs"] = df_updated_reports["paragraphs"].apply(ast.literal_eval)
df_updated_reports = df_updated_reports[["filename","target_price","rating"]]
df_updated_reports

Unnamed: 0,filename,target_price,rating
0,20161018_Needham_META_Facebook-_3Q16_Preview_R...,150.00,buy
1,20200807_Wells_Fargo_META_FB-_2.5B_Person_Plat...,300,overweight
2,20200918_Barclays_GM_General_Motors-_Time_to_t...,39.00,overweight
3,20200722_Barclays_BKNG_Booking_Holdings_Inc.-_...,1950.00,overweight
4,20211104_Deutsche_Bank_X_US_Steel-_3Q21_EBITDA...,50.00,buy
...,...,...,...
912,20150522_Gilford_Securities_Inc_FL_Report_rece...,,buy
913,20200526_Wells_Fargo_HD_HD-_Q2_Rollover_Falls_...,270,overweight
914,20220125_Deutsche_Bank_WFC_Wells_Fargo-_More_C...,55.00,buy
915,20200212_Barclays_GM_General_Motors-_Waiting_p...,44.00,overweight


In [53]:
# Merge manually updated data back to existing df (target prices and ratings were manually checked)
df_merged = df_saved_reports.merge(df_updated_reports, on='filename', how='left', suffixes=('', '_new'))
columns_to_update = ['target_price', 'rating']
for col in columns_to_update:
    df_merged[col] = df_merged[f'{col}_new'].combine_first(df_merged[col])

df_merged = df_merged.drop(columns=[f'{col}_new' for col in columns_to_update])

In [54]:
df_saved_reports = df_merged

# Adjust target price for companies that had a split

In [55]:
df_saved_reports["target_price"] = (
    df_saved_reports["target_price"]
    .replace(",", "", regex=True)  # Remove commas
    .pipe(pd.to_numeric, errors="coerce")  # Convert to float; invalid values become NaN
)

In [56]:
import pandas as pd
from datetime import datetime

# based on Bloomberg data
stock_splits = {
    "AAPL": [
        {"split_ratio": 7, "ex_date": "2014-06-09"},
        {"split_ratio": 4, "ex_date": "2020-08-31"}
    ],
    "AMZN": [
        {"split_ratio": 20, "ex_date": "2022-06-06"}
    ],
    "TSLA": [
        {"split_ratio": 5, "ex_date": "2020-08-31"},
        {"split_ratio": 3, "ex_date": "2022-08-25"}
    ],
    "GOOGL": [
        {"split_ratio": 20, "ex_date": "2022-07-18"},
        {"split_ratio": 1.0027455, "ex_date": "2015-04-27"},
        {"split_ratio": 2.002, "ex_date": "2014-03-27"},
    ],
    "NVDA": [
        {"split_ratio": 10, "ex_date": "2024-06-10"},
        {"split_ratio": 4, "ex_date": "2021-07-20"}
    ],
    "NKE": [
        {"split_ratio": 2, "ex_date": "2015-12-24"}
    ],
    "V": [
        {"split_ratio": 4, "ex_date": "2015-03-19"}
    ],
    "MA": [
        {"split_ratio": 10, "ex_date": "2014-01-22"}
    ],
    "WMT": [
        {"split_ratio": 3, "ex_date": "2024-02-26"}
    ],
    "SBUX": [
        {"split_ratio": 2, "ex_date": "2015-04-09"}
    ],
    "UNP": [
        {"split_ratio": 2, "ex_date": "2014-06-09"}
    ],
    "UAA": [
        {"split_ratio": 2, "ex_date": "2014-04-15"}
    ],
    "NBR": [
        {"split_ratio": 0.02, "ex_date": "2020-04-23"}
    ],
}

def adjust_target_price(target_price, report_date, ticker, stock_splits):
    if pd.isna(target_price):
        return None  # Handle NaN target prices
      
    splits = stock_splits.get(ticker, [])
    
    # update target price based on timeframe of the split
    for split in splits:
        split_date = datetime.strptime(split["ex_date"], "%Y-%m-%d").date()
        if split_date > report_date:
            target_price /= split["split_ratio"]
    
    return round(target_price,2)

# Apply the function to the DataFrame
df_saved_reports["adjusted_target_price"] = df_saved_reports.apply(
    lambda row: adjust_target_price(
        target_price=row["target_price"],
        report_date=row["date"],
        ticker=row["ticker"],
        stock_splits=stock_splits
    ),
    axis=1
)

df_saved_reports


Unnamed: 0,document_id,filename,date,provider,ticker,company_name,industry,paragraphs,target_price,rating,...,one day after,max price after 3 months,min price after 3 months,max price after 6 months,min price after 6 months,max price after 9 months,min price after 9 months,max price after 12 months,min price after 12 months,adjusted_target_price
0,1,20161018_Needham_META_Facebook-_3Q16_Preview_R...,2016-10-11,Needham,META,Meta Platforms Inc.,Communication Services,[INVESTMENT HIGHLIGHTS: $150.00 We raise our e...,150.0,buy,...,129.05,133.28,115.05,142.65,126.09,155.27,139.39,173.51,155.27,150.00
1,2,20200807_Wells_Fargo_META_FB-_2.5B_Person_Plat...,2020-07-31,Wells Fargo,META,Meta Platforms Inc.,Communication Services,"[Solid 2Q Beat, E-Commerce Initiatives Remain ...",300.0,overweight,...,253.67,303.91,248.15,294.68,245.64,329.51,254.69,373.28,302.55,300.00
2,3,20200918_Barclays_GM_General_Motors-_Time_to_t...,2020-09-14,Barclays,GM,General Motors Co.,Automobile,[CEO Mary Barra earlier today presented at a c...,39.0,overweight,...,31.58,46.46,28.74,59.26,40.51,63.92,53.76,61.76,48.18,39.00
3,4,20200722_Barclays_BKNG_Booking_Holdings_Inc.-_...,2020-07-16,Barclays,BKNG,Booking Holdings Inc,Travel,[The Key Takeaway: BKNGs 2Q results shouldnt b...,1950.0,overweight,...,1732.19,1948.73,1638.47,2281.54,1604.13,2476.90,1886.09,2505.10,2144.72,1950.00
4,5,20211104_Deutsche_Bank_X_US_Steel-_3Q21_EBITDA...,2021-10-28,Deutsche Bank,X,United States Steel,Materials,"[3Q21 FCF of $1.3bn, ND down 46% QoQ, boosts S...",50.0,buy,...,26.39,26.88,18.59,38.45,19.54,32.23,17.02,25.84,18.12,50.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,913,20150522_Gilford_Securities_Inc_FL_Report_rece...,2015-05-22,Gilford Securities Inc,FL,Foot Locker,Clothing,[Investment Opinion: Foot Locker continues to ...,,buy,...,63.46,74.12,62.00,75.76,58.04,69.14,60.65,67.25,54.77,
913,914,20200526_Wells_Fargo_HD_HD-_Q2_Rollover_Falls_...,2020-05-19,Wells Fargo,HD,Home Depot Inc.,Consumer Discretionary,"[With Q1 strength continuing into May, seeing ...",270.0,overweight,...,238.19,288.24,238.10,291.93,265.70,285.08,261.72,341.12,250.93,270.00
914,915,20220125_Deutsche_Bank_WFC_Wells_Fargo-_More_C...,2022-01-17,Deutsche Bank,WFC,Wells Fargo & Co.,Financials,[WFC posted a solid and mostly in-line 4Q deta...,55.0,buy,...,56.69,59.06,45.81,48.65,37.43,46.14,40.01,47.95,40.68,55.00
915,916,20200212_Barclays_GM_General_Motors-_Waiting_p...,2020-02-06,Barclays,GM,General Motors Co.,Automobile,[NEUTRAL Industry View Solid execution and ong...,44.0,overweight,...,33.63,35.49,16.80,30.68,21.46,37.47,26.62,55.86,37.47,44.00


# Calculate performance of Target Prices

In [57]:
#If the adjusted_target_price is below the current price, we need to check if it was below the adjusted_target_price, basically handling buy/sell/hold recommendations
def classifcy_performance(df):
    if df["adjusted_target_price"] > df["start price"]:
        df["tp reached after 3 months"] = df["adjusted_target_price"] <= df["max price after 3 months"]
        df["tp reached after 6 months"] = df["adjusted_target_price"] <= df["max price after 6 months"]
        df["tp reached after 9 months"] = df["adjusted_target_price"] <= df["max price after 9 months"]
        df["tp reached after 12 months"] = df["adjusted_target_price"] <= df["max price after 12 months"]
    else:
        df["tp reached after 3 months"] = df["adjusted_target_price"] >= df["min price after 3 months"]
        df["tp reached after 6 months"] = df["adjusted_target_price"] >= df["min price after 6 months"]
        df["tp reached after 9 months"] = df["adjusted_target_price"] >= df["min price after 9 months"]
        df["tp reached after 12 months"] = df["adjusted_target_price"] >= df["min price after 12 months"]

    return df

df_saved_reports = df_saved_reports.apply(classifcy_performance, axis=1)

In [58]:
df_saved_reports.to_csv("reports.csv")

# Clean/Filter Paragraphs

In [None]:
df_saved_reports.to_csv("/Users/oskarroeske/Masterthesis/argument_extraction/preprocessed_reports.csv")

In [60]:
df_saved_reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917 entries, 0 to 916
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   document_id                 917 non-null    int64  
 1   filename                    917 non-null    object 
 2   date                        917 non-null    object 
 3   provider                    917 non-null    object 
 4   ticker                      917 non-null    object 
 5   company_name                917 non-null    object 
 6   industry                    917 non-null    object 
 7   paragraphs                  917 non-null    object 
 8   target_price                706 non-null    float64
 9   rating                      917 non-null    object 
 10  start price                 917 non-null    float64
 11  one day after               917 non-null    float64
 12  max price after 3 months    917 non-null    float64
 13  min price after 3 months    917 non

In [61]:
# Explode the cleaned list into separate rows
df_exploded = df_saved_reports.explode('paragraphs').reset_index(drop=True)
df_exploded = df_exploded[["filename","document_id","provider","ticker","date","industry","paragraphs"]]
df_exploded['paragraph_id'] = df_exploded.groupby('document_id').cumcount() + 1

In [62]:
df_exploded

Unnamed: 0,filename,document_id,provider,ticker,date,industry,paragraphs,paragraph_id
0,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,Needham,META,2016-10-11,Communication Services,INVESTMENT HIGHLIGHTS: $150.00 We raise our es...,1
1,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,Needham,META,2016-10-11,Communication Services,We expect mobile ad revenue to represent appro...,2
2,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,Needham,META,2016-10-11,Communication Services,We raise our estimates for FY16 as we expect a...,3
3,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,Needham,META,2016-10-11,Communication Services,We raise our estimates for FY17 and now expect...,4
4,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,Needham,META,2016-10-11,Communication Services,We are buyers of FB based on our belief that d...,5
...,...,...,...,...,...,...,...,...
13235,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,Pivotal Research Group,GOOGL,2014-01-21,Communication Services,Investors will need to consider the following ...,25
13236,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,Pivotal Research Group,GOOGL,2014-01-21,Communication Services,Much of online advertising is highly competiti...,26
13237,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,Pivotal Research Group,GOOGL,2014-01-21,Communication Services,SMEs have been the core . segment of marketers...,27
13238,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,Pivotal Research Group,GOOGL,2014-01-21,Communication Services,A looming threat for all web publishers relate...,28


# Turn paragraphs into sentences

In [63]:
import spacy
import pandas as pd
from spacy.language import Language
from spacy.pipeline import Sentencizer

# Load spaCy language model
nlp = spacy.load("en_core_web_sm")

# Add a customized sentence boundary detection component to spaCy
@Language.component("custom_sentencizer")
def custom_sentencizer(doc):

    for token in doc[:-1]:
        # Always split at '!' or '?'
        if token.text in ["!", "?"]:
            doc[token.i + 1].is_sent_start = True

        # Split at '.' only if not part of a number or abbreviation
        if token.text == ".":
            next_token = token.nbor(1) if token.i + 1 < len(doc) else None
            prev_token = token.nbor(-1) if token.i - 1 >= 0 else None
            prev_prev_token = token.nbor(-2) if token.i - 2 >= 0 else None
            
            # Check if next token starts a title-case word
            if next_token and next_token.is_title:
                doc[token.i + 1].is_sent_start = True
            
            # Prevent splitting for abbreviations
            if prev_token and prev_token.text.lower() in {"mr", "ms", "dr", "etc", "e.g", "adj", "sr"}:
                doc[token.i + 1].is_sent_start = False
            
            # Prevent splitting within numbers (e.g., 3.14)
            if prev_token and prev_token.like_num and next_token and next_token.like_num:
                doc[token.i + 1].is_sent_start = False

            # Prevent splitting if there are at least two uppercase words before the period
            if (
                prev_prev_token and prev_prev_token.text.isupper() and
                prev_token and prev_token.text.isupper() and
                next_token and next_token.is_lower
            ):
                doc[token.i + 1].is_sent_start = False

    return doc
# Add the custom sentencizer to the pipeline
nlp.add_pipe("custom_sentencizer", before="parser")

# Function to split paragraphs into sentences
def split_into_sentences(text):
    if not isinstance(text, str):
        return []  # Return an empty list for non-string values
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

result = []

def check_sentence_validity(sentence):
    sentence = re.sub(r"\s+", " ", sentence).strip()
    has_verb = False
    has_subject = False

    sentence = nlp(sentence)
        
    for token in sentence:
        # Check for a verb
        if token.pos_ in {"VERB", "AUX"}:
            has_verb = True
        # Check for a subject
        if token.dep_ in {"nsubj", "nsubjpass"}:
            has_subject = True
 
    # If both a verb and a subject are found, the sentence is valid
    if has_verb and has_subject:
        return True
    return False

# Split each paragraph into sentences
for _, row in df_exploded.iterrows():
    filename = row['filename']
    doc_id = row['document_id']
    paragraph_id = row['paragraph_id']
    text = row['paragraphs']
    sentences = split_into_sentences(text)
    for sentence_id, sentence in enumerate(sentences, start=1):
        is_valid_sentence = check_sentence_validity(sentence)
        # Check if the sentence has more than 5 words
        if len(sentence.split()) >= 5 and is_valid_sentence:
            result.append({
                'filename':filename,
                'document_id': doc_id,
                'paragraph_id': paragraph_id,
                'sentence_id': sentence_id,
                'sentence': sentence
            })

# Create a new DataFrame with the results
df_preprocessed_sentences = pd.DataFrame(result)

# Filter sentences from the table of contents -> caused issues before
df_preprocessed_sentences = df_preprocessed_sentences[~df_preprocessed_sentences['sentence'].str.contains(r"\.{5,}", na=False)]

In [64]:
df_preprocessed_sentences

Unnamed: 0,filename,document_id,paragraph_id,sentence_id,sentence
0,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,1,1,INVESTMENT HIGHLIGHTS: $150.00 We raise our es...
1,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,1,2,We now expect 3Q16 revenue of $6.855B up 52% y...
2,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,1,3,"FB will report 3Q16 earnings on Wednesday, Nov..."
3,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,1,4,"The call in number is , ID ."
4,20161018_Needham_META_Facebook-_3Q16_Preview_R...,1,1,6,Advertising We maintain our Buy rating and $15...
...,...,...,...,...,...
36459,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,27,6,Changes in that industrys presence or its reli...
36460,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,28,1,A looming threat for all web publishers relate...
36461,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,28,2,Rules could be established in the future which...
36462,20140122_Pivotal_Research_Group_GOOGL_GOOG-_Gr...,917,29,1,Google has become so .


In [None]:
df_preprocessed_sentences.to_csv("/Users/oskarroeske/Masterthesis/argument_extraction/preprocessed_sentences.csv")

In [66]:
# Concatenate sentences back into paragraphs
df_preprocessed_paragraphs = (
    df_preprocessed_sentences
    .groupby(['filename','document_id','paragraph_id'])  # Group by paragraph_id
    .agg({'sentence': ' '.join})  # Concatenate sentences within each group
    .reset_index()  # Reset index to keep paragraph_id as a column
)

# Rename the column to 'paragraph' for clarity
df_preprocessed_paragraphs.rename(columns={'sentence': 'paragraph'}, inplace=True)

In [67]:
df_preprocessed_paragraphs

Unnamed: 0,filename,document_id,paragraph_id,paragraph
0,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,1,"Last night, ahead of the 2014 CES, NVDA hosted..."
1,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,2,GameStream works hand in hand with the PC gami...
2,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,3,NVDA sought to bridge the gap between PC and m...
3,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,4,Recognizing the shift to advanced automobile f...
4,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,5,NVDA has spent heavily on its mobile applicati...
...,...,...,...,...
12568,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,1,Even though available at quite at a lag with t...
12569,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,2,"Coming back to the October data, International..."
12570,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,3,International shipments Apple till October hav...
12571,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,4,5G mix accounted for 80% of total shipments in...


In [68]:
paragraph_counts = df_preprocessed_paragraphs.groupby("document_id").size().reset_index(name="paragraph_count")

In [69]:
paragraph_counts[paragraph_counts["paragraph_count"]==1]

Unnamed: 0,document_id,paragraph_count
51,52,1
593,595,1
690,692,1


In [70]:
df_preprocessed_paragraphs[df_preprocessed_paragraphs["document_id"]==692]

Unnamed: 0,filename,document_id,paragraph_id,paragraph
2655,20170302_BTIG_PRGO_Perrigo_Company_plc.pdf,692,1,Shares of PRGO were down more than 10% after t...


In [71]:
df_saved_reports[df_saved_reports["document_id"]==584]

Unnamed: 0,document_id,filename,date,provider,ticker,company_name,industry,paragraphs,target_price,rating,...,min price after 6 months,max price after 9 months,min price after 9 months,max price after 12 months,min price after 12 months,adjusted_target_price,tp reached after 3 months,tp reached after 6 months,tp reached after 9 months,tp reached after 12 months
583,584,20150323_Needham_PRGO_PRGO-_RX_Metrics-_PRGO_S...,2015-03-16,Needham,PRGO,Perrigo,Healthcare,[Enclosed within the following report are week...,,hold,...,177.07,182.39,145.59,149.45,124.08,,False,False,False,False


In [None]:
df_preprocessed_paragraphs.to_csv("/Users/oskarroeske/Masterthesis/argument_extraction/preprocessed_paragraphs.csv")

In [73]:
df_preprocessed_paragraphs

Unnamed: 0,filename,document_id,paragraph_id,paragraph
0,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,1,"Last night, ahead of the 2014 CES, NVDA hosted..."
1,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,2,GameStream works hand in hand with the PC gami...
2,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,3,NVDA sought to bridge the gap between PC and m...
3,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,4,Recognizing the shift to advanced automobile f...
4,20140114_Needham_NVDA_NVDA-_Compelling_Technol...,512,5,NVDA has spent heavily on its mobile applicati...
...,...,...,...,...
12568,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,1,Even though available at quite at a lag with t...
12569,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,2,"Coming back to the October data, International..."
12570,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,3,International shipments Apple till October hav...
12571,20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...,203,4,5G mix accounted for 80% of total shipments in...


In [74]:
from transformers import GPT2Tokenizer
import pandas as pd

In [75]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [76]:
df_preprocessed_paragraphs['token_count'] = df_preprocessed_paragraphs['paragraph'].apply(lambda x: len(tokenizer.tokenize(x)))

In [77]:
# Find the maximum token count
max_tokens = df_preprocessed_paragraphs['token_count'].max()
print(f"Maximum tokens in the column: {max_tokens}")

Maximum tokens in the column: 1009


In [78]:
# Calculate df_preprocessed_paragraphs
percentiles = df_preprocessed_paragraphs['token_count'].quantile([0.25, 0.5, 0.75, 0.9,0.95,0.97,0.98,0.99,1.0])

# Display percentiles
print("Percentiles for token counts:")
print(percentiles)

Percentiles for token counts:
0.25      37.00
0.50      73.00
0.75     127.00
0.90     194.00
0.95     249.00
0.97     298.00
0.98     338.56
0.99     420.28
1.00    1009.00
Name: token_count, dtype: float64
