In [65]:
!pip install spacy tabula https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m


In [66]:
import numpy
import pandas
import scipy
print(f"numpy version: {numpy.__version__}")
print(f"pandas version: {pandas.__version__}")
print(f"scipy version: {scipy.__version__}")


numpy version: 1.26.0
pandas version: 2.2.3
scipy version: 1.11.3


In [67]:
# Packages
import os
import re
import pdfplumber
import pandas as pd
import spacy
from datetime import datetime

In [68]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the provider info CSV for ending keywords
provider_info = pd.read_csv('provider.csv')

# Load the company info CSV for ticker validation and company metadata
company_info = pd.read_csv('company_info.csv')  # Replace with the actual path

company_info = company_info.drop_duplicates(subset='Ticker Symbol')

# Create a dictionary to map ticker symbols to company name and industry
ticker_map = company_info.set_index('Ticker Symbol')[['Company Name', 'Industry']].to_dict(orient='index')

# Testing

In [85]:
import pdfplumber

def extract_words_with_formatting(page):
    """
    Extracts words along with their formatting details such as font size and font name.

    Args:
        page (pdfplumber.Page): A single page from the PDF.

    Returns:
        List[Dict]: A list of dictionaries containing words and their formatting details.
    """
    # Extract words with their bounding boxes
    words = page.extract_words(extra_attrs=["fontname", "size"])

    formatted_words = []
    for word in words:
        formatted_words.append({
            "word": word["text"],
            "font": word.get("fontname", "Unknown"),
            "size": word.get("size", "Unknown"),
            "x0": word["x0"],
            "x1": word["x1"],
            "top": word["top"],
            "bottom": word["bottom"]
        })

    return formatted_words


# Example usage with pdfplumber
pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf"

with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        print(f"Page {page_number}:")
        formatted_words = extract_words_with_formatting(page)
        for word_info in formatted_words:
            print(
                f"Word: '{word_info['word']}', Font: {word_info['font']}, Size: {word_info['size']}, "
                f"Position: ({word_info['x0']}, {word_info['top']} - {word_info['x1']}, {word_info['bottom']})"
            )


Page 1:
Word: 'January', Font: Tahoma, Size: 6.960000000000036, Position: (514.56, 42.47375999999997 - 538.4195240749391, 49.43376000000001)
Word: '27,', Font: Tahoma, Size: 6.960000000000036, Position: (540.4943634543931, 42.47375999999997 - 550.2648991301813, 49.43376000000001)
Word: '2014', Font: Tahoma, Size: 6.960000000000036, Position: (552.4943681928246, 42.47375999999997 - 567.7929826699193, 49.43376000000001)
Word: 'Colin', Font: Tahoma-Bold, Size: 9.120000000000005, Position: (505.44, 70.75872000000004 - 528.065752209794, 79.87872000000004)
Word: 'W.', Font: Tahoma-Bold, Size: 9.120000000000005, Position: (530.6400087141317, 70.75872000000004 - 542.8467696441318, 79.87872000000004)
Word: 'Gillis', Font: Tahoma-Bold, Size: 9.120000000000005, Position: (545.5200155153731, 70.75872000000004 - 567.487928752825, 79.87872000000004)
Word: 'Technology', Font: Tahoma, Size: 6.960000000000036, Position: (476.64, 81.59375999999997 - 511.78111855334976, 88.55376000000001)
Word: 'Analyst'

In [92]:
import pdfplumber

def extract_paragraphs_with_format(page, text_patterns):
    """
    Extracts paragraphs from a PDF page with filtering by font and size.

    Args:
        page (pdfplumber.Page): The PDF page object.
        text_patterns (dict): A dictionary containing font and size patterns to filter.

    Returns:
        List[Dict]: A list of dictionaries with paragraphs and their formatting details.
    """
    # Extract words with font and size details
    words = page.extract_words(extra_attrs=["fontname", "size"])

    # Round text sizes to 3 decimal places
    for word in words:
        if "size" in word and word["size"] is not None:
            word["size"] = round(word["size"], 3)

    # Sort words by vertical and horizontal position
    words.sort(key=lambda w: (w["top"], w["x0"]))

    paragraphs = []  # To store paragraphs
    current_paragraph = []  # Current paragraph being constructed
    current_top = None  # Track the top position for grouping lines

    primary_size = round(text_patterns.get("primary_text_size"), 3)
    primary_font = text_patterns.get("primary_text_font")

    for word in words:
        # Filter words by font and size
        #print(word["text"])
        #print(word["fontname"])
        #print(word["size"])
        if word["fontname"] != primary_font or word["size"] != primary_size:
            continue

        # Check if the word belongs to the same paragraph
        if current_top is None or abs(word["top"] - current_top) < 10:  # Adjust threshold for line grouping
            current_paragraph.append(word["text"])
        else:
            # New paragraph starts
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = [word["text"]]

        # Update the current top position
        current_top = word["top"]

    # Add the last paragraph
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    return paragraphs


# Example usage with a PDF file
provider_text_patterns = {
    "Provider1": {"primary_text_size": 7.92, "primary_text_font": "Tahoma"},
    "Provider2": {"primary_text_size": 10.5, "primary_text_font": "Helvetica-Bold"},
}

pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf"
provider = "Provider1"  # Adjust based on the provider
text_patterns = provider_text_patterns.get(provider)

if text_patterns:
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            paragraphs = extract_paragraphs_with_format(page, text_patterns)
            print(f"Page {page_number}:")
            for p in paragraphs:
                print(f"  Paragraph: {p}")
else:
    print(f"No text patterns defined for provider: {provider}")


Page 1:
  Paragraph: PRICE PERFORMANCE CHART The risks seem balanced, to the upside and downside, with good news priced in.
  Paragraph: Shares of Apple have produced a total return (dividends reinvested) of 42.5% since our upgrade on April 22, 2013 compared to a total return of the S&P500 index of 16.4% (dividends reinvested). We are reducing our rating to HOLD from BUY as the stock has reached our $550 price target and we expect the performance of the stock to track more in line with the broader market index.
  Paragraph: We expect Apple to post an astounding quarter with over 80 million iOS units sold, powered by the view that the company makes the best products in the space. That said, we see that the market has already broadly anticipated a record quarter, and the upside risks seem balanced with downside risks in our view. Yes, we could be giving away some upside on the reaction to the December quarter print, particularly if the company can ship 57-60 million iPhones or produce re

In [77]:
# So far working the best
import pdfplumber

def extract_paragraphs(page):
    # Extract words with their bounding boxes
    words = page.extract_words()

    # Sort words by their vertical position (top coordinate)
    words.sort(key=lambda w: (w["top"], w["x0"]))

    paragraphs = []
    current_paragraph = []
    current_top = None

    for word in words:
        # Check if the word is part of the same paragraph
        if current_top is None or abs(word["top"] - current_top) < 10:  # Adjust threshold as needed
            current_paragraph.append(word["text"])
        else:
            # New paragraph starts
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = [word["text"]]

        # Update the top position
        current_top = word["top"]

    # Add the last paragraph
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    return paragraphs

# Example usage with a PDF file
with pdfplumber.open("/Users/oskarroeske/Masterthesis/preprocessing/reports/20140203_BGC_Partners_AAPL_BGC_AAPL_01272014.pdf") as pdf:
    for page in pdf.pages:
        paragraphs = extract_paragraphs(page)
        for p in paragraphs:
            print("Paragraph:", p)


Paragraph: January 27, 2014
Paragraph: Colin W. Gillis
Paragraph: Technology Analyst / Director cgillis@bgcpartners.com 646.346.7052 cell: 917.921.8616 BGC Financial L.P.
Paragraph: Technology Research
Paragraph: Apple Inc. HOLD (AAPL, $552.19)
Paragraph: APPLE EARNINGS PREVIEW: DOWNGRADE TO HOLD AS SHARES HIT PRICE TARGET.
Paragraph: PRICE PERFORMANCE CHART The risks seem balanced, to the upside and downside, with good news priced in.
Paragraph: Shares of Apple have produced a total return (dividends reinvested) of 42.5% since our upgrade on April 22, 2013 compared to a total return of the S&P500 index of 16.4% (dividends reinvested). We are reducing our rating to HOLD from BUY as the stock has reached our $550 price target and we expect the performance of the stock to track more in line with the broader market index.
Paragraph: What about the Record Holiday Quarter?
Paragraph: We expect Apple to post an astounding quarter with over 80 million iOS units sold, powered by the view that 

In [25]:
import re

# Define a mapping for providers and their patterns
patterns = {
    "BGC Partners": {
        "primary_price_pattern": r"Price Target \(\$\) (\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) \(\w+,",
        "primary_ending_pattern" :r"Disclosures Appendix"
    },
    "Needham": {
        "primary_price_pattern": r"Price Target: \$(\d+(\,\d+)?)",
        "secondary_price_pattern": r"PRICE TARGET: \$(\d+(\,\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        "secondary_rating_pattern": r"Rating (\w+)",
        "primary_ending_pattern" :r"Analyst Certification"

    },
    "BTIG": {
        "primary_price_pattern": r"12 month target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        "primary_ending_pattern" :r"Appendix: Analyst Certification and Other Important Disclosures"

    },
    "Wells Fargo": {
        "primary_price_pattern": r"/\$(\d+(\.\d+)?)",
        "secondary_price_pattern": r"Price Target\/Prior: \$(\d+(\.\d+)?)",  
        "primary_rating_pattern": r"([A-Za-z]+)/\$",
        "secondary_rating_pattern": r"Rating (\w+)",
        "primary_ending_pattern" :r"Required Disclosures"
    },
    "Barclays": {
        "primary_price_pattern": r"Price Target USD (\d+(\.\d+)?)",
        "primary_rating_pattern": r"Stock Rating ([A-Za-z]+)",
        "primary_ending_pattern" :r"ANALYST(S) CERTIFICATION(S)"
    },
    "JP Morgan": {
        "primary_price_pattern": r"Price Target \([A-Za-z0-9]+\): \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) [A-Za-z]+",
        "primary_ending_pattern" :r"Analyst Certification"
    },
    "Brean Capital LLC": {
        "primary_price_pattern": r"PT: \$ (\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) PT:\$",
        "primary_ending_pattern" :r"Analyst Certification"
    },
    "Hilliard Lyons": {
        "primary_price_pattern": r"Price Target (\d+(\.\d+)?)",
        "primary_rating_pattern": r"-- ([A-Za-z]+) --",
        "primary_ending_pattern" :r"Analyst Certification"
    },
    "Alliance Global Partners": {
        "primary_price_pattern": r"Price Target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) \(Ticker:",
        "primary_ending_pattern" :r"Imporant Research Disclosures"
    },
    "Mizuho Securities": {
        "primary_price_pattern": r"Price Target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating ([A-Za-z]+)",
        "primary_ending_pattern" :r"IMPORTANT DISCLOSURES"
    },
    "Gilford Securities Inc": {
        "primary_price_pattern": r"\, \$(\d+(\.\d+)?)", 
        "primary_rating_pattern": r"Rated: ([A-Za-z]+)",
        "secondary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        "primary_ending_pattern" :r"ANALYST CERTIFICATION"

    },
    "Deutsche Bank": {
        "primary_price_pattern": r"Price Target \(USD\) (\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating ([A-Za-z]+)",
        "primary_ending_pattern" :r"Appendix 1"
    },
    "Pivotal Research Group": {
        "primary_price_pattern": r"Target Price: \$(\d+(,\d+)?)",
        "primary_rating_pattern": r"RATING: ([A-Za-z]+)",
        "primary_ending_pattern" :r"Appendix: Important Disclosures"
    },
    "Spartan Capital Securities LLC": {
        "primary_price_pattern": r"T \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+)",
        "primary_ending_pattern" :r"Important Disclosures"
    },
    "Cascend Securities -Historical-": {
        "primary_price_pattern": r"Price target: \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating: ([A-Za-z]+)",
        "primary_ending_pattern" :r"Disclosures: "
    },
    "Phillip Securities": {
        "primary_price_pattern": r"TARGET PRICE USD (\d+(\.\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b \(",
        "primary_ending_pattern" :r"Contact Information"
    },
    "FinTrust Investment Advisors": {
        "primary_price_pattern": r"Target Price: \$(\d+(,\d+)?)",
        "primary_rating_pattern": r"Fintrust Rating: ([A-Za-z]+)",
        "primary_ending_pattern" :r"Important Disclosures:"
    },
    "IBI Investment House":{
        "primary_price_pattern": r"Price target: \$(\d+(,\d+)?)",
        "primary_rating_pattern": r"Recommendation: ([A-Za-z]+)",
        "primary_ending_pattern" :r"Disclosures"
    }
}

In [45]:
# NEW VERSION (16.11.2024)
def extract_metadata(filename, ticker_map):
    """
    Extract metadata (date, provider, ticker) from the filename using ticker_map.
    """
    # Extract the date (first 8 digits in the filename)
    date_match = re.match(r"(\d{8})", filename)
    if not date_match:
        return None, None, None, None, None
    date_str = date_match.group(1)
    date = datetime.strptime(date_str, "%Y%m%d")

    # Look for the ticker in the filename
    for ticker in ticker_map.keys():
        ticker_pattern = f"_{ticker}_"  # Ensure ticker is surrounded by underscores
        if ticker_pattern in filename:
            # Extract the portion between date and ticker as the provider
            provider_section = filename.split(f"{date_str}_")[1].split(f"_{ticker}_")[0]
            provider = provider_section.replace('_', ' ')  # Replace underscores with spaces
            # Get company name and industry from the ticker_map
            company_name = ticker_map[ticker]['Company Name']
            industry = ticker_map[ticker]['Industry']
            return date, provider, ticker, company_name, industry

    # If no ticker is found, return None for ticker-related fields
    return date, None, None, None, None

# Define function to clean the extracted text
def clean_text(text):
    # Normalize the text to ensure uniform whitespace
    #text = re.sub(r"\s+", " ", text).strip()

    # Remove email addresses and phone numbers
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"\(\d{1,3}\)[-\s]?\d{3}[-.\s]?\d{4}", "", text)
    text = re.sub(r"This document is being provided.*?WIRT BERLIN\.", "", text, flags=re.DOTALL | re.IGNORECASE)    # Replace multiple newlines and tabs with a single space
    return text

# Define function to clean text by removing content after ending keywords
def remove_after_ending_keyword(text, provider):
    if provider not in patterns:
        raise ValueError(f"No patterns defined for provider: {provider} + text {text}")

    primary_ending_pattern = patterns[provider].get("primary_ending_pattern")
    if not primary_ending_pattern:
        raise ValueError(f"No primary Pattern defined for provider: {provider}")
    
    split_text = re.split(primary_ending_pattern,text,maxsplit=1)

    return split_text[0]

# Define function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    extracted_text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract the main text from the page
            text = page.extract_text() or ""  # Use an empty string if no text is found
            
            # Extract tables and remove their content from the text
            for table in page.extract_tables():
                # Safely handle None values in the table
                table_data = "\n".join(
                    ["\t".join(cell if cell is not None else "" for cell in row) for row in table]
                )
                if table_data in text:
                    text = text.replace(table_data, "")  # Remove table data from text
            
            extracted_text += text + "\n\n"  # Append cleaned text
    
    return extracted_text

# Define function to process all PDFs in the directory and store data in DataFrame
def extract_text_from_all_pdfs_to_dataframe(directory_path, provider_info, ticker_map):
    """Extract text from all PDFs in a specified directory and store in a DataFrame with unique ID."""
    data = []
    id_counter = 1  # Initialize an ID counter
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):  # Process only PDF files
            file_path = os.path.join(directory_path, filename)

             # Extract and clean text from PDF
            text = extract_text_from_pdf(file_path)
            
            # Extract metadata from filename
            date, provider, ticker, company_name, industry = extract_metadata(filename,ticker_map=ticker_map)
            
            text = clean_text(text)  # Apply the clean_text function
            cleaned_text = remove_after_ending_keyword(text, provider)
            
            # Add extracted data to the list
            data.append({
                "ID": id_counter,  # Unique ID
                "filename": filename,
                "date": date,
                "provider": provider,
                "ticker": ticker,
                "company_name": company_name,
                "industry": industry,
                "text": cleaned_text
            })
            id_counter += 1  # Increment the ID counter for the next row

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    return df

# Directory path
pdf_directory = "../preprocessing/reports"  # Replace with your actual folder path

# Run the function and store results in a DataFrame
pdf_df = extract_text_from_all_pdfs_to_dataframe(pdf_directory, provider_info, ticker_map)

# Display the resulting DataFrame to confirm
pdf_df.head()


Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text
0,1,20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...,2020-10-01,Pivotal Research Group,AMZN,Amazon.com Inc.,Consumer Discretionary,PIVOTAL\nU.S. Equity Research\nInternet & Medi...
1,2,20190730_FinTrust_Investment_Advisors_AMZN_Fin...,2019-07-30,FinTrust Investment Advisors,AMZN,Amazon.com Inc.,Consumer Discretionary,Special Report - Amazon.com Inc. (AMZN) FinTru...
2,3,20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...,2022-04-27,Wells Fargo,AAPL,Apple Inc.,Technology,"\nEquity Research\nCompany Update — April 20, ..."
3,4,20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf,2015-07-28,BGC Partners,AMZN,Amazon.com Inc.,Consumer Discretionary,"July 21, 2015\nColin W. Gillis\nTechnology Ana..."
4,5,20180206_BTIG_AAPL_Apple-_Inc..pdf,2018-02-06,BTIG,AAPL,Apple Inc.,Technology,"2 0-May-2010, N/A, N/A, N/A, N/A, N/A, N/A, N/..."


In [47]:
sentence = "This document is being provided for the exclusive use of OSKAR ROESKE at HOCHSCHULE FUER TECH & WIRT BERLIN."
filtered_df = pdf_df[pdf_df['text'].str.contains(sentence, case=False, na=False)]

In [48]:
filtered_df

Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text


In [46]:
# FOR TESTING
"""import pdfplumber

def extract_logical_text(pdf_path):
    logical_text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Get all characters
            chars = page.chars
            
            # Sort characters by their vertical positions (y-coordinate)
            sorted_chars = sorted(chars, key=lambda c: c["top"])
            
            # Group characters into logical blocks based on proximity
            lines = []
            current_line = []
            last_y = None
            
            for char in sorted_chars:
                if last_y is not None and abs(char["top"] - last_y) > 5:
                    # Start a new line if y-coordinates differ significantly
                    lines.append("".join([c["text"] for c in current_line]))
                    current_line = []
                
                current_line.append(char)
                last_y = char["top"]
            
            if current_line:
                lines.append("".join([c["text"] for c in current_line]))
            
            logical_text += "\n".join(lines) + "\n\n"
    
    return logical_text

# Path to your PDF
pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140502_BGC_Partners_AMZN_BGC_AMZN_0425.2014.pdf"

# Extract logically grouped text
logical_text = extract_logical_text(pdf_path)

print("Logically Grouped Text:")
print(logical_text)
"""

'import pdfplumber\n\ndef extract_logical_text(pdf_path):\n    logical_text = ""\n\n    with pdfplumber.open(pdf_path) as pdf:\n        for page in pdf.pages:\n            # Get all characters\n            chars = page.chars\n            \n            # Sort characters by their vertical positions (y-coordinate)\n            sorted_chars = sorted(chars, key=lambda c: c["top"])\n            \n            # Group characters into logical blocks based on proximity\n            lines = []\n            current_line = []\n            last_y = None\n            \n            for char in sorted_chars:\n                if last_y is not None and abs(char["top"] - last_y) > 5:\n                    # Start a new line if y-coordinates differ significantly\n                    lines.append("".join([c["text"] for c in current_line]))\n                    current_line = []\n                \n                current_line.append(char)\n                last_y = char["top"]\n            \n            if cur

In [13]:
# FOR TESTING
"""import pdfplumber
def extract_text_without_tables(pdf_path):
    extracted_text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract the main text from the page
            text = page.extract_text() or ""  # Use an empty string if no text is found
            
            # Extract tables and remove their content from the text
            for table in page.extract_tables():
                # Safely handle None values in the table
                table_data = "\n".join(
                    ["\t".join(cell if cell is not None else "" for cell in row) for row in table]
                )
                if table_data in text:
                    text = text.replace(table_data, "")  # Remove table data from text
            
            extracted_text += text + "\n\n"  # Append cleaned text
    
    return extracted_text

# Path to your PDF
pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140502_BGC_Partners_AMZN_BGC_AMZN_0425.2014.pdf"  # Replace with your actual file path

# Extract cleaned text
cleaned_text = extract_text_without_tables(pdf_path)

# Output the result
print("Cleaned Text:")
print(cleaned_text)
"""

'import pdfplumber\ndef extract_text_without_tables(pdf_path):\n    extracted_text = ""\n    \n    with pdfplumber.open(pdf_path) as pdf:\n        for page in pdf.pages:\n            # Extract the main text from the page\n            text = page.extract_text() or ""  # Use an empty string if no text is found\n            \n            # Extract tables and remove their content from the text\n            for table in page.extract_tables():\n                # Safely handle None values in the table\n                table_data = "\n".join(\n                    ["\t".join(cell if cell is not None else "" for cell in row) for row in table]\n                )\n                if table_data in text:\n                    text = text.replace(table_data, "")  # Remove table data from text\n            \n            extracted_text += text + "\n\n"  # Append cleaned text\n    \n    return extracted_text\n\n# Path to your PDF\npdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140502

In [49]:


def extract_target_and_rating(provider, text):
    if provider not in patterns:
        raise ValueError(f"No patterns defined for provider: {provider} + text {text}")

    #Get all patterns for price and rating
    primary_rating_pattern = patterns[provider].get("primary_rating_pattern")
    secondary_rating_pattern = patterns[provider].get("secondary_rating_pattern")
    primary_price_pattern = patterns[provider].get("primary_price_pattern")
    secondary_price_pattern = patterns[provider].get("secondary_price_pattern")

    # Extract rating
    rating = None
    rating_match = re.search(primary_rating_pattern, text)
    if rating_match:
        rating = rating_match.group(1)
    if rating == None:
        if secondary_rating_pattern:
            rating_match = re.search(secondary_rating_pattern,text)
            if rating_match:
                rating = rating_match.group(1)

    # Try the primary pattern
    price = None
    match = re.search(primary_price_pattern, text)
    if match:
        price = match.group(1)  # Return the first capture group

    # If no match and a secondary pattern exists, try the secondary pattern
    if price == None:
        if secondary_price_pattern:
            match = re.search(secondary_price_pattern, text)
            if match:
                price = match.group(1)  # Return the first capture group

    return rating, price

#apply to each row
def extract_info(row):
    provider = row['provider']
    text = row['text']
    rating, price = extract_target_and_rating(provider, text)
    return pd.Series({'target_price': price, 'rating': rating})

In [50]:
# Apply the function to the dataframe
pdf_df[['target_price', 'rating']] = pdf_df.apply(extract_info, axis=1)

In [51]:
pdf_df

Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text,target_price,rating
0,1,20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...,2020-10-01,Pivotal Research Group,AMZN,Amazon.com Inc.,Consumer Discretionary,PIVOTAL\nU.S. Equity Research\nInternet & Medi...,4500,BUY
1,2,20190730_FinTrust_Investment_Advisors_AMZN_Fin...,2019-07-30,FinTrust Investment Advisors,AMZN,Amazon.com Inc.,Consumer Discretionary,Special Report - Amazon.com Inc. (AMZN) FinTru...,1611,HOLD
2,3,20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...,2022-04-27,Wells Fargo,AAPL,Apple Inc.,Technology,"\nEquity Research\nCompany Update — April 20, ...",205.00,Overweight
3,4,20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf,2015-07-28,BGC Partners,AMZN,Amazon.com Inc.,Consumer Discretionary,"July 21, 2015\nColin W. Gillis\nTechnology Ana...",475,HOLD
4,5,20180206_BTIG_AAPL_Apple-_Inc..pdf,2018-02-06,BTIG,AAPL,Apple Inc.,Technology,"2 0-May-2010, N/A, N/A, N/A, N/A, N/A, N/A, N/...",198.00,BUY
...,...,...,...,...,...,...,...,...,...,...
89,90,20220121_Wells_Fargo_JPM_JPM-_Downgrade_Rating...,2022-01-21,Wells Fargo,JPM,JPMorgan Chase & Co.,Financials,"\nEquity Research\nRating Change — January 14,...",210.00,Change
90,91,20150804_BGC_Partners_AAPL_BGC_AAPL_07282015.pdf,2015-08-04,BGC Partners,AAPL,Apple Inc.,Technology,"July 28, 2015\nColin W. Gillis\nTechnology Ana...",115,HOLD
91,92,20210217_Barclays_PEP_PEP_4Q20_Earnings_Prep.pdf,2021-02-17,Barclays,PEP,PepsiCo Inc.,Consumer Staples,CORE\nEquity Research | Instant Insights\n10 F...,,
92,93,20191218_Barclays_JPM_JPM_CFO_Commentary_at_Se...,2019-12-18,Barclays,JPM,JPMorgan Chase & Co.,Financials,CORE\nEquity Research | Instant Insights\n12 D...,,


In [52]:
pdf_df.to_csv("list_of_texts2.csv")

In [128]:
import re
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_sm")

def extract_sentences(text):
    """
    Process text to split into sentences.
    """
    # Clean the text first
    cleaned_text = clean_text(text)
    
    # Parse with spaCy
    doc = nlp(cleaned_text)
    
    # Extract sentences as individual strings
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 1]
    
    return sentences

# Extract sentences
sentences = extract_sentences(pdf_df["text"][0])

# Display the sentences
for i, sentence in enumerate(sentences[:200]):  # Show the first 10 sentences for inspection
    print(f"Sentence {i+1}: {sentence}")


Sentence 1: PIVOTAL U.S. Equity Research Internet & Media Pivotal Research Group AMZN:
Sentence 2: We Think the Street – Buy- and Sell-side are September 30, 2020 Looking at SOTP Wrong (Us Included).
Sentence 3: We are Reframing Why AMZN is the Best Mega-cap on a Multi-year Basis.
Sentence 4: BOTTOM LINE:
Sentence 5: We and almost every other investor we have spoken to over the years, has been MICHAEL LEVINE framing the AMZN SOTP valuation wrong.
Sentence 6: Amazon advertising is only ~5% of revenues, but is far greater 212-514-4682 contributor to overall non-AWS EBIT margins than the street recognizes.
Sentence 7: Said differently, If advertising was viewed as a stand-along business unit (we will explain why it shouldn’t be), it would represent well north of 300% of 2020E non-AWS EBIT.
Sentence 8: Amazon (AMZN) Based on our view that there is massive upside by 2024E, we increase our PT to a Street high of $4,500 based on an average of our 2024 SOTP and 30x our 2024 EBIT “power” of $91

# Remove Outlier Words

In [129]:
"""from transformers import pipeline

# Initialize a masked language model pipeline (using BERT)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def remove_outlier_words(sentence):
    words = sentence.split()  # Split the sentence into words
    cleaned_sentence = []

    for i, word in enumerate(words):
        # Replace the current word with a [MASK] token
        masked_sentence = " ".join(words[:i] + ["[MASK]"] + words[i+1:])
        
        # Get predictions for the masked word
        predictions = fill_mask(masked_sentence)
        
        # Check if the original word is among the top predicted words
        top_predictions = [pred['token_str'] for pred in predictions]
        if word.lower() in top_predictions:
            cleaned_sentence.append(word)  # Keep the word if it fits in the context

    return " ".join(cleaned_sentence)  # Rebuild the cleaned sentence

# Example sentence
sentence = " Shipping losses continue to be an issue for 365 the company facing razor thin profitability, but it is a mild positive that losses have stabilized as Price Target ($) percent of total revenue in the last two quarters."
cleaned_sentence = remove_outlier_words(sentence)

print("Cleaned Sentence:", cleaned_sentence)"""

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a mod

Cleaned Sentence: losses continue to be an issue for the company thin but it is a that have of total revenue in the last two


In [7]:
print(pdf_df)
#pdf_df.to_csv("list_of_texts.csv")

                                             filename  \
0   20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...   
1   20190730_FinTrust_Investment_Advisors_AMZN_Fin...   
2   20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...   
3     20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf   
4                  20180206_BTIG_AAPL_Apple-_Inc..pdf   
5   20210820_Phillip_Securities_AAPL_Apple_Inc_Sup...   
6   20220803_Phillip_Securities_AAPL_Apple_Inc_Man...   
7   20150716_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
8   20171205_JP_Morgan_AMZN_Holiday_eComm_Update-_...   
9   20190731_Cascend_Securities_-Historical-_AAPL_...   
10  20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...   
11  20161017_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
12  20160510_Phillip_Securities_AAPL_Apple_Inc._Th...   
13  20170804_Needham_AMZN_Mixed_Q1-_Investors_Shou...   
14   20141028_BGC_Partners_AAPL_BGC_AAPL_10212014.pdf   
15                 20190107_BTIG_AAPL_Apple-_Inc..pdf   
16   20160803_BGC_Partners_AAPL