In [3]:
!pip install spacy https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl


Collecting en-core-web-sm==3.5.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)


In [4]:
import numpy
import pandas
import scipy
print(f"numpy version: {numpy.__version__}")
print(f"pandas version: {pandas.__version__}")
print(f"scipy version: {scipy.__version__}")


numpy version: 1.26.0
pandas version: 2.2.3
scipy version: 1.11.3


In [5]:
# Packages
import os
import re
import pdfplumber
import pandas as pd
import spacy
from datetime import datetime

In [9]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the provider info CSV for ending keywords
provider_info = pd.read_csv('provider.csv')

# Load the company info CSV for ticker validation and company metadata
company_info = pd.read_csv('company_info.csv')  # Replace with the actual path

company_info = company_info.drop_duplicates(subset='Ticker Symbol')

# Create a dictionary to map ticker symbols to company name and industry
ticker_map = company_info.set_index('Ticker Symbol')[['Company Name', 'Industry']].to_dict(orient='index')

In [107]:
def extract_metadata(filename, ticker_map):
    """
    Extract metadata (date, provider, ticker) from the filename using ticker_map.
    """
    # Extract the date (first 8 digits in the filename)
    date_match = re.match(r"(\d{8})", filename)
    if not date_match:
        return None, None, None, None, None
    date_str = date_match.group(1)
    date = datetime.strptime(date_str, "%Y%m%d")

    # Look for the ticker in the filename
    for ticker in ticker_map.keys():
        ticker_pattern = f"_{ticker}_"  # Ensure ticker is surrounded by underscores
        if ticker_pattern in filename:
            # Extract the portion between date and ticker as the provider
            provider_section = filename.split(f"{date_str}_")[1].split(f"_{ticker}_")[0]
            provider = provider_section.replace('_', ' ')  # Replace underscores with spaces
            # Get company name and industry from the ticker_map
            company_name = ticker_map[ticker]['Company Name']
            industry = ticker_map[ticker]['Industry']
            return date, provider, ticker, company_name, industry

    # If no ticker is found, return None for ticker-related fields
    return date, None, None, None, None

# Define function to clean the extracted text
def clean_text(text):
    # Remove email addresses and phone numbers
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"\(\d{1,3}\)[-\s]?\d{3}[-.\s]?\d{4}", "", text)
    text = re.sub(r"This document is being provided for the exclusive use of OSKAR ROESKE at HOCHSCHULE FUER TECH & WIRT BERLIN", "", text)
    # Replace multiple newlines and tabs with a single space
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Define function to clean text by removing content after ending keywords
def remove_after_ending_keyword(text, ending_keyword):
    if ending_keyword:
        # Find the position of the ending keyword in the text
        end_pos = text.lower().find(ending_keyword.lower())
        if end_pos != -1:
            # Return text up to the ending keyword
            return text[:end_pos]
    return text

# Define function to extract text from a single PDF
def extract_text_from_pdf(file_path):
    """Extract text from a single PDF file."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure the page has text
                text += page_text + "\n"
    return text

# Define function to process all PDFs in the directory and store data in DataFrame
def extract_text_from_all_pdfs_to_dataframe(directory_path, provider_info, ticker_map):
    """Extract text from all PDFs in a specified directory and store in a DataFrame with unique ID."""
    data = []
    id_counter = 1  # Initialize an ID counter
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):  # Process only PDF files
            file_path = os.path.join(directory_path, filename)
            
            # Extract metadata from filename
            date, provider, ticker, company_name, industry = extract_metadata(filename,ticker_map=ticker_map)
            
            # Get the ending keyword for the provider, if available
            ending_keyword = provider_info.loc[
                provider_info['file_name'] == provider, 'Ending'
            ].values
            ending_keyword = ending_keyword[0] if len(ending_keyword) > 0 else None
            
            # Extract and clean text from PDF
            text = extract_text_from_pdf(file_path)
            text = clean_text(text)  # Apply the clean_text function
            cleaned_text = remove_after_ending_keyword(text, ending_keyword)
            
            # Add extracted data to the list
            data.append({
                "ID": id_counter,  # Unique ID
                "filename": filename,
                "date": date,
                "provider": provider,
                "ticker": ticker,
                "company_name": company_name,
                "industry": industry,
                "text": cleaned_text
            })
            id_counter += 1  # Increment the ID counter for the next row

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    return df

# Directory path
pdf_directory = "../preprocessing/reports"  # Replace with your actual folder path

# Run the function and store results in a DataFrame
pdf_df = extract_text_from_all_pdfs_to_dataframe(pdf_directory, provider_info, ticker_map)

# Display the resulting DataFrame to confirm
pdf_df.head()


Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text
0,1,20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...,2020-10-01,Pivotal Research Group,AMZN,Amazon.com Inc.,Consumer Discretionary,PIVOTAL U.S. Equity Research Internet & Media ...
1,2,20190730_FinTrust_Investment_Advisors_AMZN_Fin...,2019-07-30,FinTrust Investment Advisors,AMZN,Amazon.com Inc.,Consumer Discretionary,Special Report - Amazon.com Inc. (AMZN) FinTru...
2,3,20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...,2022-04-27,Wells Fargo,AAPL,Apple Inc.,Technology,". Equity Research Company Update — April 20, 2..."
3,4,20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf,2015-07-28,BGC Partners,AMZN,Amazon.com Inc.,Consumer Discretionary,"July 21, 2015 Colin W. Gillis Technology Analy..."
4,5,20180206_BTIG_AAPL_Apple-_Inc..pdf,2018-02-06,BTIG,AAPL,Apple Inc.,Technology,"2 0-May-2010, N/A, N/A, N/A, N/A, N/A, N/A, N/..."


In [120]:
import pdfplumber

def extract_logical_text(pdf_path):
    logical_text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Get all characters
            chars = page.chars
            
            # Sort characters by their vertical positions (y-coordinate)
            sorted_chars = sorted(chars, key=lambda c: c["top"])
            
            # Group characters into logical blocks based on proximity
            lines = []
            current_line = []
            last_y = None
            
            for char in sorted_chars:
                if last_y is not None and abs(char["top"] - last_y) > 5:
                    # Start a new line if y-coordinates differ significantly
                    lines.append("".join([c["text"] for c in current_line]))
                    current_line = []
                
                current_line.append(char)
                last_y = char["top"]
            
            if current_line:
                lines.append("".join([c["text"] for c in current_line]))
            
            logical_text += "\n".join(lines) + "\n\n"
    
    return logical_text

# Path to your PDF
pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140502_BGC_Partners_AMZN_BGC_AMZN_0425.2014.pdf"

# Extract logically grouped text
logical_text = extract_logical_text(pdf_path)

print("Logically Grouped Text:")
print(logical_text)


Logically Grouped Text:
 April 25, 2014 
 
  Colin W. Gillis 
   Technology Analyst / Director   
  cgillis@bgcpartners.com   646.346.7052  cell: 917.921.8616  BGC Financial L.P.   
    Technology Research
  
  
Amazon.com Inc.  HOLD (AMZN, $308.04) 
 
AMAZON EARNINGS REVIEW: REVENUE GROWTH SERVED WITH MINISCULE PROFITS.  
  
PRICE PERFORMANCE CHART  Haiku: The stock is trading, as if investor patience, has come to an end. 
Amazon reported March quarter results in line with expectations, and provided a June quarter 
outlook with respectable revenue guidance but disappointing operating income. The stock has 
traded down 9% on the results, building on the recent downward trend that has Amazon’s stock 
approximately $100 below its all-time high on January 22, 2014. The quarter provided metrics 
that support both the positive and negative views on the company. On the positive side, the 
company reaccelerated revenue growth to 23% YoY, and issued June quarter guidance that could 
show reven

In [119]:
import pdfplumber
def extract_text_without_tables(pdf_path):
    extracted_text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract the main text from the page
            text = page.extract_text() or ""  # Use an empty string if no text is found
            
            # Extract tables and remove their content from the text
            for table in page.extract_tables():
                # Safely handle None values in the table
                table_data = "\n".join(
                    ["\t".join(cell if cell is not None else "" for cell in row) for row in table]
                )
                if table_data in text:
                    text = text.replace(table_data, "")  # Remove table data from text
            
            extracted_text += text + "\n\n"  # Append cleaned text
    
    return extracted_text

# Path to your PDF
pdf_path = "/Users/oskarroeske/Masterthesis/preprocessing/reports/20140502_BGC_Partners_AMZN_BGC_AMZN_0425.2014.pdf"  # Replace with your actual file path

# Extract cleaned text
cleaned_text = extract_text_without_tables(pdf_path)

# Output the result
print("Cleaned Text:")
print(cleaned_text)


Cleaned Text:
April 25, 2014
Colin W. Gillis
Technology Analyst / Director
cgillis@bgcpartners.com
646.346.7052 cell: 917.921.8616
BGC Financial L.P.
Technology Research
Amazon.com Inc. HOLD (AMZN, $308.04)
AMAZON EARNINGS REVIEW: REVENUE GROWTH SERVED WITH MINISCULE PROFITS.
PRICE PERFORMANCE CHART
Haiku: The stock is trading, as if investor patience, has come to an end.
Amazon reported March quarter results in line with expectations, and provided a June quarter
outlook with respectable revenue guidance but disappointing operating income. The stock has
traded down 9% on the results, building on the recent downward trend that has Amazon’s stock
approximately $100 below its all-time high on January 22, 2014. The quarter provided metrics
that support both the positive and negative views on the company. On the positive side, the
company reaccelerated revenue growth to 23% YoY, and issued June quarter guidance that could
show revenue growth up to 26% YoY. Countering the top line growth are

In [110]:
import re

# Define a mapping for providers and their patterns
patterns = {
    "BGC Partners": {
        "primary_price_pattern": r"Price Target \(\$\) (\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) \(\w+,"
    },
    "Needham": {
        "primary_price_pattern": r"Price Target: \$(\d+(\,\d+)?)",
        "secondary_price_pattern": r"PRICE TARGET: \$(\d+(\,\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b",
        "secondary_rating_pattern": r"Rating (\w+)"
    },
    "BTIG": {
        "primary_price_pattern": r"12 month target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b"
    },
    "Wells Fargo": {
        "primary_price_pattern": r"/\$(\d+(\.\d+)?)",
        "secondary_price_pattern": r"Price Target\/Prior: \$(\d+(\.\d+)?)",  
        "primary_rating_pattern": r"([A-Za-z]+)/\$",
        "secondary_rating_pattern": r"Rating (\w+)"
    },
    "Barclays": {
        "primary_price_pattern": r"Price Target USD (\d+(\.\d+)?)",
        "primary_rating_pattern": r"Stock Rating ([A-Za-z]+)"
    },
    "JP Morgan": {
        "primary_price_pattern": r"Price Target \([A-Za-z0-9]+\): \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) [A-Za-z]+"
    },
    "Brean Capital LLC": {
        "primary_price_pattern": r"PT: \$ (\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) PT:\$"
    },
    "Hilliard Lyons": {
        "primary_price_pattern": r"Price Target (\d+(\.\d+)?)",
        "primary_rating_pattern": r"-- ([A-Za-z]+) --"
    },
    "Alliance Global Partners": {
        "primary_price_pattern": r"Price Target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+) \(Ticker:"
    },
    "Mizuho Securities": {
        "primary_price_pattern": r"Price Target \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating ([A-Za-z]+)"
    },
    "Gilford Securities": {
        "primary_price_pattern": None, 
        "primary_rating_pattern": r"BUY OR Rated: ([A-Za-z]+)"
    },
    "Deutsche Bank": {
        "primary_price_pattern": r"Price Target \(USD\) (\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating ([A-Za-z]+)"
    },
    "Pivotal Research Group": {
        "primary_price_pattern": r"Target Price: \$(\d+(,\d+)?)",
        "primary_rating_pattern": r"RATING: ([A-Za-z]+)"
    },
    "Spartan Capital Securities LLC": {
        "primary_price_pattern": r"T \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"([A-Za-z]+)"
    },
    "Cascend Securities -Historical-": {
        "primary_price_pattern": r"Price target: \$(\d+(\.\d+)?)",
        "primary_rating_pattern": r"Rating: ([A-Za-z]+)"
    },
    "Phillip Securities": {
        "primary_price_pattern": r"TARGET PRICE USD (\d+(\.\d+)?)",
        "primary_rating_pattern": r"\b(BUY|HOLD|SELL|OVERWEIGHT|UNDERPERFORM)\b \("
    },
    "FinTrust Investment Advisors": {
        "primary_price_pattern": r"Target Price: \$(\d+(,\d+)?)",
        "primary_rating_pattern": r"Fintrust Rating: ([A-Za-z]+)"
    }
}


def extract_target_and_rating(provider, text):
 
    if provider not in patterns:
        raise ValueError(f"No patterns defined for provider: {provider}")

    primary_rating_pattern = patterns[provider].get("primary_rating_pattern")
    secondary_rating_pattern = patterns[provider].get("secondary_rating_pattern")
    primary_price_pattern = patterns[provider].get("primary_price_pattern")
    secondary_price_pattern = patterns[provider].get("secondary_price_pattern")

    # Extract rating
    rating = None
    rating_match = re.search(primary_rating_pattern, text)
    if rating_match:
        rating = rating_match.group(1)
    if rating == None:
        if secondary_rating_pattern:
            rating_match = re.search(secondary_rating_pattern,text)
            if rating_match:
                rating = rating_match.group(1)

    # Try the primary pattern
    price = None
    match = re.search(primary_price_pattern, text)
    if match:
        price = match.group(1)  # Return the first capture group

    # If no match and a secondary pattern exists, try the secondary pattern
    if price == None:
        if secondary_price_pattern:
            match = re.search(secondary_price_pattern, text)
            if match:
                price = match.group(1)  # Return the first capture group

    return rating, price

# Define a function to apply to each row
def extract_info(row):
    provider = row['provider']
    text = row['text']
    rating, price = extract_target_and_rating(provider, text)
    return pd.Series({'target_price': price, 'rating': rating})

In [111]:
# Apply the function to the dataframe
pdf_df[['target_price', 'rating']] = pdf_df.apply(extract_info, axis=1)

In [112]:
pdf_df

Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text,target_price,rating
0,1,20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...,2020-10-01,Pivotal Research Group,AMZN,Amazon.com Inc.,Consumer Discretionary,PIVOTAL U.S. Equity Research Internet & Media ...,4500.0,BUY
1,2,20190730_FinTrust_Investment_Advisors_AMZN_Fin...,2019-07-30,FinTrust Investment Advisors,AMZN,Amazon.com Inc.,Consumer Discretionary,Special Report - Amazon.com Inc. (AMZN) FinTru...,1611.0,HOLD
2,3,20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...,2022-04-27,Wells Fargo,AAPL,Apple Inc.,Technology,". Equity Research Company Update — April 20, 2...",205.0,Overweight
3,4,20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf,2015-07-28,BGC Partners,AMZN,Amazon.com Inc.,Consumer Discretionary,"July 21, 2015 Colin W. Gillis Technology Analy...",475.0,HOLD
4,5,20180206_BTIG_AAPL_Apple-_Inc..pdf,2018-02-06,BTIG,AAPL,Apple Inc.,Technology,"2 0-May-2010, N/A, N/A, N/A, N/A, N/A, N/A, N/...",198.0,BUY
5,6,20210820_Phillip_Securities_AAPL_Apple_Inc_Sup...,2021-08-20,Phillip Securities,AAPL,Apple Inc.,Technology,"Apple Inc Super-cycle in price, margins and pr...",187.0,BUY
6,7,20220803_Phillip_Securities_AAPL_Apple_Inc_Man...,2022-08-03,Phillip Securities,AAPL,Apple Inc.,Technology,Apple Inc. Managing supply chain and FX headwi...,198.0,BUY
7,8,20150716_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...,2015-07-16,JP Morgan,AAPL,Apple Inc.,Technology,Global Equity Research 09 July 2015 Rod Hall's...,,Global
8,9,20171205_JP_Morgan_AMZN_Holiday_eComm_Update-_...,2017-12-05,JP Morgan,AMZN,Amazon.com Inc.,Consumer Discretionary,This document is being provided for the exclus...,,This
9,10,20190731_Cascend_Securities_-Historical-_AAPL_...,2019-07-31,Cascend Securities -Historical-,AAPL,Apple Inc.,Technology,"U.S. Equities Technology – Smartphones Apple, ...",240.0,BUY


In [103]:
pdf_df.to_csv("list_of_texts.csv")

In [17]:
import re
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """
    Clean the extracted text by removing headers, tables, disclaimers, and other non-sentence parts.
    """
    # Remove email addresses and phone numbers
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub("This document is being provided for the exclusive use of OSKAR ROESKE at HOCHSCHULE FUER TECH & WIRT BERLIN","",text)
    text = re.sub(r"\(\d{1,3}\)[-\s]?\d{3}[-\s]?\d{4}", "", text)
    
    # Replace multiple newlines and tabs with a single space for clean sentence extraction
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

def extract_sentences(text):
    """
    Process text to split into sentences.
    """
    # Clean the text first
    cleaned_text = clean_text(text)
    
    # Parse with spaCy
    doc = nlp(cleaned_text)
    
    # Extract sentences as individual strings
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 1]
    
    return sentences

# Extract sentences
sentences = extract_sentences(pdf_df["text"][0])

# Display the sentences
for i, sentence in enumerate(sentences[:200]):  # Show the first 10 sentences for inspection
    print(f"Sentence {i+1}: {sentence}")


Sentence 1: PIVOTAL U.S. Equity Research Internet & Media Pivotal Research Group AMZN:
Sentence 2: We Think the Street – Buy- and Sell-side are September 30, 2020 Looking at SOTP Wrong (Us Included).
Sentence 3: We are Reframing Why AMZN is the Best Mega-cap on a Multi-year Basis.
Sentence 4: BOTTOM LINE:
Sentence 5: We and almost every other investor we have spoken to over the years, has been MICHAEL LEVINE framing the AMZN SOTP valuation wrong.
Sentence 6: Amazon advertising is only ~5% of revenues, but is far greater 212-514-4682 contributor to overall non-AWS EBIT margins than the street recognizes.
Sentence 7: Said differently, If advertising was viewed as a stand-along business unit (we will explain why it shouldn’t be), it would represent well north of 300% of 2020E non-AWS EBIT.
Sentence 8: Amazon (AMZN) Based on our view that there is massive upside by 2024E, we increase our PT to a Street high of $4,500 based on an average of our 2024 SOTP and 30x our 2024 EBIT “power” of $91

In [7]:
print(pdf_df)
pdf_df.to_csv("list_of_texts.csv")

                                             filename  \
0   20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...   
1   20190730_FinTrust_Investment_Advisors_AMZN_Fin...   
2   20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...   
3     20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf   
4                  20180206_BTIG_AAPL_Apple-_Inc..pdf   
5   20210820_Phillip_Securities_AAPL_Apple_Inc_Sup...   
6   20220803_Phillip_Securities_AAPL_Apple_Inc_Man...   
7   20150716_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
8   20171205_JP_Morgan_AMZN_Holiday_eComm_Update-_...   
9   20190731_Cascend_Securities_-Historical-_AAPL_...   
10  20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...   
11  20161017_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
12  20160510_Phillip_Securities_AAPL_Apple_Inc._Th...   
13  20170804_Needham_AMZN_Mixed_Q1-_Investors_Shou...   
14   20141028_BGC_Partners_AAPL_BGC_AAPL_10212014.pdf   
15                 20190107_BTIG_AAPL_Apple-_Inc..pdf   
16   20160803_BGC_Partners_AAPL