In [18]:
!pip install pdfplumber spacy https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [23]:
import os
import re
import pdfplumber
import pandas as pd
import spacy
from datetime import datetime

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the provider info CSV for ending keywords
provider_info = pd.read_csv('provider.csv')

# Load the company info CSV for ticker validation and company metadata
company_info = pd.read_csv('company_info.csv')  # Replace with the actual path

# Drop duplicates in 'Ticker Symbol' to ensure each ticker has a unique entry
company_info = company_info.drop_duplicates(subset=['Ticker Symbol'])

# Create a dictionary to map ticker symbols to company name and industry
ticker_map = company_info.set_index('Ticker Symbol')[['Company Name', 'Industry']].to_dict(orient='index')

# Define function to extract metadata from filename
def extract_metadata(filename):
    # Extract date and possible provider + ticker section
    match = re.match(r"(\d{8})_([\w_]+)_([A-Z]+)_", filename)
    if match:
        date_str, provider_raw, possible_ticker = match.groups()
        # Convert date to datetime format
        date = datetime.strptime(date_str, "%Y%m%d")
        
        # Check if the possible_ticker is in the ticker map
        if possible_ticker in ticker_map:
            provider = provider_raw.replace('_', ' ')  # Provider name with spaces
            company_name = ticker_map[possible_ticker]['Company Name']
            industry = ticker_map[possible_ticker]['Industry']
            ticker = possible_ticker
            return date, provider, ticker, company_name, industry
        else:
            return date, provider_raw.replace('_', ' '), None, None, None  # If no ticker match, return None for ticker-related fields
    return None, None, None, None, None

# Define function to clean text by removing content after ending keywords
def remove_after_ending_keyword(text, ending_keyword):
    if ending_keyword:
        # Find the position of the ending keyword in the text
        end_pos = text.lower().find(ending_keyword.lower())
        if end_pos != -1:
            # Return text up to the ending keyword
            return text[:end_pos]
    return text

# Define function to extract text from a single PDF
def extract_text_from_pdf(file_path):
    """Extract text from a single PDF file."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Ensure the page has text
                text += page_text + "\n"
    return text

# Define function to process all PDFs in the directory and store data in DataFrame
def extract_text_from_all_pdfs_to_dataframe(directory_path, provider_info, ticker_map):
    """Extract text from all PDFs in a specified directory and store in a DataFrame with unique ID."""
    data = []
    id_counter = 1  # Initialize an ID counter
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):  # Process only PDF files
            file_path = os.path.join(directory_path, filename)
            
            # Extract metadata from filename
            date, provider, ticker, company_name, industry = extract_metadata(filename)
            
            # Get the ending keyword for the provider, if available
            ending_keyword = provider_info.loc[
                provider_info['file_name'] == provider, 'Ending'
            ].values
            ending_keyword = ending_keyword[0] if len(ending_keyword) > 0 else None
            
            # Extract and clean text from PDF
            text = extract_text_from_pdf(file_path)
            cleaned_text = remove_after_ending_keyword(text, ending_keyword)
            
            # Add extracted data to the list
            data.append({
                "ID": id_counter,  # Unique ID
                "filename": filename,
                "date": date,
                "provider": provider,
                "ticker": ticker,
                "company_name": company_name,
                "industry": industry,
                "text": cleaned_text
            })
            id_counter += 1  # Increment the ID counter for the next row

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(data)
    return df

# Directory path
pdf_directory = "../preprocessing/reports"  # Replace with your actual folder path

# Run the function and store results in a DataFrame
pdf_df = extract_text_from_all_pdfs_to_dataframe(pdf_directory, provider_info, ticker_map)

# Display the resulting DataFrame to confirm
pdf_df.head()




Unnamed: 0,ID,filename,date,provider,ticker,company_name,industry,text
0,1,20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...,2020-10-01,Pivotal Research Group,AMZN,Amazon.com Inc.,Consumer Discretionary,PIVOTAL\nU.S. Equity Research\nInternet & Medi...
1,2,20190730_FinTrust_Investment_Advisors_AMZN_Fin...,2019-07-30,FinTrust Investment Advisors,AMZN,Amazon.com Inc.,Consumer Discretionary,Special Report - Amazon.com Inc. (AMZN) FinTru...
2,3,20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...,2022-04-27,Wells Fargo,AAPL,Apple Inc.,Technology,This document is being provided for the exclus...
3,4,20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf,2015-07-28,BGC Partners AMZN BGC,AMZN,Amazon.com Inc.,Consumer Discretionary,"July 21, 2015\nColin W. Gillis\nTechnology Ana..."
4,5,20180206_BTIG_AAPL_Apple-_Inc..pdf,2018-02-06,BTIG,AAPL,Apple Inc.,Technology,"2 0-May-2010, N/A, N/A, N/A, N/A, N/A, N/A, N/..."


In [17]:
import re
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """
    Clean the extracted text by removing headers, tables, disclaimers, and other non-sentence parts.
    """
    # Remove specific known noise patterns (these may vary depending on your data)
    # Example: Remove 'North America Equity Research' or contact information patterns
    text = re.sub(r"North America Equity Research|[A-Z][a-z]+\s\w+ Equity Research|See page\s+\d+|This document.*?\n", "", text)
    
    # Remove email addresses and phone numbers
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"\(\d{1,3}\)[-\s]?\d{3}[-\s]?\d{4}", "", text)
    
    # Remove common footer elements or disclaimers that span multiple lines
    text = re.sub(r"(?s)Important Disclosures.*?(?=J\.P\. Morgan)", "", text)
    
    # Replace multiple newlines and tabs with a single space for clean sentence extraction
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

def extract_sentences(text):
    """
    Process text to split into sentences.
    """
    # Clean the text first
    cleaned_text = clean_text(text)
    
    # Parse with spaCy
    doc = nlp(cleaned_text)
    
    # Extract sentences as individual strings
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 1]
    
    return sentences

# Extract sentences
sentences = extract_sentences(pdf_df["text"][0])

# Display the sentences
for i, sentence in enumerate(sentences[:200]):  # Show the first 10 sentences for inspection
    print(f"Sentence {i+1}: {sentence}")


Sentence 1: PIVOTAL U.S. Equity Research Internet & Media Pivotal Research Group AMZN:
Sentence 2: We Think the Street – Buy- and Sell-side are September 30, 2020 Looking at SOTP Wrong (Us Included).
Sentence 3: We are Reframing Why AMZN is the Best Mega-cap on a Multi-year Basis.
Sentence 4: BOTTOM LINE:
Sentence 5: We and almost every other investor we have spoken to over the years, has been MICHAEL LEVINE framing the AMZN SOTP valuation wrong.
Sentence 6: Amazon advertising is only ~5% of revenues, but is far greater 212-514-4682 contributor to overall non-AWS EBIT margins than the street recognizes.
Sentence 7: Said differently, If advertising was viewed as a stand-along business unit (we will explain why it shouldn’t be), it would represent well north of 300% of 2020E non-AWS EBIT.
Sentence 8: Amazon (AMZN) Based on our view that there is massive upside by 2024E, we increase our PT to a Street high of $4,500 based on an average of our 2024 SOTP and 30x our 2024 EBIT “power” of $91

In [7]:
print(pdf_df)
pdf_df.to_csv("list_of_texts.csv")

                                             filename  \
0   20201001_Pivotal_Research_Group_AMZN_AMZN-_We_...   
1   20190730_FinTrust_Investment_Advisors_AMZN_Fin...   
2   20220427_Wells_Fargo_AAPL_AAPL-_F2Q22_Preview_...   
3     20150728_BGC_Partners_AMZN_BGC_AMZN_0721015.pdf   
4                  20180206_BTIG_AAPL_Apple-_Inc..pdf   
5   20210820_Phillip_Securities_AAPL_Apple_Inc_Sup...   
6   20220803_Phillip_Securities_AAPL_Apple_Inc_Man...   
7   20150716_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
8   20171205_JP_Morgan_AMZN_Holiday_eComm_Update-_...   
9   20190731_Cascend_Securities_-Historical-_AAPL_...   
10  20221230_JP_Morgan_AAPL_Apple-_Public_CAICT_Ch...   
11  20161017_JP_Morgan_AAPL_Rod_Hall-s_Daily_Downl...   
12  20160510_Phillip_Securities_AAPL_Apple_Inc._Th...   
13  20170804_Needham_AMZN_Mixed_Q1-_Investors_Shou...   
14   20141028_BGC_Partners_AAPL_BGC_AAPL_10212014.pdf   
15                 20190107_BTIG_AAPL_Apple-_Inc..pdf   
16   20160803_BGC_Partners_AAPL