In [2]:
import pandas as pd
import spacy
from tqdm import tqdm

In [3]:
# Load the AAPL frame used by the single-ticker spaCy pass below.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): `df_appl` (sic) is a different variable from the `df_aapl`
# frame loaded later in this notebook from AAPL_KBL.csv.
df_appl = pd.read_csv("Union_AAPL.csv")

In [25]:
# NOTE(review): hidden state. `df_tsla` is only assigned further down (the
# TSLA_KBL.csv load), and the `spacy_label` column shown in the captured output
# is only added by the later run_spacy_ner_on_df call. On a fresh
# Restart & Run All this cell raises NameError; move it below those cells.
df_tsla.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79506 entries, 0 to 79505
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           79506 non-null  int64 
 1   date            79506 non-null  object
 2   article_title   79506 non-null  object
 3   stock_symbol    79506 non-null  object
 4   fuzzy_80_label  79506 non-null  object
 5   hard_label      79506 non-null  object
 6   spacy_label     79506 non-null  object
dtypes: int64(1), object(6)
memory usage: 4.2+ MB


In [7]:
# --- Load spaCy model ---
nlp = spacy.load("en_core_web_sm", disable=["textcat"])

# --- Define stock-specific keywords ---
# An entity counts as a hit only when its text equals one of these strings
# exactly (case-sensitive, whole entity span).
spacy_keywords = {
    "AAPL": [
        "AAPL", "Apple", "Apple Inc", "Steve Jobs", "Tim Cook", "Steve Wozniak",
        "iPhone", "iPad", "Mac", "Apple Watch", "Apple Music", "Apple TV"
    ]
}

ENTITY_TYPES = {"ORG", "PERSON", "PRODUCT"}
TEXT_COLUMN = "article_title"
INDEX_COLUMN = "index"
CHUNKSIZE = 10_000

# --- Run on the in-memory df_appl ---
stock = "AAPL"
print(f"\n🔍 Running spaCy NER for {stock}")
matches = set()

# Set membership keeps the per-entity keyword test O(1).
stock_keywords = frozenset(spacy_keywords[stock])

# Process in chunks so the progress bar advances and memory stays bounded.
for start in tqdm(range(0, len(df_appl), CHUNKSIZE), desc=stock):
    # iloc clamps the slice end, so the final short chunk needs no special case.
    # Rows missing either the id or the title are skipped.
    chunk = df_appl.iloc[start:start + CHUNKSIZE][[INDEX_COLUMN, TEXT_COLUMN]].dropna()

    # nlp.pipe is a generator: consume it lazily instead of materialising every
    # Doc of the chunk in a list, so only one batch of Docs is alive at a time.
    docs = nlp.pipe(chunk[TEXT_COLUMN].tolist(), batch_size=128)

    for doc, idx in zip(docs, chunk[INDEX_COLUMN]):
        # One qualifying entity is enough; any() stops at the first hit.
        if any(ent.label_ in ENTITY_TYPES and ent.text in stock_keywords for ent in doc.ents):
            matches.add(idx)

# Assign labels to df_appl (vectorised; rows whose id never matched get "other").
df_appl["spacy_label"] = df_appl[INDEX_COLUMN].isin(matches).map({True: stock, False: "other"})



🔍 Running spaCy NER for AAPL


AAPL: 100%|█████████████████████████████████████| 87/87 [32:14<00:00, 22.24s/it]


In [10]:
# Preview the first rows to eyeball the freshly added `spacy_label` column
# next to the existing label columns.
df_appl.head()

Unnamed: 0,index,date,article_title,stock_symbol,fuzzy_80_label,hard_label,spacy_label
0,13,2023-11-21 00:00:00+00:00,A Makes Bullish Cross Above Critical Moving Av...,other,AAPL,other,other
1,36,2023-09-25 00:00:00+00:00,"After Hours Most Active for Sep 25, 2023 : OKE...",other,AAPL,AAPL,other
2,64,2023-08-15 00:00:00+00:00,"Markets Remain Sold; H&R Block (HRB), Agilent ...",other,AAPL,other,other
3,73,2023-08-11 00:00:00+00:00,Agilent Technologies (A) to Post Q3 Earnings: ...,other,AAPL,other,other
4,92,2023-06-02 00:00:00+00:00,"After Hours Most Active for Jun 2, 2023 : UPWK...",other,AAPL,AAPL,other


In [11]:
# Persist the labelled AAPL frame next to the notebook (relative path).
# index=False keeps the pandas row index out of the file; the data's own
# `index` column is still written.
df_appl.to_csv("AAPL_all_labels.csv", index=False)

## Generic Function

In [9]:
import pandas as pd
import spacy
from tqdm import tqdm

# Shared spaCy pipeline: loaded a single time and reused by every NER call.
nlp = spacy.load("en_core_web_sm")


# Per-ticker lookup table of strings that identify the company.
# An entity is a hit only if its text equals one of these strings exactly.
spacy_keywords = {
    "AAPL": [
        "AAPL", "Apple", "Apple Inc",
        "Steve Jobs", "Tim Cook", "Steve Wozniak",
        "iPhone", "iPad", "Mac",
        "Apple Watch", "Apple Music", "Apple TV", "iOS",
    ],
    "MSFT": [
        "MSFT", "Microsoft",
        "Satya Nadella", "Bill Gates", "Paul Allen",
        "Xbox", "LinkedIn", "Visual Studio",
        "OneDrive", "Skype", "GitHub", "SharePoint",
    ],
    "GOOGL": [
        "GOOGL", "GOOG", "Google", "Alphabet",
        "Larry Page", "Sergey Brin", "Sundar Pichai",
        "YouTube", "Gmail", "Android",
        "Chrome", "Google Maps", "Google Drive",
    ],
    "AMZN": [
        "AMZN", "Amazon", "Amazon.com",
        "Jeff Bezos", "Bezos",
        "Alexa", "Kindle", "Twitch",
        "Prime Video", "Audible", "MGM Studios", "Fire Tablet",
    ],
    "NVDA": [
        "NVDA", "Nvidia",
        "Jensen Huang", "Bill Dally",
        "GeForce", "CUDA", "RTX",
        "NVIDIA Drive", "NVIDIA Jetson", "Blackwell",
    ],
    "META": [
        "META", "Meta", "Meta Platforms", "Facebook",
        "Mark Zuckerberg", "Zuckerberg",
        "Instagram", "Messenger", "WhatsApp",
        "Threads", "Meta Quest",
    ],
    "TSLA": [
        "TSLA", "Tesla",
        "Elon Musk", "Musk",
        "Model 3", "Model S", "Model X",
        "Cybertruck", "Tesla Semi", "Roadster",
        "GigaFactory", "Supercharger",
    ],
}

# Only entities carrying one of these spaCy labels are compared to the keywords.
ENTITY_TYPES = {"ORG", "PERSON", "PRODUCT"}

def run_spacy_ner_on_df(df, stock, text_column="article_title", index_column="index", chunksize=500_000):
    """
    Label each row of a stock-specific DataFrame using spaCy NER keyword hits.

    A row is labelled with `stock` when spaCy finds at least one entity of a
    type listed in ENTITY_TYPES whose text is exactly equal (case-sensitive)
    to one of `spacy_keywords[stock]`; every other row is labelled "other".

    Parameters:
        df (pd.DataFrame): Input DataFrame with article titles. It is modified
            in place: a 'spacy_label' column is added (or overwritten).
        stock (str): Stock ticker (e.g., "AAPL", "TSLA"); must be a key of
            `spacy_keywords`.
        text_column (str): Column name for text.
        index_column (str): Column holding the row identifier. Labels are
            assigned by identifier value, so rows sharing an identifier share
            a label.
        chunksize (int): Number of rows handed to spaCy per progress-bar step.
            Must be >= 1.

    Returns:
        pd.DataFrame: The same `df` object, now carrying 'spacy_label'.

    Raises:
        ValueError: If `chunksize` is smaller than 1.
        KeyError: If `stock` has no entry in `spacy_keywords`.
    """
    # A negative step would make range() empty and silently label every row
    # "other"; fail loudly instead.
    if chunksize < 1:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    print(f"\n🔍 Running spaCy NER for {stock}")
    # Set membership keeps the per-entity keyword test O(1).
    keywords = frozenset(spacy_keywords[stock])
    matches = set()

    for start in tqdm(range(0, len(df), chunksize), desc=f"{stock} NER"):
        # iloc clamps the slice end, so the last (short) chunk needs no min().
        # Rows missing either the identifier or the text are skipped.
        chunk = df.iloc[start:start + chunksize][[index_column, text_column]].dropna()

        # nlp.pipe is a generator: consume it lazily rather than building a
        # list, so only one batch of Doc objects is alive at a time.
        docs = nlp.pipe(chunk[text_column].tolist(), batch_size=128)

        for doc, idx in zip(docs, chunk[index_column]):
            # One qualifying entity is enough; any() stops at the first hit.
            if any(ent.label_ in ENTITY_TYPES and ent.text in keywords for ent in doc.ents):
                matches.add(idx)

    # Vectorised label assignment; rows dropped above (NaN id/text) can never
    # be in `matches` and therefore end up as "other".
    df["spacy_label"] = df[index_column].isin(matches).map({True: stock, False: "other"})
    return df


In [10]:
# Directory holding the per-ticker *_KBL.csv inputs. Kept in one place so the
# location can be changed without editing seven lines.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running elsewhere.
KBL_DIR = "/Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Auxiliary Data"

df_aapl = pd.read_csv(f"{KBL_DIR}/AAPL_KBL.csv")
df_msft = pd.read_csv(f"{KBL_DIR}/MSFT_KBL.csv")
df_googl = pd.read_csv(f"{KBL_DIR}/GOOGL_KBL.csv")
df_amzn = pd.read_csv(f"{KBL_DIR}/AMZN_KBL.csv")
df_nvda = pd.read_csv(f"{KBL_DIR}/NVDA_KBL.csv")
df_meta = pd.read_csv(f"{KBL_DIR}/META_KBL.csv")
df_tsla = pd.read_csv(f"{KBL_DIR}/TSLA_KBL.csv")

In [7]:
# Inspect the MSFT frame's columns, dtypes and non-null counts.
# NOTE(review): this cell's execution count (7) is lower than that of the
# loading cell above it (10), so the captured output may come from an earlier
# version of the data — re-run top to bottom to confirm the schema.
df_msft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473942 entries, 0 to 473941
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   index           473942 non-null  int64 
 1   date            473942 non-null  object
 2   article_title   473942 non-null  object
 3   stock_symbol    473942 non-null  object
 4   fuzzy_85_label  473942 non-null  object
 5   harder_label    473942 non-null  object
dtypes: int64(1), object(5)
memory usage: 21.7+ MB


In [13]:
# Add the 'spacy_label' column to every per-ticker frame, one ticker at a time.
# Each call prints its own header and progress bar, so the order below is the
# order of the output. AMZN and META are processed in smaller chunks.
df_aapl = run_spacy_ner_on_df(df_aapl, stock="AAPL", chunksize=10_000)
df_msft = run_spacy_ner_on_df(df_msft, stock="MSFT", chunksize=10_000)
df_googl = run_spacy_ner_on_df(df_googl, stock="GOOGL", chunksize=10_000)
df_amzn = run_spacy_ner_on_df(df_amzn, stock="AMZN", chunksize=5_000)
df_nvda = run_spacy_ner_on_df(df_nvda, stock="NVDA", chunksize=10_000)
df_meta = run_spacy_ner_on_df(df_meta, stock="META", chunksize=5_000)
df_tsla = run_spacy_ner_on_df(df_tsla, stock="TSLA", chunksize=10_000)


🔍 Running spaCy NER for AAPL


AAPL NER: 100%|█████████████████████████████████| 20/20 [06:51<00:00, 20.59s/it]



🔍 Running spaCy NER for MSFT


MSFT NER: 100%|█████████████████████████████████| 48/48 [15:37<00:00, 19.53s/it]



🔍 Running spaCy NER for GOOGL


GOOGL NER: 100%|████████████████████████████████| 10/10 [04:27<00:00, 26.77s/it]



🔍 Running spaCy NER for AMZN


AMZN NER: 100%|█████████████████████████████████| 16/16 [03:07<00:00, 11.73s/it]



🔍 Running spaCy NER for NVDA


NVDA NER: 100%|█████████████████████████████████| 12/12 [04:45<00:00, 23.75s/it]



🔍 Running spaCy NER for META


META NER: 100%|█████████████████████████████████| 15/15 [02:36<00:00, 10.42s/it]



🔍 Running spaCy NER for TSLA


TSLA NER: 100%|█████████████████████████████████| 10/10 [03:55<00:00, 23.51s/it]


In [14]:
# Write each labelled frame to "<TICKER>_complete_ms.csv" in the working
# directory. index=False keeps the pandas row index out of the files.
for ticker_symbol, labelled_df in (
    ("AAPL", df_aapl),
    ("MSFT", df_msft),
    ("GOOGL", df_googl),
    ("AMZN", df_amzn),
    ("NVDA", df_nvda),
    ("META", df_meta),
    ("TSLA", df_tsla),
):
    labelled_df.to_csv(f"{ticker_symbol}_complete_ms.csv", index=False)

In [39]:
# Debugging aid: show the kernel's working directory, i.e. the folder that the
# relative CSV paths used elsewhere in this notebook resolve against.
import os
print(os.getcwd())

/Users/williamnordansjo/DABEN/DABN01 - Masters' Thesis/Notebooks
