In [11]:
import pandas as pd

def load_large_csv_in_chunks(
    file_path: str,
    usecols: list[str] = None,
    filter_col: str = None,
    filter_values: list[str] = None,
    chunksize: int = 100_000,
    dropna_cols: list[str] = None
) -> pd.DataFrame:
    """
    Load a large CSV file chunk by chunk and return one concatenated DataFrame.

    Every chunk is cleaned and filtered before it is kept, so rows that are
    discarded never accumulate in memory.

    Parameters:
    - file_path (str): Path to the CSV file.
    - usecols (list[str], optional): Columns to load; all columns when omitted.
    - filter_col (str, optional): Column to apply filtering on.
    - filter_values (list[str], optional): Values to keep in filter_col.
      Filtering only happens when BOTH filter_col and filter_values are
      given and non-empty; otherwise every row is kept.
    - chunksize (int): Number of rows per chunk.
    - dropna_cols (list[str], optional): Drop rows with NaN in these columns.

    Returns:
    - pd.DataFrame: The kept rows of all chunks, with a fresh 0..n-1 index.

    Raises:
    - KeyError: If filter_col or an entry of dropna_cols is not a loaded column.
    """
    apply_filter = bool(filter_col and filter_values)
    chunks = []

    # The chunked reader keeps the file open while iterating. Using it as a
    # context manager guarantees the handle is released even when processing
    # a chunk raises (e.g. a KeyError from a misspelled column name).
    with pd.read_csv(file_path, usecols=usecols, chunksize=chunksize) as reader:
        for chunk_no, chunk in enumerate(reader, start=1):
            print(f"🔄 Processing chunk {chunk_no}")

            if dropna_cols:
                chunk = chunk.dropna(subset=dropna_cols)

            if apply_filter:
                chunk = chunk[chunk[filter_col].isin(filter_values)]

            chunks.append(chunk)

    df = pd.concat(chunks, ignore_index=True)
    print(f"✅ Loaded {len(df):,} rows into memory.")
    return df


In [2]:
# Absolute local path of the CSV holding the article titles.
file_path = "/Volumes/T7/External Downloads/nasdaq_titles_fuzzy_rdy.csv"

# Only these columns are read from disk.
# NOTE(review): later cells read an "index" column that is not listed here;
# presumably it is added by a reset_index() step not shown - confirm.
title_columns = ["date", "article_title", "article_title_clean", "stock_symbol"]

df = load_large_csv_in_chunks(
    file_path,
    usecols=title_columns,
    chunksize=100_000,
)

🔄 Processing chunk 1
🔄 Processing chunk 2
🔄 Processing chunk 3
🔄 Processing chunk 4
🔄 Processing chunk 5
🔄 Processing chunk 6
🔄 Processing chunk 7
🔄 Processing chunk 8
🔄 Processing chunk 9
🔄 Processing chunk 10
🔄 Processing chunk 11
🔄 Processing chunk 12
🔄 Processing chunk 13
🔄 Processing chunk 14
🔄 Processing chunk 15
🔄 Processing chunk 16
🔄 Processing chunk 17
🔄 Processing chunk 18
🔄 Processing chunk 19
🔄 Processing chunk 20
🔄 Processing chunk 21
🔄 Processing chunk 22
🔄 Processing chunk 23
🔄 Processing chunk 24
🔄 Processing chunk 25
🔄 Processing chunk 26
🔄 Processing chunk 27
🔄 Processing chunk 28
🔄 Processing chunk 29
🔄 Processing chunk 30
🔄 Processing chunk 31
🔄 Processing chunk 32
🔄 Processing chunk 33
🔄 Processing chunk 34
🔄 Processing chunk 35
🔄 Processing chunk 36
🔄 Processing chunk 37
🔄 Processing chunk 38
🔄 Processing chunk 39
🔄 Processing chunk 40
🔄 Processing chunk 41
🔄 Processing chunk 42
🔄 Processing chunk 43
🔄 Processing chunk 44
🔄 Processing chunk 45
🔄 Processing chunk 

In [15]:
# Print a summary of the loaded frame: row count, column names, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15549298 entries, 0 to 15549297
Data columns (total 6 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   level_0              int64 
 1   index                int64 
 2   date                 object
 3   article_title        object
 4   stock_symbol         object
 5   article_title_clean  object
dtypes: int64(2), object(4)
memory usage: 711.8+ MB


In [17]:
# Remove the "level_0" column reported by df.info(). errors="ignore" keeps the
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
df = df.drop(columns=["level_0"], errors="ignore")

NameError: name 'df_head' is not defined

In [24]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz

# --- Lower-case keywords for fuzzy matching, keyed by ticker ---
# Leading/trailing spaces inside a keyword (e.g. "mac ", " aws ") are part of
# the keyword and are compared like any other character.

# Basket nicknames shared by several tickers; spread into the lists below.
_fuzzy_mag7 = ["magnificent 7", "magnificent seven", "mag7"]
_fuzzy_faang = ["faang"]

fuzzy_keywords = {
    "AAPL": [
        "aapl", "apple", "apple inc",
        "steve jobs", "tim cook",
        "ipad", "iphone", "mac ", "ios ", "macintosh", "airpods",
        "apple watch", "apple tv", "apple card", "apple pay",
        "icloud", "app store", "apple music",
        "wozniak", "steve wozniak",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "MSFT": [
        "msft", "microsoft", "microsoft office",
        "windows", "azure", "xbox", "bing", "linkedin", "visual studio",
        "microsoft teams", "microsoft 365", "microsoft dynamics",
        "skype", "onedrive", "github", "sharepoint",
        "microsoft viva", "viva engage",
        "satya nadella", "bill gates", "paul allen",
        *_fuzzy_mag7,
    ],
    "GOOGL": [
        "googl", "goog", "google", "alphabet",
        "youtube", "gmail", "android", "chrome",
        "google maps", "google cloud", "google drive", "abc.xyz",
        "larry page", "sergey brin", "sundar pichai", "ruth porat",
        "hennessy", "ashkenazi",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "AMZN": [
        "amzn", "amazon", "amazon.com", " aws ",
        "alexa", "kindle", "amazon echo", "amazon prime", "ec2",
        "prime video", "twitch", "audible",
        "metro goldwyn mayer", "mgm studios", "fire tablet",
        "jeff bezos", "bezos",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "NVDA": [
        "nvda", "nvidia",
        "geforce", "geforce now", "cuda", "nvidia rtx", "gtc", "blackwell",
        "nvidia drive", "nvidia jetson", "nvidia isaac", "tegra",
        "quantum computing",
        "jensen huang", "bill dally",
        *_fuzzy_mag7,
    ],
    "META": [
        "meta ", "meta platforms",
        "facebook", "instagram", "whatsapp", "threads", "messenger",
        "zuckerberg", "mark zuckerberg",
        "meta quest", "metaverse",
        "the facebook inc",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "TSLA": [
        "tsla", "tesla", "elon musk", "musk",
        "model 3", "model s ", "model x ", "cybertruck",
        "powerwall", "megapack", "solar city", "tesla semi", "supercharger",
        "roadster", "solarcity",
        "electric vehicle", "gigafactory",
        *_fuzzy_mag7,
    ],
}



# --- Settings ---
FUZZY_THRESHOLD = 90  # minimum partial_ratio score (0-100) that counts as a match
CHUNKSIZE = 500_000  # rows per progress-bar step; adapt based on available RAM
TEXT_COLUMN = 'article_title_clean'
INDEX_COLUMN = 'index'
# The label column name follows the threshold, so FUZZY_THRESHOLD is the only
# value to edit when producing a run at a different cut-off.
LABEL_COLUMN = f"fuzzy_{FUZZY_THRESHOLD}_label"

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Label each stock independently and write one CSV per stock ---
for stock, keywords in fuzzy_keywords.items():
    print(f"\n🔍 Labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN]):
            # Skips missing titles (NaN/None) and any other non-string value,
            # which rapidfuzz could not score.
            if not isinstance(text, str):
                continue

            # score_cutoff makes partial_ratio return 0 for pairs below the
            # threshold, so rapidfuzz can bail out early. any() stops at the
            # first matching keyword, so a title is recorded at most once
            # per stock.
            if any(
                fuzz.partial_ratio(keyword, text, score_cutoff=FUZZY_THRESHOLD) >= FUZZY_THRESHOLD
                for keyword in keywords
            ):
                matched_ids.append(idx)

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        LABEL_COLUMN: [stock] * len(matched_ids),
    })
    output_path = f"fuzzy_keywords_{FUZZY_THRESHOLD}_labels_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved fuzzy labels for {stock} to {output_path}")



🔍 Labeling for: AAPL


AAPL: 100%|█████████████████████████████████████| 32/32 [23:29<00:00, 44.05s/it]


✅ Saved fuzzy labels for AAPL to fuzzy_keywords_90_labels_AAPL.csv

🔍 Labeling for: MSFT


MSFT: 100%|█████████████████████████████████████| 32/32 [24:55<00:00, 46.75s/it]


✅ Saved fuzzy labels for MSFT to fuzzy_keywords_90_labels_MSFT.csv

🔍 Labeling for: GOOGL


GOOGL: 100%|████████████████████████████████████| 32/32 [18:09<00:00, 34.04s/it]


✅ Saved fuzzy labels for GOOGL to fuzzy_keywords_90_labels_GOOGL.csv

🔍 Labeling for: AMZN


AMZN: 100%|█████████████████████████████████████| 32/32 [16:36<00:00, 31.16s/it]


✅ Saved fuzzy labels for AMZN to fuzzy_keywords_90_labels_AMZN.csv

🔍 Labeling for: NVDA


NVDA: 100%|█████████████████████████████████████| 32/32 [14:26<00:00, 27.06s/it]


✅ Saved fuzzy labels for NVDA to fuzzy_keywords_90_labels_NVDA.csv

🔍 Labeling for: META


META: 100%|█████████████████████████████████████| 32/32 [13:01<00:00, 24.42s/it]


✅ Saved fuzzy labels for META to fuzzy_keywords_90_labels_META.csv

🔍 Labeling for: TSLA


TSLA: 100%|█████████████████████████████████████| 32/32 [16:29<00:00, 30.92s/it]


✅ Saved fuzzy labels for TSLA to fuzzy_keywords_90_labels_TSLA.csv


In [20]:
import pandas as pd
from tqdm import tqdm

# --- Case-sensitive keywords for exact substring matching, keyed by ticker ---
# Same vocabulary as the fuzzy dictionary, written with capitalization.
# NOTE(review): matching on these is case-sensitive, so e.g. "GigaFactory"
# will not match a title spelled "Gigafactory" - confirm the intended casing.

# Basket nicknames shared by several tickers; spread into the lists below.
_hard_mag7 = ["Magnificent 7", "Magnificent Seven", "MAG7"]
_hard_faang = ["FAANG"]

hard_keywords = {
    "AAPL": [
        "AAPL", "Apple", "Apple Inc",
        "Steve Jobs", "Tim Cook",
        "iPad", "iPhone", "Mac ", "iOS ", "Macintosh", "AirPods",
        "Apple Watch", "Apple TV", "Apple Card", "Apple Pay",
        "iCloud", "App Store", "Apple Music",
        "Wozniak", "Steve Wozniak",
        *_hard_mag7, *_hard_faang,
    ],
    "MSFT": [
        "MSFT", "Microsoft", "Microsoft Office",
        "Windows", "Azure", "Xbox", "Bing", "LinkedIn", "Visual Studio",
        "Microsoft Teams", "Microsoft 365", "Microsoft Dynamics",
        "Skype", "OneDrive", "GitHub", "SharePoint",
        "Microsoft Viva", "Viva Engage",
        "Satya Nadella", "Bill Gates", "Paul Allen",
        *_hard_mag7,
    ],
    "GOOGL": [
        "GOOGL", "GOOG", "Google", "Alphabet",
        "YouTube", "Gmail", "Android", "Chrome",
        "Google Maps", "Google Cloud", "Google Drive", "abc.xyz",
        "Larry Page", "Sergey Brin", "Sundar Pichai", "Ruth Porat",
        "Hennessy", "Ashkenazi",
        *_hard_mag7, *_hard_faang,
    ],
    "AMZN": [
        "AMZN", "Amazon", "Amazon.com", " AWS ",
        "Alexa", "Kindle", "Amazon Echo", "Amazon Prime", "EC2",
        "Prime Video", "Twitch", "Audible",
        "Metro Goldwyn Mayer", "MGM Studios", "Fire Tablet",
        "Jeff Bezos", "Bezos",
        *_hard_mag7, *_hard_faang,
    ],
    "NVDA": [
        "NVDA", "NVIDIA", "Nvidia",
        "GeForce", "GeForce NOW", "CUDA", "NVIDIA RTX", "GTC", "Blackwell",
        "NVIDIA DRIVE", "NVIDIA Jetson", "NVIDIA Isaac", "Tegra",
        "Quantum Computing",
        "Jensen Huang", "Bill Dally",
        *_hard_mag7,
    ],
    "META": [
        "META", "Meta ", "Meta Platforms",
        "Facebook", "Instagram", "WhatsApp", "Threads", "Messenger",
        "Zuckerberg", "Mark Zuckerberg",
        "Meta Quest", "Metaverse",
        "The Facebook Inc", "The Facebook", "facebook",
        *_hard_mag7, *_hard_faang,
    ],
    "TSLA": [
        "TSLA", "Tesla", "Elon Musk", "Musk",
        "Model 3", "Model S", "Model X", "Cybertruck",
        "Powerwall", "Megapack", "Solar City", "Tesla Semi", "Supercharger",
        "Roadster", "SolarCity",
        "EV ", "Electric Vehicle", "GigaFactory",
        *_hard_mag7,
    ],
}



# --- Settings ---
CHUNKSIZE = 500_000  # rows per progress-bar step
TEXT_COLUMN = 'article_title'
INDEX_COLUMN = 'index'

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Loop through each stock and match keywords ---
for stock, keywords in hard_keywords.items():
    print(f"\n🔍 Harder-labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        # A title matches when it contains any keyword as an exact,
        # case-sensitive substring. The isinstance check skips missing titles
        # (NaN/None) and any other non-string value, on which `in` would raise.
        matched_ids.extend(
            idx
            for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN])
            if isinstance(text, str) and any(keyword in text for keyword in keywords)
        )

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        "harder_label": [stock] * len(matched_ids),
    })
    output_path = f"harder_label_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved hard keyword labels for {stock} to {output_path}")



🔍 Harder-labeling for: AAPL


AAPL: 100%|█████████████████████████████████████| 32/32 [01:35<00:00,  3.00s/it]


✅ Saved hard keyword labels for AAPL to harder_label_AAPL.csv

🔍 Harder-labeling for: MSFT


MSFT: 100%|█████████████████████████████████████| 32/32 [01:14<00:00,  2.34s/it]


✅ Saved hard keyword labels for MSFT to harder_label_MSFT.csv

🔍 Harder-labeling for: GOOGL


GOOGL: 100%|████████████████████████████████████| 32/32 [01:20<00:00,  2.51s/it]


✅ Saved hard keyword labels for GOOGL to harder_label_GOOGL.csv

🔍 Harder-labeling for: AMZN


AMZN: 100%|█████████████████████████████████████| 32/32 [01:15<00:00,  2.35s/it]


✅ Saved hard keyword labels for AMZN to harder_label_AMZN.csv

🔍 Harder-labeling for: NVDA


NVDA: 100%|█████████████████████████████████████| 32/32 [01:04<00:00,  2.02s/it]


✅ Saved hard keyword labels for NVDA to harder_label_NVDA.csv

🔍 Harder-labeling for: META


META: 100%|█████████████████████████████████████| 32/32 [01:05<00:00,  2.04s/it]


✅ Saved hard keyword labels for META to harder_label_META.csv

🔍 Harder-labeling for: TSLA


TSLA: 100%|█████████████████████████████████████| 32/32 [01:14<00:00,  2.33s/it]


✅ Saved hard keyword labels for TSLA to harder_label_TSLA.csv


In [29]:
import pandas as pd
from tqdm import tqdm

# --- Case-sensitive keywords for exact substring matching (NVDA only) ---
hard_keywords = {
    "NVDA": [
        "NVDA", "NVIDIA", "Nvidia",
        "GeForce", "GeForce NOW", "CUDA", "NVIDIA RTX", "GTC", "Blackwell",
        "NVIDIA DRIVE", "NVIDIA Jetson", "NVIDIA Isaac", "Tegra",
        "Quantum Computing",
        "Jensen Huang", "Bill Dally",
        # basket nicknames
        "Magnificent 7", "Magnificent Seven", "MAG7",
    ],
}

# --- Settings ---
CHUNKSIZE = 500_000  # rows per progress-bar step
TEXT_COLUMN = 'article_title'
INDEX_COLUMN = 'index'

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Loop through each stock and match keywords ---
for stock, keywords in hard_keywords.items():
    print(f"\n🔍 Harder-labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        # A title matches when it contains any keyword as an exact,
        # case-sensitive substring. The isinstance check skips missing titles
        # (NaN/None) and any other non-string value, on which `in` would raise.
        matched_ids.extend(
            idx
            for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN])
            if isinstance(text, str) and any(keyword in text for keyword in keywords)
        )

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        "harder_label": [stock] * len(matched_ids),
    })
    output_path = f"harder_fixed_label_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved hard keyword labels for {stock} to {output_path}")



🔍 Harder-labeling for: NVDA


NVDA: 100%|█████████████████████████████████████| 32/32 [05:01<00:00,  9.43s/it]


✅ Saved hard keyword labels for NVDA to harder_fixed_label_NVDA.csv


In [23]:
# Sanity check: is "tesla" among the TSLA fuzzy keywords currently defined in the kernel?
# NOTE(review): the recorded output is False although the fuzzy_keywords literals
# in this notebook list "tesla" - presumably this ran against an earlier
# definition; re-run after the defining cell to confirm.
"tesla" in fuzzy_keywords["TSLA"]


False

In [25]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz

# --- Lower-case keywords for fuzzy matching, keyed by ticker ---
# Leading/trailing spaces inside a keyword (e.g. "mac ", " aws ") are part of
# the keyword and are compared like any other character.

# Basket nicknames shared by several tickers; spread into the lists below.
_fuzzy_mag7 = ["magnificent 7", "magnificent seven", "mag7"]
_fuzzy_faang = ["faang"]

fuzzy_keywords = {
    "AAPL": [
        "aapl", "apple", "apple inc",
        "steve jobs", "tim cook",
        "ipad", "iphone", "mac ", "ios ", "macintosh", "airpods",
        "apple watch", "apple tv", "apple card", "apple pay",
        "icloud", "app store", "apple music",
        "wozniak", "steve wozniak",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "MSFT": [
        "msft", "microsoft", "microsoft office",
        "windows", "azure", "xbox", "bing", "linkedin", "visual studio",
        "microsoft teams", "microsoft 365", "microsoft dynamics",
        "skype", "onedrive", "github", "sharepoint",
        "microsoft viva", "viva engage",
        "satya nadella", "bill gates", "paul allen",
        *_fuzzy_mag7,
    ],
    "GOOGL": [
        "googl", "goog", "google", "alphabet",
        "youtube", "gmail", "android", "chrome",
        "google maps", "google cloud", "google drive", "abc.xyz",
        "larry page", "sergey brin", "sundar pichai", "ruth porat",
        "hennessy", "ashkenazi",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "AMZN": [
        "amzn", "amazon", "amazon.com", " aws ",
        "alexa", "kindle", "amazon echo", "amazon prime", "ec2",
        "prime video", "twitch", "audible",
        "metro goldwyn mayer", "mgm studios", "fire tablet",
        "jeff bezos", "bezos",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "NVDA": [
        "nvda", "nvidia",
        "geforce", "geforce now", "cuda", "nvidia rtx", "gtc", "blackwell",
        "nvidia drive", "nvidia jetson", "nvidia isaac", "tegra",
        "quantum computing",
        "jensen huang", "bill dally",
        *_fuzzy_mag7,
    ],
    "META": [
        "meta ", "meta platforms",
        "facebook", "instagram", "whatsapp", "threads", "messenger",
        "zuckerberg", "mark zuckerberg",
        "meta quest", "metaverse",
        "the facebook inc",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "TSLA": [
        "tsla", "tesla", "elon musk", "musk",
        "model 3", "model s ", "model x ", "cybertruck",
        "powerwall", "megapack", "solar city", "tesla semi", "supercharger",
        "roadster", "solarcity",
        "electric vehicle", "gigafactory",
        *_fuzzy_mag7,
    ],
}



# --- Settings ---
FUZZY_THRESHOLD = 80  # minimum partial_ratio score (0-100) that counts as a match
CHUNKSIZE = 500_000  # rows per progress-bar step; adapt based on available RAM
TEXT_COLUMN = 'article_title_clean'
INDEX_COLUMN = 'index'
# The label column name follows the threshold, so FUZZY_THRESHOLD is the only
# value to edit when producing a run at a different cut-off.
LABEL_COLUMN = f"fuzzy_{FUZZY_THRESHOLD}_label"

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Label each stock independently and write one CSV per stock ---
for stock, keywords in fuzzy_keywords.items():
    print(f"\n🔍 Labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN]):
            # Skips missing titles (NaN/None) and any other non-string value,
            # which rapidfuzz could not score.
            if not isinstance(text, str):
                continue

            # score_cutoff makes partial_ratio return 0 for pairs below the
            # threshold, so rapidfuzz can bail out early. any() stops at the
            # first matching keyword, so a title is recorded at most once
            # per stock.
            if any(
                fuzz.partial_ratio(keyword, text, score_cutoff=FUZZY_THRESHOLD) >= FUZZY_THRESHOLD
                for keyword in keywords
            ):
                matched_ids.append(idx)

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        LABEL_COLUMN: [stock] * len(matched_ids),
    })
    output_path = f"fuzzy_keywords_{FUZZY_THRESHOLD}_labels_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved fuzzy labels for {stock} to {output_path}")



🔍 Labeling for: AAPL


AAPL: 100%|█████████████████████████████████████| 32/32 [18:22<00:00, 34.46s/it]


✅ Saved fuzzy labels for AAPL to fuzzy_keywords_80_labels_AAPL.csv

🔍 Labeling for: MSFT


MSFT: 100%|█████████████████████████████████████| 32/32 [18:43<00:00, 35.11s/it]


✅ Saved fuzzy labels for MSFT to fuzzy_keywords_80_labels_MSFT.csv

🔍 Labeling for: GOOGL


GOOGL: 100%|████████████████████████████████████| 32/32 [20:45<00:00, 38.91s/it]


✅ Saved fuzzy labels for GOOGL to fuzzy_keywords_80_labels_GOOGL.csv

🔍 Labeling for: AMZN


AMZN: 100%|█████████████████████████████████████| 32/32 [18:32<00:00, 34.78s/it]


✅ Saved fuzzy labels for AMZN to fuzzy_keywords_80_labels_AMZN.csv

🔍 Labeling for: NVDA


NVDA: 100%|█████████████████████████████████████| 32/32 [14:17<00:00, 26.79s/it]


✅ Saved fuzzy labels for NVDA to fuzzy_keywords_80_labels_NVDA.csv

🔍 Labeling for: META


META: 100%|█████████████████████████████████████| 32/32 [12:44<00:00, 23.90s/it]


✅ Saved fuzzy labels for META to fuzzy_keywords_80_labels_META.csv

🔍 Labeling for: TSLA


TSLA: 100%|█████████████████████████████████████| 32/32 [15:58<00:00, 29.96s/it]


✅ Saved fuzzy labels for TSLA to fuzzy_keywords_80_labels_TSLA.csv


In [27]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz

# --- Lower-case keywords for fuzzy matching (AAPL only) ---
# The trailing space in "mac " and "ios " is part of the keyword and is
# compared like any other character.
fuzzy_keywords = {
    "AAPL": [
        "aapl", "apple", "apple inc",
        "steve jobs", "tim cook",
        "ipad", "iphone", "mac ", "ios ", "macintosh", "airpods",
        "apple watch", "apple tv", "apple card", "apple pay",
        "icloud", "app store", "apple music",
        "wozniak", "steve wozniak",
        # basket nicknames
        "magnificent 7", "magnificent seven", "mag7", "faang",
    ],
}

# --- Settings ---
FUZZY_THRESHOLD = 85  # minimum partial_ratio score (0-100) that counts as a match
CHUNKSIZE = 500_000  # rows per progress-bar step; adapt based on available RAM
TEXT_COLUMN = 'article_title_clean'
INDEX_COLUMN = 'index'
# The label column name follows the threshold, so FUZZY_THRESHOLD is the only
# value to edit when producing a run at a different cut-off.
LABEL_COLUMN = f"fuzzy_{FUZZY_THRESHOLD}_label"

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Label each stock independently and write one CSV per stock ---
for stock, keywords in fuzzy_keywords.items():
    print(f"\n🔍 Labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN]):
            # Skips missing titles (NaN/None) and any other non-string value,
            # which rapidfuzz could not score.
            if not isinstance(text, str):
                continue

            # score_cutoff makes partial_ratio return 0 for pairs below the
            # threshold, so rapidfuzz can bail out early. any() stops at the
            # first matching keyword, so a title is recorded at most once
            # per stock.
            if any(
                fuzz.partial_ratio(keyword, text, score_cutoff=FUZZY_THRESHOLD) >= FUZZY_THRESHOLD
                for keyword in keywords
            ):
                matched_ids.append(idx)

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        LABEL_COLUMN: [stock] * len(matched_ids),
    })
    output_path = f"fuzzy_keywords_{FUZZY_THRESHOLD}_labels_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved fuzzy labels for {stock} to {output_path}")



🔍 Labeling for: AAPL


AAPL: 100%|█████████████████████████████████████| 32/32 [22:20<00:00, 41.89s/it]


✅ Saved fuzzy labels for AAPL to fuzzy_keywords_85_labels_AAPL.csv


In [28]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz

# --- Lower-case keywords for fuzzy matching, keyed by ticker (no AAPL entry) ---
# Leading/trailing spaces inside a keyword (e.g. " aws ", "meta ") are part of
# the keyword and are compared like any other character.

# Basket nicknames shared by several tickers; spread into the lists below.
_fuzzy_mag7 = ["magnificent 7", "magnificent seven", "mag7"]
_fuzzy_faang = ["faang"]

fuzzy_keywords = {
    "MSFT": [
        "msft", "microsoft", "microsoft office",
        "windows", "azure", "xbox", "bing", "linkedin", "visual studio",
        "microsoft teams", "microsoft 365", "microsoft dynamics",
        "skype", "onedrive", "github", "sharepoint",
        "microsoft viva", "viva engage",
        "satya nadella", "bill gates", "paul allen",
        *_fuzzy_mag7,
    ],
    "GOOGL": [
        "googl", "goog", "google", "alphabet",
        "youtube", "gmail", "android", "chrome",
        "google maps", "google cloud", "google drive", "abc.xyz",
        "larry page", "sergey brin", "sundar pichai", "ruth porat",
        "hennessy", "ashkenazi",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "AMZN": [
        "amzn", "amazon", "amazon.com", " aws ",
        "alexa", "kindle", "amazon echo", "amazon prime", "ec2",
        "prime video", "twitch", "audible",
        "metro goldwyn mayer", "mgm studios", "fire tablet",
        "jeff bezos", "bezos",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "NVDA": [
        "nvda", "nvidia",
        "geforce", "geforce now", "cuda", "nvidia rtx", "gtc", "blackwell",
        "nvidia drive", "nvidia jetson", "nvidia isaac", "tegra",
        "quantum computing",
        "jensen huang", "bill dally",
        *_fuzzy_mag7,
    ],
    "META": [
        "meta ", "meta platforms",
        "facebook", "instagram", "whatsapp", "threads", "messenger",
        "zuckerberg", "mark zuckerberg",
        "meta quest", "metaverse",
        "the facebook inc",
        *_fuzzy_mag7, *_fuzzy_faang,
    ],
    "TSLA": [
        "tsla", "tesla", "elon musk", "musk",
        "model 3", "model s ", "model x ", "cybertruck",
        "powerwall", "megapack", "solar city", "tesla semi", "supercharger",
        "roadster", "solarcity",
        "electric vehicle", "gigafactory",
        *_fuzzy_mag7,
    ],
}



# --- Settings ---
FUZZY_THRESHOLD = 85  # minimum partial_ratio score (0-100) that counts as a match
CHUNKSIZE = 500_000  # rows per progress-bar step; adapt based on available RAM
TEXT_COLUMN = 'article_title_clean'
INDEX_COLUMN = 'index'
# The label column name follows the threshold, so FUZZY_THRESHOLD is the only
# value to edit when producing a run at a different cut-off.
LABEL_COLUMN = f"fuzzy_{FUZZY_THRESHOLD}_label"

# --- Make sure the columns this cell reads actually exist ---
missing_columns = [col for col in (INDEX_COLUMN, TEXT_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing required column(s): {missing_columns}")

# --- Label each stock independently and write one CSV per stock ---
for stock, keywords in fuzzy_keywords.items():
    print(f"\n🔍 Labeling for: {stock}")
    matched_ids = []  # INDEX_COLUMN values of the titles that matched this stock

    # The frame is walked in slices only so tqdm can report progress.
    for start in tqdm(range(0, len(df), CHUNKSIZE), desc=f"{stock}"):
        chunk = df.iloc[start:start + CHUNKSIZE]  # iloc clamps the final slice

        for idx, text in zip(chunk[INDEX_COLUMN], chunk[TEXT_COLUMN]):
            # Skips missing titles (NaN/None) and any other non-string value,
            # which rapidfuzz could not score.
            if not isinstance(text, str):
                continue

            # score_cutoff makes partial_ratio return 0 for pairs below the
            # threshold, so rapidfuzz can bail out early. any() stops at the
            # first matching keyword, so a title is recorded at most once
            # per stock.
            if any(
                fuzz.partial_ratio(keyword, text, score_cutoff=FUZZY_THRESHOLD) >= FUZZY_THRESHOLD
                for keyword in keywords
            ):
                matched_ids.append(idx)

    # Save to CSV. Both columns are always present, so a stock with zero
    # matches still yields a CSV with a header row.
    matches_df = pd.DataFrame({
        "index": matched_ids,
        LABEL_COLUMN: [stock] * len(matched_ids),
    })
    output_path = f"fuzzy_keywords_{FUZZY_THRESHOLD}_labels_{stock}.csv"
    matches_df.to_csv(output_path, index=False)
    print(f"✅ Saved fuzzy labels for {stock} to {output_path}")



🔍 Labeling for: MSFT


MSFT: 100%|█████████████████████████████████████| 32/32 [25:26<00:00, 47.69s/it]


✅ Saved fuzzy labels for MSFT to fuzzy_keywords_85_labels_MSFT.csv

🔍 Labeling for: GOOGL


GOOGL: 100%|████████████████████████████████████| 32/32 [19:38<00:00, 36.81s/it]


✅ Saved fuzzy labels for GOOGL to fuzzy_keywords_85_labels_GOOGL.csv

🔍 Labeling for: AMZN


AMZN: 100%|█████████████████████████████████████| 32/32 [17:09<00:00, 32.16s/it]


✅ Saved fuzzy labels for AMZN to fuzzy_keywords_85_labels_AMZN.csv

🔍 Labeling for: NVDA


NVDA: 100%|█████████████████████████████████████| 32/32 [14:45<00:00, 27.69s/it]


✅ Saved fuzzy labels for NVDA to fuzzy_keywords_85_labels_NVDA.csv

🔍 Labeling for: META


META: 100%|█████████████████████████████████████| 32/32 [14:37<00:00, 27.43s/it]


✅ Saved fuzzy labels for META to fuzzy_keywords_85_labels_META.csv

🔍 Labeling for: TSLA


TSLA: 100%|█████████████████████████████████████| 32/32 [17:12<00:00, 32.26s/it]


✅ Saved fuzzy labels for TSLA to fuzzy_keywords_85_labels_TSLA.csv
