In [1]:
import pandas as pd
import numpy as np
import re

In [40]:
df = pd.read_csv('/home/sacsresta/Documents/RESEARCH/Project/notebook/MERGED/merged_data_AAPL_from_2015-01-01.csv')
df

import pandas as pd
import re
from typing import Dict

def clean_alpaca_news(
    df: pd.DataFrame, 
    ticker: str = "AAPL",
    custom_keywords: list = None,
    false_positives: list = None
) -> pd.DataFrame:

    # Initialize configurations
    ticker_keywords = [
        ticker.lower(), 'iphone', 'ipad', 'ios', 'mac', 'apple watch',
        'app store', 'tim cook', 'icloud', 'itunes'
    ]
    if custom_keywords:
        ticker_keywords += [kw.lower() for kw in custom_keywords]

    false_positives = false_positives or [
        'pineapple', 'apple fruit', 'apple pie', 'grapple', 'big apple'
    ]

    # Preprocessing pipeline
    def _clean_headline(text):
        text = str(text).lower()
        text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
        text = re.sub(r'(\-\w+)\b', '', text)  # Remove source tags
        text = re.sub(r'\b(says|reports|according)\b.*$', '', text)
        return text.strip()
    
    processed = (
        df.loc[df['Ticker'] == ticker]
        .assign(headline=lambda x: x['headline'].str.split(' \| '))
        .explode('headline')
        .assign(cleaned=lambda x: x['headline'].apply(_clean_headline))
        .drop_duplicates(['Ticker', 'Date', 'cleaned'])
        .sort_values('Date')
    )

    # Relevance validation
    pos_pattern = r'\b(' + '|'.join(ticker_keywords) + r')\b'
    neg_pattern = r'\b(' + '|'.join(false_positives) + r')\b'
    
    processed['is_apple'] = (
        processed['cleaned']
        .str.contains(pos_pattern, regex=True)
        .astype(int)
    ) & (
        ~processed['cleaned']
        .str.contains(neg_pattern, regex=True)
        .astype(int)
    )

    processed = processed[processed['is_apple'] == 1]

    # Group and augment
    grouped = processed.groupby(['Ticker', 'Date']).agg(
        Headline_Count=('cleaned', 'count'),
        Headlines=('headline', list),
        Cleaned_Headlines=('cleaned', list),
        Mentioned_Tickers=('cleaned', 
            lambda x: list(set(re.findall(r'\b([A-Z]{2,4})\b', ' '.join(x))))
        )
    ).reset_index()

    return grouped.assign(
        First_Headline=lambda x: x['Headlines'].str[0],
        Date=lambda x: pd.to_datetime(x['Date'])
    )

# Customize for Apple with domain-specific keywords
apple_news_processed = clean_alpaca_news(
    df,
    custom_keywords=[
        'macbook', 'siri', 'face id', 'app store', 'airpods',
        'apple pay', 'homepod', 'carplay','Apple'
    ],
    false_positives=[
        'pineapple express', 'apple bank', 'snapple', 'appleby'
    ]
)

print(f"Cleaned Apple News Structure:")
print(apple_news_processed.iloc[0:3].to_string(max_colwidth=30))


  processed['cleaned']
  ~processed['cleaned']


Cleaned Apple News Structure:
  Ticker       Date  Headline_Count                      Headlines              Cleaned_Headlines Mentioned_Tickers                 First_Headline
0   AAPL 2015-01-02               3  [Argus Expects Apple To Re...  [argus expects apple to re...                []  Argus Expects Apple To Rep...
1   AAPL 2015-01-03               3  [European Apple Sites Now ...  [european apple sites now ...                []  European Apple Sites Now S...
2   AAPL 2015-01-05               3  [Is Marco Arment Correct I...  [is marco arment correct i...                []  Is Marco Arment Correct In...


In [39]:
apple_news_processed['Cleaned_Headlines'][0]

['argus expects apple to report strong q1 results, sees double digit eps growth through 2016',
 'hearing chatter of overheating in apple iphone 6',
 'study: only five percent of u.s. iphone users are very likely to buy apple watch']

In [37]:
final_df['headline'][0]

['Argus Expects Apple To Report Strong Q1 Results, Sees Double Digit EPS Growth Through 2016',
 'Hearing Chatter of Overheating in Apple iPhone 6',
 'Study: Only Five Percent of U.S. iPhone Users Are Very Likely To Buy Apple watch']

In [30]:
df = df.assign(headline=df['headline'].str.split(' \| ')).explode('headline')
df['headline'] = df['headline'].str.strip()
df['headline'] = df['headline'].str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces

def is_apple_related(text):
    text_lower = text.lower()
    # Positive indicators
    apple_keywords = r'\b(aapl|apple|iphone|ipad|mac|apple watch|ios|app store|tim cook)\b'
    # Negative indicators (common false positives)
    non_apple = r'\b(apples fruit|pineapple|applecare|applejack|grapple)\b'
    
    return 1 if (re.search(apple_keywords, text_lower) 
                and not re.search(non_apple, text_lower)) else 0

df['is_relevant'] = df['headline'].apply(is_apple_related)

# Get Apple-specific news
apple_mask = df['is_relevant'] == 1
apple_news = df[apple_mask].copy()

# Quality check: Validate non-matching cases
non_apple_samples = df[~apple_mask].sample(5)
print("Non-relevant samples validation:")
print(non_apple_samples[['Date', 'headline']])
# Group cleaned data by date
final_df = apple_news.groupby(['Ticker', 'Date']).agg({
    'headline': list,
    'is_relevant': 'sum'
}).reset_index()

# Add metadata columns
final_df['headline_count'] = final_df['headline'].str.len()
final_df['first_headline'] = final_df['headline'].str[0]

Non-relevant samples validation:
            Date                                           headline
128   2015-05-19                     Talking 'Applesphere' With UBS
1615  2020-03-27  Huawei Releases Latest Flagship Phone Amid Glo...
2419  2022-08-12  Have Tech Stocks Bottomed? World's Largest Hed...
2732  2023-06-29  US Federal Trade Commission Plans to Prosecute...
1405  2019-07-31  21 Technology Stocks Moving In Today's Pre-Mar...


In [14]:
df = df.assign(headline=df['headline'].str.split(' \| ')).explode('headline')
df

Unnamed: 0,Ticker,Date,headline
0,AAPL,2015-01-01,CNBC's Biggest Pops & Drops of 2014
1,AAPL,2015-01-02,Argus Expects Apple To Report Strong Q1 Result...
1,AAPL,2015-01-02,Hearing Chatter of Overheating in Apple iPhone 6
1,AAPL,2015-01-02,Study: Only Five Percent of U.S. iPhone Users ...
1,AAPL,2015-01-02,Companies To Watch As Music Streaming Explodes
...,...,...,...
3333,AAPL,2025-03-06,Comparing Apple With Industry Competitors In T...
3333,AAPL,2025-03-06,"SPY ETF Has Ruled For Decades, But Is It Still..."
3334,AAPL,2025-03-07,"Elizabeth Warren Wants To Know From Elon Musk,..."
3334,AAPL,2025-03-07,Will Your Next Smartphone Be 3-D Printed? Anal...


In [15]:
df['headline'] = df['headline'].str.strip()


In [16]:
df['headline']

0                     CNBC's Biggest Pops & Drops of 2014
1       Argus Expects Apple To Report Strong Q1 Result...
1        Hearing Chatter of Overheating in Apple iPhone 6
1       Study: Only Five Percent of U.S. iPhone Users ...
1          Companies To Watch As Music Streaming Explodes
                              ...                        
3333    Comparing Apple With Industry Competitors In T...
3333    SPY ETF Has Ruled For Decades, But Is It Still...
3334    Elizabeth Warren Wants To Know From Elon Musk,...
3334    Will Your Next Smartphone Be 3-D Printed? Anal...
Name: headline, Length: 35202, dtype: object

In [17]:
df['headline'] = df['headline'].str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces

In [18]:
df['headline']

0                     CNBC's Biggest Pops & Drops of 2014
1       Argus Expects Apple To Report Strong Q1 Result...
1        Hearing Chatter of Overheating in Apple iPhone 6
1       Study: Only Five Percent of U.S. iPhone Users ...
1          Companies To Watch As Music Streaming Explodes
                              ...                        
3333    Comparing Apple With Industry Competitors In T...
3333    SPY ETF Has Ruled For Decades, But Is It Still...
3334    Elizabeth Warren Wants To Know From Elon Musk,...
3334    Will Your Next Smartphone Be 3-D Printed? Anal...
Name: headline, Length: 35202, dtype: object

In [19]:
def is_apple_related(text):
    text_lower = text.lower()
    # Positive indicators
    apple_keywords = r'\b(aapl|apple|iphone|ipad|mac|apple watch|ios|app store|tim cook)\b'
    # Negative indicators (common false positives)
    non_apple = r'\b(apples fruit|pineapple|applecare|applejack|grapple)\b'
    
    return 1 if (re.search(apple_keywords, text_lower) 
                and not re.search(non_apple, text_lower)) else 0

df['is_relevant'] = df['headline'].apply(is_apple_related)

In [20]:
df

Unnamed: 0,Ticker,Date,headline,is_relevant
0,AAPL,2015-01-01,CNBC's Biggest Pops & Drops of 2014,0
1,AAPL,2015-01-02,Argus Expects Apple To Report Strong Q1 Result...,1
1,AAPL,2015-01-02,Hearing Chatter of Overheating in Apple iPhone 6,1
1,AAPL,2015-01-02,Study: Only Five Percent of U.S. iPhone Users ...,1
1,AAPL,2015-01-02,Companies To Watch As Music Streaming Explodes,0
...,...,...,...,...
3333,AAPL,2025-03-06,Comparing Apple With Industry Competitors In T...,1
3333,AAPL,2025-03-06,"SPY ETF Has Ruled For Decades, But Is It Still...",0
3334,AAPL,2025-03-07,"Elizabeth Warren Wants To Know From Elon Musk,...",0
3334,AAPL,2025-03-07,Will Your Next Smartphone Be 3-D Printed? Anal...,0


In [21]:
# Get Apple-specific news
apple_mask = df['is_relevant'] == 1
apple_news = df[apple_mask].copy()

# Quality check: Validate non-matching cases
non_apple_samples = df[~apple_mask].sample(5)
print("Non-relevant samples validation:")
print(non_apple_samples[['Date', 'headline']])

Non-relevant samples validation:
            Date                                           headline
2458  2022-09-20  US Fed Policy Meeting Among Biggest Macro Cata...
2035  2021-07-09  10 Information Technology Stocks With Unusual ...
2486  2022-10-18  T-Mobile US' Pricing Power Of 5G Is A Pivot Po...
3112  2024-07-24  Top 10 Trending Stocks On WallStreetBets As Of...
2776  2023-08-14  Berkshire Hathaway Is Betting On US Housing Ma...


In [22]:
apple_news

Unnamed: 0,Ticker,Date,headline,is_relevant
1,AAPL,2015-01-02,Argus Expects Apple To Report Strong Q1 Result...,1
1,AAPL,2015-01-02,Hearing Chatter of Overheating in Apple iPhone 6,1
1,AAPL,2015-01-02,Study: Only Five Percent of U.S. iPhone Users ...,1
2,AAPL,2015-01-03,European Apple Sites Now Show Watch 'Available...,1
2,AAPL,2015-01-03,"Weekly Highlights: Apple Pay Growth, Apple Wat...",1
...,...,...,...,...
3332,AAPL,2025-03-05,Assessing Apple's Performance Against Competit...,1
3332,AAPL,2025-03-05,Apple's Latest MacBook Air With M4 Chip Is Twi...,1
3333,AAPL,2025-03-06,"Apple Analyst (Ming-Chi Kuo Wrote Earlier, App...",1
3333,AAPL,2025-03-06,"Apple's Foldable AI iPhone Could Cost Over $2,...",1


In [23]:
# Group cleaned data by date
final_df = apple_news.groupby(['Ticker', 'Date']).agg({
    'headline': list,
    'is_relevant': 'sum'
}).reset_index()

# Add metadata columns
final_df['headline_count'] = final_df['headline'].str.len()
final_df['first_headline'] = final_df['headline'].str[0]

In [24]:
# Group cleaned data by date
final_df = apple_news.groupby(['Ticker', 'Date']).agg({
    'headline': list,
    'is_relevant': 'sum'
}).reset_index()

# Add metadata columns
final_df['headline_count'] = final_df['headline'].str.len()
final_df['first_headline'] = final_df['headline'].str[0]

In [25]:
def extract_mentioned_tickers(text):
    tickers = re.findall(r'\b([A-Z]{2,4})\b', text)
    return list(set(tickers)) if tickers else []

final_df['mentioned_tickers'] = final_df['headline'].apply(
    lambda x: [extract_mentioned_tickers(h) for h in x]
)

Unnamed: 0,Ticker,Date,headline,is_relevant,headline_count,first_headline,mentioned_tickers
0,AAPL,2015-01-02,[Argus Expects Apple To Report Strong Q1 Resul...,3,3,Argus Expects Apple To Report Strong Q1 Result...,"[[EPS], [], []]"
1,AAPL,2015-01-03,[European Apple Sites Now Show Watch 'Availabl...,3,3,European Apple Sites Now Show Watch 'Available...,"[[US], [], []]"
2,AAPL,2015-01-05,[Purported Photo Surfaces of 12-inch+ iPad Pro...,3,3,Purported Photo Surfaces of 12-inch+ iPad Pro,"[[], [], []]"
3,AAPL,2015-01-06,[Hearing Craig-Hallum Says Doesn't Believe IDT...,3,3,Hearing Craig-Hallum Says Doesn't Believe IDTI...,"[[IDTI], [], [CES]]"
4,AAPL,2015-01-07,[Monster Cable Products Sues Beats Electronics...,2,2,"Monster Cable Products Sues Beats Electronics,...","[[DJ], []]"
...,...,...,...,...,...,...,...
3099,AAPL,2025-03-02,"[Apple's $500 Billion Investment, Age Detectio...",2,2,"Apple's $500 Billion Investment, Age Detection...","[[AI], [AI]]"
3100,AAPL,2025-03-03,[Google And China's Honor Deepen AI Partnershi...,6,6,Google And China's Honor Deepen AI Partnership...,"[[AI], [UK], [], [], [AIR], []]"
3101,AAPL,2025-03-04,[Tim Cook Says 'There's Something In The Air' ...,5,5,Tim Cook Says 'There's Something In The Air' —...,"[[], [], [], [GCT], [UK]]"
3102,AAPL,2025-03-05,[Apple Challenges UK's 'Backdoor' Order In Lan...,4,4,Apple Challenges UK's 'Backdoor' Order In Land...,"[[UK], [AI], [], []]"
