In [27]:
import pandas as pd
from textblob import TextBlob

In [28]:
# --- CNBC ---
cnbc = pd.read_csv("../data/headlines/cnbc_headlines.csv")

# Rename Columns
cnbc = cnbc.rename(columns={"Headlines": "headline", "Time": "date"})

# Clean and Parse Dates
cnbc['date'] = cnbc['date'].str.replace("ET", "", regex=False).str.strip()
cnbc['date'] = pd.to_datetime(cnbc['date'], errors='coerce', dayfirst=True)

# Add source column
cnbc['source'] = "CNBC"

# --- Guardian ---
guardian = pd.read_csv("../data/headlines/guardian_headlines.csv")
guardian = guardian.rename(columns={"Headlines": "headline", "Time": "date"})
guardian['date'] = pd.to_datetime(guardian['date'], errors='coerce', dayfirst=True)
guardian['source'] = "Guardian"

# --- Reuters ---
reuters = pd.read_csv("../data/headlines/reuters_headlines.csv")
reuters = reuters.rename(columns={"Headlines": "headline", "Time": "date"})
reuters['date'] = pd.to_datetime(reuters['date'], errors='coerce', dayfirst=True)
reuters['source'] = "Reuters"

# --- Combine ---
all_news = pd.concat([cnbc, guardian, reuters], ignore_index=True)
all_news = all_news.dropna(subset=['date'])

print(all_news.head())


                                            headline                date  \
0  Jim Cramer: A better way to invest in the Covi... 2020-07-17 19:51:00   
1     Cramer's lightning round: I would own Teradyne 2020-07-17 19:33:00   
3  Cramer's week ahead: Big week for earnings, ev... 2020-07-17 19:25:00   
4  IQ Capital CEO Keith Bliss says tech and healt... 2020-07-17 16:24:00   
5  Wall Street delivered the 'kind of pullback I'... 2020-07-16 19:36:00   

                                         Description source  
0  "Mad Money" host Jim Cramer recommended buying...   CNBC  
1  "Mad Money" host Jim Cramer rings the lightnin...   CNBC  
3  "We'll pay more for the earnings of the non-Co...   CNBC  
4  Keith Bliss, IQ Capital CEO, joins "Closing Be...   CNBC  
5  "Look for the stocks of high-quality companies...   CNBC  


  cnbc['date'] = pd.to_datetime(cnbc['date'], errors='coerce', dayfirst=True)
  guardian['date'] = pd.to_datetime(guardian['date'], errors='coerce', dayfirst=True)


In [29]:
# Ensure dates match stock trading days as news after 4PM affects the next day
all_news['date'] = all_news['date'].dt.floor('d')  

# Sentiment Analysis using textblob to get sentiment score for each headline
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity  # between -1 and 1

all_news['sentiment'] = all_news['headline'].astype(str).apply(get_sentiment)

# Aggregate sentiment by date
daily_sentiment = all_news.groupby('date')['sentiment'].mean()


In [32]:
import yfinance as yf

stock = yf.download("^GSPC", start=all_news['date'].min(), end=all_news['date'].max())

# If MultiIndex (e.g., multiple tickers), flatten it
if isinstance(stock.columns, pd.MultiIndex):
    stock.columns = ['_'.join(col).strip() for col in stock.columns.values]

# Now, pick the right Close column
close_col = [col for col in stock.columns if "Close" in col][0]  # first match
stock['return'] = stock[close_col].pct_change()

# Merge with daily sentiment
merged = stock.merge(daily_sentiment, left_index=True, right_index=True, how='left')
print(merged.head())

  stock = yf.download("^GSPC", start=all_news['date'].min(), end=all_news['date'].max())
[*********************100%***********************]  1 of 1 completed

            Close_^GSPC   High_^GSPC    Low_^GSPC   Open_^GSPC  Volume_^GSPC  \
Date                                                                           
2017-12-18  2690.159912  2694.969971  2685.919922  2685.919922    3727770000   
2017-12-19  2681.469971  2694.439941  2680.739990  2692.709961    3407680000   
2017-12-20  2679.250000  2691.010010  2676.110107  2688.179932    3246230000   
2017-12-21  2684.570068  2692.639893  2682.399902  2683.020020    3293130000   
2017-12-22  2683.340088  2685.350098  2678.129883  2684.219971    2401030000   

              return  sentiment  
Date                             
2017-12-18       NaN   0.010788  
2017-12-19 -0.003230   0.000777  
2017-12-20 -0.000828   0.011461  
2017-12-21  0.001986   0.009790  
2017-12-22 -0.000458   0.026984  





In [31]:
print(all_news['headline'].head(20))


0     Jim Cramer: A better way to invest in the Covi...
1        Cramer's lightning round: I would own Teradyne
3     Cramer's week ahead: Big week for earnings, ev...
4     IQ Capital CEO Keith Bliss says tech and healt...
5     Wall Street delivered the 'kind of pullback I'...
6     Cramer's lightning round: I would just stay lo...
7     Acorns CEO: Parents can turn $5 into five figu...
8     Dividend cuts may mean rethinking your retirem...
10    StockX has authenticated 1 million Jordan snea...
11    Biohaven Pharmaceuticals lands Khloe Kardashia...
12         Cramer's lightning round: I like Beyond Meat
13    Cramer: We desperately need another round of f...
14            Cramer's lightning round: Buy more VMware
15    Wall Street did something 'highly unusual' in ...
16    Charts suggest the S&P 500 climb will stall ou...
17    Salesforce's Marc Benioff: Face masks can end ...
18    Crown Castle's 'good story can get even better...
19    Ellevest's Sallie Krawcheck says the econo