In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import datetime
from tqdm import tqdm
import pytz
from matplotlib import pyplot as plt
import warnings
from sklearn.metrics import accuracy_score
from datetime import date
import random
from scipy.optimize import minimize

# Suppress FutureWarning messages for every cell that runs after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)
# tzinfo object for the "America/New_York" zone.
# NOTE(review): `tz` is not referenced by any cell visible here - confirm it is still needed.
tz = pytz.timezone("America/New_York")

In [2]:
yt_df = pd.read_csv('../Sentiment analysis/youtube_sentiments_2024.csv')

# Normalise "Upload date" to plain datetime.date objects.
# Only the text before the first space is kept (the calendar-day part), exactly
# as the previous row-by-row loop did, but vectorised:
#   * no module-level `date` variable is created any more, so the
#     `from datetime import date` import from the first cell is not clobbered;
#   * a missing upload date becomes NaT instead of raising AttributeError.
upload_day = yt_df['Upload date'].str.split(' ').str[0]
yt_df['Upload date'] = pd.to_datetime(upload_day).dt.date
yt_df

Unnamed: 0.1,Unnamed: 0,Video ID,Title,Channel,Upload date,Stock,Summary,Company,Symbol,Sentiment
0,0,YEoJq_PcOgc,What can reignite Apple shares?,CNBC Television,2024-02-28,1,i don't expect anything to happen at the share...,Apple,AAPL,2
1,1,BOm0zNiaNjg,Alphabet's AI problems: Stock falls 4%,CNBC Television,2024-02-26,3,The stock of alphabet is down 4%. The stock ne...,Alphabet,GOOGL,0
2,2,cTncTPylZQ8,Apple shares touch a 4-month low,CNBC Television,2024-03-05,1,i'm not particularly very bullish on the eye w...,Apple,AAPL,1
3,3,qAITe2Hn8Hc,"Three-Stock Lunch: Broadcom, Kroger & Costco",CNBC Television,2024-03-07,217,Kroger is up 23% over the past month. This has...,Kroger,KR,0
4,4,qAITe2Hn8Hc,"Three-Stock Lunch: Broadcom, Kroger & Costco",CNBC Television,2024-03-07,23,"finally, costco reporting results after the be...",Costco,COST,1
...,...,...,...,...,...,...,...,...,...,...
213,213,xbU_b4Pwank,"Target foresees sales rebound, plans new store...",Reuters,2024-03-06,107,target is aiming for a better year in 2024. on...,Target,TGT,2
214,214,q1DoDWjQkDk,Tesla steps up EV price war in China | REUTERS,Reuters,2024-03-01,11,tesla is stepping up a price war over electric...,Tesla,TSLA,2
215,215,gUwKeUkdBVw,Meta resolves issue after thousands report out...,NBC News,2024-03-05,5,thousands of meta users were not able to sign ...,Meta,META,0
216,216,xOqh-4THhF8,Texas Gov. Greg Abbott on border crisis and Tr...,ABC News,2024-03-06,37,,Abbott,ABT,1


In [5]:
def price_moving_level(symbol, date, window, threshold_price):
    """Classify the relative close-price change `window` rows after `date`.

    Daily history for `symbol` is downloaded with yfinance, starting one
    calendar day before `date`. The close on `date` is compared with the close
    found `window` rows later in that history.

    Parameters
    ----------
    symbol : str
        Ticker symbol passed to ``yf.Ticker``.
    date : datetime.date
        Reference day; it must appear in the downloaded history.
    window : int
        Number of history rows to look ahead from `date`.
    threshold_price : float
        Relative change that separates "flat" from "up"/"down".

    Returns
    -------
    int
        2  if the relative change is greater than ``threshold_price``,
        0  if it is lower than ``-threshold_price``,
        1  otherwise,
        -1 if the move cannot be computed (no history, `date` not in the
           history, the look-ahead row is outside the downloaded range, or a
           close price is missing/zero).
    """
    no_data = -1

    # 30 calendar days is the historical fetch range; widen it only for windows
    # that could not possibly fit, so results for small windows are unchanged.
    lookahead_days = max(30, 2 * window + 10)
    start_date = (date - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    end_date = (date + datetime.timedelta(days=lookahead_days)).strftime('%Y-%m-%d')

    df_stock = yf.Ticker(symbol).history(start=start_date, end=end_date).reset_index()
    if df_stock.empty:
        return no_data

    df_stock['date'] = pd.to_datetime(df_stock['Date']).dt.date

    # After reset_index() the index is 0..n-1, so labels double as positions.
    hits = df_stock.index[df_stock['date'] == date]
    if len(hits) == 0:
        return no_data

    base_pos = int(hits[0])
    target_pos = base_pos + window
    # Previously an out-of-range look-ahead raised KeyError from `.loc`;
    # report it as "no data" so callers can skip the sample.
    if target_pos < 0 or target_pos >= len(df_stock):
        return no_data

    base_close = df_stock['Close'].iloc[base_pos]
    later_close = df_stock['Close'].iloc[target_pos]
    # A missing close would otherwise fall through to the "flat" label.
    if pd.isna(base_close) or pd.isna(later_close) or base_close == 0:
        return no_data

    change = (later_close - base_close) / base_close

    if change > threshold_price:
        return 2
    if change < -threshold_price:
        return 0
    return 1


def evaluate_yt(channel, window, threshold_price):
    """Score how well a channel's sentiment labels match later price moves.

    For every (upload date, symbol) pair of `channel` in the module-level
    ``yt_df``, the rounded mean of the "Sentiment" column is compared with the
    label returned by ``price_moving_level`` for the same symbol and date.
    Pairs for which ``price_moving_level`` returns a negative value are skipped.

    Parameters
    ----------
    channel : str
        Value of the "Channel" column to evaluate.
    window : int
        Look-ahead forwarded to ``price_moving_level``.
    threshold_price : float
        Threshold forwarded to ``price_moving_level``.

    Returns
    -------
    tuple
        ``(acc, price_arr, sen_arr)`` where ``price_arr`` and ``sen_arr`` are
        the aligned lists of price labels and rounded sentiment values, and
        ``acc`` is ``accuracy_score(price_arr, sen_arr)``. When no pair could
        be evaluated ``acc`` is NaN and both lists are empty.
    """
    df_all = yt_df[yt_df['Channel'] == channel]
    dates = df_all['Upload date'].value_counts()

    sen_arr = []
    price_arr = []

    for upload_date, _ in tqdm(dates.items()):
        df_today = df_all[df_all['Upload date'] == upload_date]

        for sym, cnt_sym in df_today['Symbol'].value_counts().items():
            price_level = price_moving_level(sym, upload_date, window, threshold_price)
            if price_level < 0:
                # No usable price data for this pair.
                continue

            sen_sum = df_today.loc[df_today['Symbol'] == sym, 'Sentiment'].sum()
            # NOTE(review): round() sends exact halves to the nearest even
            # integer (0.5 -> 0, 1.5 -> 2) - confirm this is the intended rule.
            sen_arr.append(round(sen_sum / cnt_sym))
            price_arr.append(price_level)

    if not price_arr:
        # Nothing to score: return NaN directly instead of asking
        # accuracy_score to average an empty array (which emits warnings).
        return float('nan'), price_arr, sen_arr

    return accuracy_score(price_arr, sen_arr), price_arr, sen_arr

In [6]:
channels = yt_df['Channel'].value_counts()

# Evaluate every channel for look-ahead windows of 1..10 rows with a 1% price
# threshold. Records are collected in a list and turned into a DataFrame once:
# growing a frame with pd.concat inside the loop copies it on every iteration
# and concatenating onto an empty frame is what triggers pandas' FutureWarning.
eval_records = []
for timeframe in range(1, 11):
    for cha, cnt_cha in channels.items():
        acc, price_arr, sen_arr = evaluate_yt(cha, timeframe, 0.01)
        eval_records.append({'Channel': cha, 'Timeframe': timeframe, 'Accuracy': acc})

yt_eval_df = pd.DataFrame(eval_records, columns=['Channel', 'Timeframe', 'Accuracy'])
yt_eval_df

11it [00:09,  1.11it/s]
12it [00:05,  2.21it/s]
11it [00:02,  4.57it/s]
5it [00:00,  8.87it/s]
1it [00:00, 16.12it/s]
1it [00:00,  5.03it/s]
1it [00:00, 42.62it/s]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
11it [00:06,  1.74it/s]
12it [00:04,  2.72it/s]
11it [00:01,  5.64it/s]
5it [00:00,  9.03it/s]
1it [00:00, 13.20it/s]
1it [00:00,  6.15it/s]
1it [00:00, 26.14it/s]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
11it [00:05,  1.85it/s]
12it [00:04,  2.64it/s]
11it [00:02,  4.73it/s]
5it [00:00, 10.55it/s]
1it [00:00, 15.71it/s]
1it [00:00,  9.51it/s]
1it [00:00, 41.95it/s]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
11it [00:05,  1.85it/s]
12it [00:05,  2.32it/s]
11it [00:01,  5.52it/s]
5it [00:00,  9.62it/s]
1it [00:00, 17.56it/s]
1it [00:00,  7.56it/s]
1it [00:00, 39.22it/s]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
11it [00:06,  1.81it/s]
12it [00:04,  2.55it/s]
11it

Unnamed: 0,Channel,Timeframe,Accuracy
0,CNBC Television,1,0.388889
1,Yahoo Finance,1,0.500000
2,Bloomberg Television,1,0.500000
3,Reuters,1,0.125000
4,NBC News,1,0.000000
...,...,...,...
65,Bloomberg Television,10,0.230769
66,Reuters,10,0.250000
67,NBC News,10,0.000000
68,ABC News,10,0.000000


In [11]:
# Mean accuracy per channel over all evaluated timeframes.
channel_mean_acc = {
    cha: yt_eval_df.loc[yt_eval_df['Channel'] == cha, 'Accuracy'].mean()
    for cha in channels.keys()
}

# Rank channels from best to worst. NaN never compares as smaller or larger
# than a number, so sorting on the raw value gives an arbitrary order; the
# leading flag pins channels without a score to the end of the list.
yt_acc = sorted(
    channel_mean_acc.items(),
    key=lambda kv: (not pd.isna(kv[1]), 0.0 if pd.isna(kv[1]) else kv[1], kv[0]),
    reverse=True,
)
yt_acc

[('Bloomberg Television', 0.36923076923076925),
 ('Yahoo Finance', 0.3125),
 ('CNBC Television', 0.3069444444444444),
 ('ABC News', 0.3),
 ('Reuters', 0.225),
 ('NBC News', 0.2),
 ('CBS News', nan)]

In [12]:
# index=False: the RangeIndex carries no information, and writing it produces
# the stray "Unnamed: 0" columns seen when such a file is read back.
yt_eval_df.to_csv('youtube_eval_2024.csv', index=False)