### 处理tweet及其情感

In [59]:
import pandas as pd
import os
from pytz import timezone
import datetime

# Directory the daily tweet-sentiment CSV is read from.
tweets_sent_fpath = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/sents'

# The file to load is named after today's date (US/Eastern) as YYYYMMDD.
nowdate = datetime.datetime.now(timezone('US/Eastern'))
now = nowdate.strftime("%Y%m%d")
fname = '{}.csv'.format(now)

# Every column is read as a string. The first CSV column is used as the index
# on load and then thrown away in favour of a fresh 0..n-1 index.
df = pd.read_csv(
    os.path.join(tweets_sent_fpath, fname),
    index_col=0,
    dtype=str,
)
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(714, 7)


Unnamed: 0,Date,Hour,neg,neu,pos,compound,tweet
0,20211202,0,0.0,1.0,0.0,0.0,"I think what happened, here, is $TSLA original..."
1,20211202,0,0.075,0.796,0.129,0.2732,RT @markets: BREAKING: Apple tells suppliers i...
2,20211202,0,0.224,0.581,0.195,-0.368,This is only making ppl want their Cybertruck ...
3,20211202,0,0.114,0.716,0.17,0.2732,RT @Investingcom: *APPLE TELLS SUPPLIERS IPHON...
4,20211202,0,0.0,1.0,0.0,0.0,RT @RJRCapital: Wait...you fucking said this a...


In [60]:
import re

def findall_index(tweet):
    """Extract the ticker symbols written as cashtags (e.g. ``$TSLA``) in a tweet.

    A cashtag is a ``$`` immediately followed by one or more ASCII letters.
    Letters of either case are accepted and the symbol is upper-cased, so
    ``$tsla`` and ``$Tsla`` both yield ``'TSLA'`` instead of being dropped or
    truncated at the first lower-case letter.

    Args:
        tweet: The tweet text to scan.

    Returns:
        A list of upper-cased symbols without the leading ``$``, in order of
        appearance. Repeated cashtags are kept as repeated entries.
    """
    # The capture group excludes the '$', so no slicing is needed afterwards.
    return [symbol.upper() for symbol in re.findall(r"\$([A-Za-z]+)", tweet)]

# Load the NASDAQ-100 ticker symbols (one symbol per line).
index_fpth = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/NASDAQ100.txt'
with open(index_fpth, 'r') as f:
    # Strip all surrounding whitespace (not only '\n') and skip blank lines so
    # padded or empty lines cannot produce symbols that never match.
    NASDAQ100 = [line.strip() for line in f if line.strip()]

# Set copy of the ticker list for constant-time membership tests in the loop.
nasdaq100_lookup = set(NASDAQ100)

# Emit one output row per (tweet, NASDAQ-100 cashtag) pair, carrying the
# tweet's Date, Hour and compound sentiment score along with the symbol.
data_colnames = ['stock', 'Date', 'Hour', 'compound']
data_rows = []
for tweet, date, hour, compound in zip(df['tweet'], df['Date'], df['Hour'], df['compound']):
    # A missing tweet is read back as NaN rather than a string; skip it
    # instead of letting the regex search raise TypeError.
    if not isinstance(tweet, str):
        continue

    # Tweets without any cashtag simply contribute no rows.
    for stock_index in findall_index(tweet):
        if stock_index in nasdaq100_lookup:
            data_rows.append([stock_index, date, hour, compound])

data_df = pd.DataFrame(data_rows, columns=data_colnames)
print(data_df.shape)

# Persist today's processed rows; the index is written as the first CSV column.
data_df.to_csv('/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/processed_tweet_sent/{}.csv'.format(now))
data_df.head(5)

(538, 4)


Unnamed: 0,stock,Date,Hour,compound
0,TSLA,20211202,0,0.0
1,AAPL,20211202,0,0.2732
2,AAPL,20211202,0,0.2732
3,AAPL,20211202,0,0.0
4,STX,20211202,0,0.9186


### 制作当日数据集

In [57]:
import pandas as pd
import os
from pytz import timezone
import datetime

# Directory the daily stock CSV is read from.
stock_fpath = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/streaming_stock'

# Load today's stock data; the file is named after today's date (US/Eastern)
# formatted as YYYYMMDD.
nowdate = datetime.datetime.now(timezone('US/Eastern'))
now = nowdate.strftime("%Y%m%d")
fname = '{}.csv'.format(now)

# The first CSV column is used as the index on load, then replaced by a fresh
# 0..n-1 index.
stock_df = pd.read_csv(
    os.path.join(stock_fpath, fname),
    index_col=0,
)
stock_df = stock_df.reset_index(drop=True)
print(stock_df.shape)
stock_df.head()

(82, 10)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,index,prev_label
0,20211202,55.150002,56.609901,55.080002,56.150002,21627407,0.0,0,CSCO,True
1,20211202,26.91,27.608999,26.91,27.280001,3166893,0.0,0,LBTYK,True
2,20211202,201.820007,208.155396,201.509995,204.449997,3088876,0.0,0,VRTX,False
3,20211202,69.169998,70.285004,68.830002,69.669998,9411184,0.0,0,GILD,True
4,20211202,32.380001,33.18,32.110001,32.91,1119870,0.0,0,FOX,True


In [81]:
# Load today's processed tweet-sentiment rows (one row per stock mention).
processed_fpath = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/processed_tweet_sent'
processed_df = (
    pd.read_csv(os.path.join(processed_fpath, fname), index_col=0)
    .reset_index(drop=True)
    # Hour is converted to text so it can be compared against string hour labels.
    .assign(Hour=lambda frame: frame['Hour'].astype(str))
)

print(processed_df.shape)
processed_df.head()

(538, 4)


Unnamed: 0,stock,Date,Hour,compound
0,TSLA,20211202,0,0.0
1,AAPL,20211202,0,0.2732
2,AAPL,20211202,0,0.2732
3,AAPL,20211202,0,0.0
4,STX,20211202,0,0.9186


In [83]:
import numpy as np

# Build one feature row per stock: the overall mean compound score ('avg')
# plus the mean compound score for each hour label '0'..'23'.
hour_labels = [str(hour) for hour in range(24)]
feature_lists = {name: [] for name in ['stock', 'Date', 'avg'] + hour_labels}

for stock, group in processed_df.groupby('stock'):
    feature_lists['stock'].append(stock)
    feature_lists['Date'].append(now)
    feature_lists['avg'].append(np.mean(group['compound'].values))

    for label in hour_labels:
        in_hour = group['Hour'] == label
        if in_hour.any():
            hourly_mean = np.mean(group.loc[in_hour, 'compound'].values)
        else:
            # Hours with no rows for this stock are recorded as 0.
            hourly_mean = 0
        feature_lists[label].append(hourly_mean)

X_df = pd.DataFrame.from_dict(feature_lists)
print(X_df.shape)
X_df.head()

(41, 27)


Unnamed: 0,stock,Date,avg,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,AAL,20211202,0.480343,0.4404,0,0,0,0,0,0,...,0,0,0.487,0,0,0,0,0,0,0
1,AAPL,20211202,0.095788,0.135318,0,0,0,0,0,0,...,0,0,0.082807,0,0,0,0,0,0,0
2,ADBE,20211202,0.22088,0.5994,0,0,0,0,0,0,...,0,0,0.12625,0,0,0,0,0,0,0
3,ADP,20211202,0.6124,0.6124,0,0,0,0,0,0,...,0,0,0.6124,0,0,0,0,0,0,0
4,ADSK,20211202,0.5994,0.5994,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,0,0,0


In [84]:
# Cast the join keys to strings on both frames so they compare equal in the merge.
X_df[['stock', 'Date']] = X_df[['stock', 'Date']].astype(str)
stock_df[['index', 'Date']] = stock_df[['index', 'Date']].astype(str)

# Attach each stock's prev_label by matching (stock, Date) against (index, Date).
label_columns = stock_df[['index', 'Date', 'prev_label']]
merged = X_df.merge(
    label_columns,
    left_on=['stock', 'Date'],
    right_on=['index', 'Date'],
    suffixes=(None, '_y'),
)

# Right-hand columns that clash with existing names receive the '_y' suffix;
# keep only the columns whose name does not contain '_y'.
X_df = merged.filter(regex='^(?!.*_y)')

print(X_df.shape)
X_df.head()

(41, 29)


Unnamed: 0,stock,Date,avg,0,1,2,3,4,5,6,...,16,17,18,19,20,21,22,23,index,prev_label
0,AAL,20211202,0.480343,0.4404,0,0,0,0,0,0,...,0.487,0,0,0,0,0,0,0,AAL,True
1,AAPL,20211202,0.095788,0.135318,0,0,0,0,0,0,...,0.082807,0,0,0,0,0,0,0,AAPL,False
2,ADBE,20211202,0.22088,0.5994,0,0,0,0,0,0,...,0.12625,0,0,0,0,0,0,0,ADBE,False
3,ADP,20211202,0.6124,0.6124,0,0,0,0,0,0,...,0.6124,0,0,0,0,0,0,0,ADP,True
4,ADSK,20211202,0.5994,0.5994,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,ADSK,True


In [85]:
# Preview the rows whose hour-'0' value is strictly positive.
positive_hour0 = X_df['0'] > 0
X_df.loc[positive_hour0].head()

Unnamed: 0,stock,Date,avg,0,1,2,3,4,5,6,...,16,17,18,19,20,21,22,23,index,prev_label
0,AAL,20211202,0.480343,0.4404,0,0,0,0,0,0,...,0.487,0,0,0,0,0,0,0,AAL,True
1,AAPL,20211202,0.095788,0.135318,0,0,0,0,0,0,...,0.082807,0,0,0,0,0,0,0,AAPL,False
2,ADBE,20211202,0.22088,0.5994,0,0,0,0,0,0,...,0.12625,0,0,0,0,0,0,0,ADBE,False
3,ADP,20211202,0.6124,0.6124,0,0,0,0,0,0,...,0.6124,0,0,0,0,0,0,0,ADP,True
4,ADSK,20211202,0.5994,0.5994,0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0,ADSK,True


### 预测明日开盘

In [14]:
import pandas as pd
import numpy as np
import pickle as pkl
import datetime
from pytz import timezone

# Load today's dataset; the file is named after today's date (US/Eastern)
# formatted as YYYYMMDD.
nowdate = datetime.datetime.now(timezone('US/Eastern'))
now = nowdate.strftime("%Y%m%d")

data_fpath = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/data/daily/{}.csv'.format(now)
X_df = pd.read_csv(data_fpath, index_col=0).reset_index(drop=True)

# Select the feature columns fed to the model.
feat_colnames = ['avg', 'prev_rate']
X = X_df[feat_colnames].values

# Load the saved model. The file is opened in a `with` block so the handle is
# closed even if unpickling fails.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
model_fpath = '/Users/sow/Documents/CU/FA21/EECS6893_BDA/PJ/model/avg_svm.sav'
with open(model_fpath, 'rb') as model_file:
    loaded_model = pkl.load(model_file)

# Column 1 of predict_proba is kept as the score for each row
# (presumably the probability of the positive class -- confirm against the
# model's class ordering).
result = loaded_model.predict_proba(X)[:, 1]

# Build the result table: one row per stock, highest score first.
res_df = X_df[['stock', 'Date']].copy()
res_df['pred_tomorrow'] = result
res_df = res_df.sort_values(by='pred_tomorrow', ascending=False).reset_index(drop=True)

res_df.head()

Unnamed: 0,stock,Date,pred_tomorrow
0,DISCA,20211202,0.564263
1,ATVI,20211202,0.560852
2,JD,20211202,0.560164
3,AAPL,20211202,0.559816
4,ILMN,20211202,0.559591


In [19]:
import datetime
# A fixed date is used here; the live alternative is kept for reference:
# today = datetime.datetime.now(timezone('US/Eastern'))
today = datetime.date(2021, 12, 3)

# weekday() is 0 for Monday .. 6 for Sunday. From Friday, Saturday or Sunday
# jump ahead to the following Monday; otherwise advance a single day.
if today.weekday() > 3:
    days_ahead = 7 - today.weekday()
else:
    days_ahead = 1
nextday = today + datetime.timedelta(days=days_ahead)
print(nextday)

2021-12-06
