In [54]:
from datetime import datetime, timedelta
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from stocksent import Sentiment
import torch
import torch.utils.data as Data
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import yahoo_fin.stock_info as si

In [None]:
snp500_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500'))
print(len(snp500_files))
nasdaq_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nasdaq')) - snp500_files
print(len(nasdaq_files))
nyse_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nyse')) - snp500_files
print(len(nyse_files))
nasdaq_nyse: set = nasdaq_files.union(nyse_files)
print(len(nasdaq_nyse))


In [None]:
initial_stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500/AMZN.csv')
initial_stock_data['Date'] = initial_stock_data['Date'].apply(lambda x: x[-4:])
annual_data = initial_stock_data.groupby('Date').mean()
annual_data['Annual Percent Change'] = annual_data['Close'].pct_change()
annual_data = annual_data.fillna(0)
print(annual_data.head(20))

In [None]:
news_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/toy_data/analyst_ratings_processed.csv', index_col=0)
news_data['date'] = news_data['date'].apply(lambda x: str(x)[:4])
print(news_data.head(30))


In [51]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)

def get_sentiment(input_text: str, model=model, tokenizer=tokenizer, device=device):

    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits.to(device)

    return torch.nn.Softmax(dim=1)(logits)[0].tolist()

In [None]:
news_by_stock = news_data.groupby('stock')
already_processed = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/stock_data'))
for stock in tqdm(news_by_stock):
    if stock[0] + '.csv' in already_processed:
        continue
    try:
        stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500/' + stock[0] + '.csv')
        label = 1
    except FileNotFoundError:
        try:
            stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nasdaq/' + stock[0] + '.csv')
            label = 0
        except FileNotFoundError:
            try:
                stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nyse/' + stock[0] + '.csv')
                label = 0
            except FileNotFoundError:
                continue
    stock_data['Date'] = stock_data['Date'].apply(lambda x: x[-4:])
    annual_data = stock_data.groupby('Date').mean()
    annual_data['Annual Percent Change'] = annual_data['Close'].pct_change()
    annual_data = annual_data.fillna(0)

    news_by_year = stock[1].groupby('date')
    sentiments_frame = pd.DataFrame(columns=['positive', 'negative', 'neutral'])
    sentiment_index = [year[0] for year in news_by_year]
    for year in news_by_year:
        stories = [year[1]['title'].iloc[:5].tolist()][0]

        sentiments = pd.DataFrame(columns=['positive', 'negative', 'neutral'], index=[year[0]])
        for story in stories:
            sentiment = get_sentiment(story)
            sentiments = sentiments.append({'positive': sentiment[0], 'negative': sentiment[1], 
                               'neutral': sentiment[2]}, ignore_index=True)
        
        mean_sentiments = sentiments.mean()
        sentiments_frame = sentiments_frame.append(mean_sentiments, ignore_index=True)
    sentiments_frame.index = sentiment_index
    combined_data = pd.concat([annual_data, sentiments_frame], axis=1)
    combined_data = combined_data.fillna(combined_data.mean())
    combined_data['Label'] = label
    combined_data = combined_data.reset_index(drop=True)
    combined_data.drop(combined_data[combined_data.Volume == 0].index, inplace=True)
    # combined_data.drop(columns=['Date'], inplace=True)
    # combined_data.to_csv(f'G:/My Drive/UdS/Classes/Data Science/stock_data/{stock[0]}.csv', index=False)
    break

In [None]:
prepared_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/stock_data')
#processed_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')

recent_year: dict = {}
for stock in tqdm(prepared_stock_list):
    if stock == 'desktop.ini':
        continue
    # if stock in processed_stock_list:
    #     continue
    stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/stock_data/' + stock)
    try:
        recent_year[stock[:-4]] = stock_data.iloc[-1]
    except IndexError:
        print(stock)
        print(stock_data)
        print(stock_data.head())
        continue
    # write to csv, truncating first and last line
    #stock_data[1:-1].to_csv('G:/My Drive/UdS/Classes/Data Science/truncated_data/' + stock, index=False)
    
recent_year_dataframe = pd.DataFrame.from_dict(recent_year, orient='index')

# recent_year_dataframe.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/toy_data/stonks.csv', index=True)

In [None]:
truncated_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')
#processed_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')

columns = ["Low","Open","Volume","High","Close","Adjusted Close","Annual Percent Change","positive","negative","neutral","Label"]
full_dataset = pd.DataFrame(columns=columns)
for stock in tqdm(truncated_stock_list):
    if stock == 'desktop.ini':
        continue
    stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/stock_data/' + stock)
    try:
        full_dataset = full_dataset.append(stock_data, ignore_index=True)
    except IndexError:
        print(stock)
        print(stock_data)
        print(stock_data.head())
        continue

In [None]:
y = full_dataset["Label"]
X = full_dataset.drop(columns=["Label"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80:20 train to test
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.5, random_state=1) # 50:50 test to dev > 80:10:10 train:dev:test

# write splits to individual csv files
# X_train.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_train.csv', index=False)
# X_dev.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_dev.csv', index=False)
# X_test.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_test.csv', index=False)
# y_train.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_train.csv', index=False)
# y_dev.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_dev.csv', index=False)
# y_test.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_test.csv', index=False)

### Reload data for training

In [2]:
# reload splits from csv files
X_train = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_train.csv')
X_dev = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_dev.csv')
X_test = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_test.csv')
y_train = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_train.csv')
y_dev = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_dev.csv')
y_test = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_test.csv')

# verify data integrity
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)
print(y_train.shape)
print(y_dev.shape)
print(y_test.shape)


(45859, 10)
(5733, 10)
(5732, 10)
(45859, 1)
(5733, 1)
(5732, 1)


In [62]:
# train = pd.concat([X_train, y_train], axis=1)
# dev = pd.concat([X_dev, y_dev], axis=1)
# test = pd.concat([X_test, y_test], axis=1)

# write splits to individual csv files
# train.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/train.csv', index=False)
# dev.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/dev.csv', index=False)
# test.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/test.csv', index=False)

# reload splits from csv files
train = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/train.csv')
dev = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/dev.csv')
test = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/test.csv')


In [63]:
print(train.shape)
print(dev.shape)
print(test.shape)

(45859, 11)
(5733, 11)
(5732, 11)


In [64]:
train_dataset = train.copy()
y_train_indices = train_dataset.index

y_train = [train_dataset.Label[i] for i in y_train_indices]
#y_train = train['Label']

class_sample_count = np.array(
    [len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
print(class_sample_count)
print(y_train)
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in y_train])
samples_weight = torch.from_numpy(samples_weight)
sampler = Data.WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight))

train_dataset.drop(columns=["Label"], inplace=True)
print(train_dataset.head())
train_dataset = Data.TensorDataset(torch.tensor(train_dataset.values), torch.tensor(y_train))
print(train_dataset[0])

[39783  6076]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0

In [148]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# train_X = torch.tensor(X_train.values, dtype=torch.float32)
# train_y = torch.tensor(y_train.values, dtype=torch.long)

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        super(MyDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

# traindata = MyDataset(train_X, train_y)
# trainloader = torch.utils.data.DataLoader(traindata, batch_size=64, shuffle=True)

train_dataloader = Data.DataLoader(train_dataset, batch_size=64, sampler=sampler, num_workers=0)


model = torch.nn.Sequential(
        torch.nn.Linear(10, 50),
        torch.nn.Sigmoid(),
        torch.nn.Linear(50, 1),
        torch.nn.Softmax()
)

model.train()

#loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
ep_log_interval = 100
loss_fn = torch.nn.MSELoss()
num_epochs = 100


for n in range(num_epochs):
    epoch_loss = 0.0
    for (batch_idx, batch) in enumerate(train_dataloader):
        X = batch[0].to(torch.float32)  # inputs
        Y = batch[1].to(torch.float32)  # correct class/label/politics
        optimizer.zero_grad()
        oupt = model(X).squeeze(1).to(torch.float32)  # 1D tensor for each sample
        loss_val = loss_fn(oupt, Y)  # a tensor
        epoch_loss += loss_val.item()  # accumulate
        loss_val.backward()
        optimizer.step()
    print("epoch = %d, loss = %f" % (n, epoch_loss))

# for n in tqdm(range(num_epochs)):
#     for X, y in train_dataloader:
#         X = X.to(torch.float32)
#         y = y
#         y_pred = model(X).squeeze(1)
#         loss = loss_fn(y_pred, y).item()
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

print(model)

torch.save(model, "G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/models/multilayer_model.pickle")

  input = module(input)


epoch = 0, loss = 360.509375
epoch = 1, loss = 358.152232
epoch = 2, loss = 357.876339
epoch = 3, loss = 361.764286
epoch = 4, loss = 359.485714
epoch = 5, loss = 356.331696
epoch = 6, loss = 358.493304
epoch = 7, loss = 359.175446
epoch = 8, loss = 356.300893
epoch = 9, loss = 358.951786
epoch = 10, loss = 358.709821
epoch = 11, loss = 360.141518
epoch = 12, loss = 355.360714
epoch = 13, loss = 358.074107
epoch = 14, loss = 356.540179
epoch = 15, loss = 360.396875
epoch = 16, loss = 360.160268
epoch = 17, loss = 360.133929
epoch = 18, loss = 355.209821
epoch = 19, loss = 357.629018
epoch = 20, loss = 358.366071


KeyboardInterrupt: 

In [117]:
# dev_X = torch.tensor(X_dev.values, dtype=torch.float32)
# dev_y = torch.tensor(y_dev.values, dtype=torch.long)

dev_data = dev.copy()

y_dev = [dev_data.Label[i] for i in dev_data.index]
dev_data.drop(columns=["Label"], inplace=True)
dev_dataset = Data.TensorDataset(torch.tensor(dev_data.values), torch.tensor(y_dev))

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        super(MyDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]


dev_loader = Data.DataLoader(dev_dataset, batch_size=64)

model = torch.load("G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/models/multilayer_model.pickle")
print(model)
model.eval()

dev_predictions = pd.DataFrame(columns=['y_pred', 'y_true'])
with torch.no_grad():
    for X, y in dev_loader:
        X = X.to(torch.float32)
        y = y
        y_pred = model(X).squeeze(1)
        y_frame = pd.DataFrame({'y_pred': y_pred, 'y_true': y})
        dev_predictions = dev_predictions.append(y_frame, ignore_index=True)

confusion = pd.crosstab(dev_predictions['y_true'], dev_predictions['y_pred'], rownames=['True'], 
                        colnames=['Predicted'], margins=True)
print(confusion)

try:
    accuracy = (confusion[0][0] + confusion[1][1]) / confusion['All']['All']
except KeyError:
    accuracy = (confusion[0][0]) / confusion['All']['All']
print("accuracy:", accuracy)
try:
    precision = confusion[1][1] / confusion['All'][1]
except KeyError:
    precision = 0
print("precision:", precision)
try:
    recall = confusion[1][1] / confusion[1]['All']
except KeyError:
    recall = 0
print("recall:", recall)
try:
    f1 = 2 * (precision * recall) / (precision + recall)
except ZeroDivisionError:
    f1 = 0
print("f1:", f1)

Sequential(
  (0): Linear(in_features=10, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=1, bias=True)
  (5): LogSoftmax(dim=1)
)
Predicted   0.0   All
True                 
0          4970  4970
1           763   763
All        5733  5733
accuracy: 0.8669108669108669
precision: 0
recall: 0
f1: 0


In [71]:
def get_stock_data(ticker: str) -> list:
    today: datetime = datetime.today()
    yesteryear: str = (today - timedelta(days=345)).strftime('%Y-%m-%d')
    column_names: "list[str]" = ["low","open","volume","high","close","adjclose","Annual Percent Change","positive","negative","neutral"]

    stock_news = Sentiment("AAPL")
    sentiment_score = stock_news.get_dataframe(days=1)
    stories = sentiment_score['headline'].tolist()[:10]
    sentiments: "list[float]" = []

    for story in stories:
        sentiment = get_sentiment(story)
        sentiments.append(sentiment)

    sentiments_df = pd.DataFrame(sentiments, columns=['positive', 'negative', 'neutral'])
    mean_sentiments: pd.Series = sentiments_df.mean()
    stock_sentiment: "list[float]" = mean_sentiments.values.tolist()

    ticker_info: pd.DataFrame = si.get_data(ticker, start_date=yesteryear, end_date=today, interval="1mo")
    ticker_info = ticker_info.drop(columns=['ticker'])
    ticker_info['Annual Percent Change'] = (ticker_info.iloc[-1]['close'] - ticker_info.iloc[0]['close']) / ticker_info.iloc[0]['close']
    annualized_ticker_info: pd.Series = ticker_info.mean()

    annualized_ticker_info['positive'] = stock_sentiment[0]
    annualized_ticker_info['negative'] = stock_sentiment[1]
    annualized_ticker_info['neutral'] = stock_sentiment[2]

    annualized_ticker_info = annualized_ticker_info[column_names] # reorder columns

    return annualized_ticker_info.values.tolist()


In [72]:
print(get_stock_data("AAPL"))

  news_df['date'] = pd.to_datetime(news_df.date).dt.date


[148.56334114074707, 156.36500040690103, 1357570592.6666667, 166.29083379109701, 158.2683308919271, 157.8627700805664, 0.17008017609430517, 0.15964583940804006, 0.3270332055166364, 0.513320953771472]
