In [70]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
snp500_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500'))
print(len(snp500_files))
nasdaq_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nasdaq')) - snp500_files
print(len(nasdaq_files))
nyse_files: set = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nyse')) - snp500_files
print(len(nyse_files))
nasdaq_nyse: set = nasdaq_files.union(nyse_files)
print(len(nasdaq_nyse))


In [None]:
initial_stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500/AMZN.csv')
initial_stock_data['Date'] = initial_stock_data['Date'].apply(lambda x: x[-4:])
annual_data = initial_stock_data.groupby('Date').mean()
annual_data['Annual Percent Change'] = annual_data['Close'].pct_change()
annual_data = annual_data.fillna(0)
print(annual_data.head(20))

In [None]:
news_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/toy_data/analyst_ratings_processed.csv', index_col=0)
news_data['date'] = news_data['date'].apply(lambda x: str(x)[:4])
print(news_data.head(30))


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)

def get_sentiment(input_text: str, model=model, tokenizer=tokenizer, device=device):

    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits.to(device)

    return torch.nn.Softmax(dim=1)(logits)[0].tolist()

In [None]:
news_by_stock = news_data.groupby('stock')
already_processed = set(os.listdir('G:/My Drive/UdS/Classes/Data Science/stock_data'))
for stock in tqdm(news_by_stock):
    if stock[0] + '.csv' in already_processed:
        continue
    try:
        stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/snp500/' + stock[0] + '.csv')
        label = 1
    except FileNotFoundError:
        try:
            stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nasdaq/' + stock[0] + '.csv')
            label = 0
        except FileNotFoundError:
            try:
                stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/nasdaq_snp500_nyse/nyse/' + stock[0] + '.csv')
                label = 0
            except FileNotFoundError:
                continue
    stock_data['Date'] = stock_data['Date'].apply(lambda x: x[-4:])
    annual_data = stock_data.groupby('Date').mean()
    annual_data['Annual Percent Change'] = annual_data['Close'].pct_change()
    annual_data = annual_data.fillna(0)

    news_by_year = stock[1].groupby('date')
    sentiments_frame = pd.DataFrame(columns=['positive', 'negative', 'neutral'])
    sentiment_index = [year[0] for year in news_by_year]
    for year in news_by_year:
        stories = [year[1]['title'].iloc[:5].tolist()][0]

        sentiments = pd.DataFrame(columns=['positive', 'negative', 'neutral'], index=[year[0]])
        for story in stories:
            sentiment = get_sentiment(story)
            sentiments = sentiments.append({'positive': sentiment[0], 'negative': sentiment[1], 
                               'neutral': sentiment[2]}, ignore_index=True)
        
        mean_sentiments = sentiments.mean()
        sentiments_frame = sentiments_frame.append(mean_sentiments, ignore_index=True)
    sentiments_frame.index = sentiment_index
    combined_data = pd.concat([annual_data, sentiments_frame], axis=1)
    combined_data = combined_data.fillna(combined_data.mean())
    combined_data['Label'] = label
    combined_data = combined_data.reset_index(drop=True)
    combined_data.drop(combined_data[combined_data.Volume == 0].index, inplace=True)
    # combined_data.drop(columns=['Date'], inplace=True)
    # combined_data.to_csv(f'G:/My Drive/UdS/Classes/Data Science/stock_data/{stock[0]}.csv', index=False)
    break

In [None]:
prepared_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/stock_data')
#processed_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')

recent_year: dict = {}
for stock in tqdm(prepared_stock_list):
    if stock == 'desktop.ini':
        continue
    # if stock in processed_stock_list:
    #     continue
    stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/stock_data/' + stock)
    try:
        recent_year[stock[:-4]] = stock_data.iloc[-1]
    except IndexError:
        print(stock)
        print(stock_data)
        print(stock_data.head())
        continue
    # write to csv, truncating first and last line
    #stock_data[1:-1].to_csv('G:/My Drive/UdS/Classes/Data Science/truncated_data/' + stock, index=False)
    
recent_year_dataframe = pd.DataFrame.from_dict(recent_year, orient='index')

# recent_year_dataframe.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/toy_data/stonks.csv', index=True)

In [None]:
truncated_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')
#processed_stock_list = os.listdir('G:/My Drive/UdS/Classes/Data Science/truncated_data')

columns = ["Low","Open","Volume","High","Close","Adjusted Close","Annual Percent Change","positive","negative","neutral","Label"]
full_dataset = pd.DataFrame(columns=columns)
for stock in tqdm(truncated_stock_list):
    if stock == 'desktop.ini':
        continue
    stock_data = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/stock_data/' + stock)
    try:
        full_dataset = full_dataset.append(stock_data, ignore_index=True)
    except IndexError:
        print(stock)
        print(stock_data)
        print(stock_data.head())
        continue

In [None]:
y = full_dataset["Label"]
X = full_dataset.drop(columns=["Label"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80:20 train to test
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.5, random_state=1) # 50:50 test to dev > 80:10:10 train:dev:test

# write splits to individual csv files
# X_train.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_train.csv', index=False)
# X_dev.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_dev.csv', index=False)
# X_test.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_test.csv', index=False)
# y_train.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_train.csv', index=False)
# y_dev.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_dev.csv', index=False)
# y_test.to_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_test.csv', index=False)

### Reload data for training

In [34]:
# reload splits from csv files
X_train = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_train.csv')
X_dev = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_dev.csv')
X_test = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/X_test.csv')
y_train = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_train.csv')
y_dev = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_dev.csv')
y_test = pd.read_csv('G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/finalized_data/y_test.csv')

# verify data integrity
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)
print(y_train.shape)
print(y_dev.shape)
print(y_test.shape)


(45859, 10)
(5733, 10)
(5732, 10)
(45859, 1)
(5733, 1)
(5732, 1)


In [95]:
import torch
import torch.nn as nn
import torch.utils.data as Data

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_X = torch.tensor(X_train.values, dtype=torch.float32)
train_y = torch.tensor(y_train.values, dtype=torch.long)

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        super(MyDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

traindata = MyDataset(train_X, train_y)
trainloader = torch.utils.data.DataLoader(traindata, batch_size=64, shuffle=True)


model = nn.Sequential(
        nn.Linear(10, 100),
        nn.ReLU(),
        nn.Linear(100, 50),
        nn.ReLU(),
        nn.Linear(50, 10),
        nn.ReLU(),
        nn.Linear(10, 1),
        nn.Sigmoid()
)

model.train()

#loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for n in range(num_epochs):
    for X, y in trainloader:
        X = X
        y = y.squeeze(1)
        y_pred = model(X).squeeze(1)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print(model)

torch.save(model, "G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/models/multilayer_model.pickle")

Sequential(
  (0): Linear(in_features=10, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=10, bias=True)
  (5): ReLU()
  (6): Linear(in_features=10, out_features=1, bias=True)
  (7): Sigmoid()
)


In [97]:
import torch
import torch.nn as nn
import torch.utils.data as Data

dev_X = torch.tensor(X_dev.values, dtype=torch.float32)
dev_y = torch.tensor(y_dev.values, dtype=torch.long)

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        super(MyDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

devdata = MyDataset(dev_X, dev_y)
devloader = torch.utils.data.DataLoader(devdata, batch_size=64, shuffle=True)

model = torch.load("G:/My Drive/UdS/Classes/Data Science/DS-Miniproject/models/multilayer_model.pickle")
print(model)
model.eval()

dev_predictions = pd.DataFrame(columns=['y_pred', 'y_true'])
with torch.no_grad():
    for X, y in devloader:
        X = X
        y = y.squeeze(1).tolist()
        y_pred = [int(y_val) for y_val in model(X).squeeze(1).tolist()]
        y_frame = pd.DataFrame({'y_pred': y_pred, 'y_true': y})
        dev_predictions = dev_predictions.append(y_frame, ignore_index=True)

confusion = pd.crosstab(dev_predictions['y_true'], dev_predictions['y_pred'], rownames=['True'], 
                        colnames=['Predicted'], margins=True)
print(confusion)

accuracy = (confusion[0][0] + confusion[1][1]) / confusion['All']['All']
print(accuracy)
precision = confusion[1][1] / confusion['All'][1]
print(precision)
recall = confusion[1][1] / confusion[1]['All']
print(recall)
f1 = 2 * (precision * recall) / (precision + recall)
print(f1)

Sequential(
  (0): Linear(in_features=10, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=10, bias=True)
  (5): ReLU()
  (6): Linear(in_features=10, out_features=1, bias=True)
  (7): Sigmoid()
)
Predicted     0   All
True                 
0          4970  4970
1           763   763
All        5733  5733


KeyError: 1