<a href="https://colab.research.google.com/github/Quotermain/sber_stock_analysis/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

from google.colab import drive
drive.mount('/content/drive')

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 7.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.9MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 30.9MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
import datetime

import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (20,10)

import random

# One seed shared by every random number generator used in the notebook
# (Python's `random`, NumPy, and PyTorch on CPU and on all CUDA devices).
seed_val = 17
for set_seed in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    set_seed(seed_val)


In [None]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradient tracking.

    Each batch must be indexable as (input_ids, attention_mask, labels); every
    element is moved to the global ``device`` before the forward pass.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        losses divided by the number of batches, the per-batch logits
        concatenated along axis 0, and the per-batch labels concatenated along
        axis 0 (both as NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in raw_batch]
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        # Inference only: no autograd graph is needed here.
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

        # With labels supplied, the first two outputs are read as loss and logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (
        mean_loss,
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )
    
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions against ``labels``.

    ``preds`` holds one row of class scores per sample; the predicted class is
    the index of the largest score in each row.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, the correct/total counts.

    ``preds`` holds one row of class scores per sample (arg-max is the predicted
    class). Class ids are translated back to names through the global
    ``label_dict`` (name -> id).
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Keep only the samples whose true label is the current class; the
        # predictions selected this way include the wrong ones, the true
        # labels selected this way are all equal to `class_id`.
        belongs_to_class = actual == class_id
        class_predictions = predicted[belongs_to_class]
        n_total = len(actual[belongs_to_class])
        n_correct = len(class_predictions[class_predictions == class_id])

        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

# Loading prices

In [None]:
# Read the price file, keeping only date, time and closing price; the date and
# time columns are combined and parsed into a single datetime column.
data = pd.read_csv(
    'data/SBER.txt',
    usecols=['<DATE>', '<TIME>', '<CLOSE>'],
    parse_dates=[['<DATE>', '<TIME>']],
    dayfirst=True,
)
data.columns = ['datetime', 'close']
# Index by timestamp (re-assignment instead of an in-place mutation).
data = data.set_index('datetime')

data.head(1)

Unnamed: 0_level_0,close
datetime,Unnamed: 1_level_1
2009-01-11 10:30:00,23.01


###### Plots raw prices against time (per minute)

In [None]:
# Line chart of the last 200 closing prices, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(data['close'].iloc[-200:])
ax.set_title('Closing price of SBERBANK')
ax.set_ylabel('Closing price')
ax.set_xlabel('Date_time')
ax.grid(False)
plt.show()

###### Creates target: returns after 1 hour

In [None]:
# Forward-looking return: relative change between the close 60 rows ahead and
# the current close (the trailing 60-row change, shifted back by 60 rows).
# NOTE(review): 60 rows equal one hour only if the rows are contiguous
# one-minute bars — confirm how gaps (nights, weekends) should be treated.
lookahead = 60
trailing_change = data['close'].pct_change(periods=lookahead)
data['returns_per_hour'] = trailing_change.shift(-lookahead)
data.head(6)

Unnamed: 0_level_0,close,returns_per_hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-11 10:30:00,23.01,0.005215
2009-01-11 10:31:00,23.02,0.005647
2009-01-11 10:32:00,23.2,-0.002155
2009-01-11 10:33:00,23.22,-0.003876
2009-01-11 10:34:00,23.24,-0.003873
2009-01-11 10:35:00,23.14,0.000432


###### Plots histogram of returns after 1 hour and plots them against time

In [None]:
# Distribution of the forward returns, using 400 bins.
data.returns_per_hour.hist(bins=400)

In [None]:
# The same forward returns plotted against the datetime index.
data.returns_per_hour.plot()

###### Explores returns

In [None]:
# Average forward return over the whole price history.
mean_returns = data.returns_per_hour.mean()
mean_returns

0.00011119852638197501

In [None]:
# Standard deviation of the forward returns over the whole price history.
std_returns = data.returns_per_hour.std()
std_returns

0.007516514598578941

In [None]:
# Label each row by its forward return: above +threshold -> "up", below
# -threshold -> "down", everything else -> "nothing". Rows whose return is NaN
# satisfy neither comparison and therefore also fall into "nothing".
move_threshold = 0.0007
forward_returns = data['returns_per_hour']

conditions = [
    forward_returns > move_threshold,
    forward_returns < -move_threshold,
]
choices = ["up", "down"]

data['target'] = np.select(conditions, choices, default='nothing')

In [None]:
# Class balance of the labels over the full price history.
data.target.value_counts()

up         672272
down       659455
nothing    230470
Name: target, dtype: int64

# Working with news

#### Cleaning parsed news

In [None]:
# Raw parsed news; the cleaning step below uses its `text` and `datetime` columns.
news = pd.read_csv('data/full_news.csv')

news.head()

In [None]:
# Clean the text column: remove HTML tags, newlines, carriage returns, tabs,
# the news-source markers and runs of two or more spaces.
# NOTE(review): the dots in "© Reuters." and "Investing.com" are unescaped, so
# they match any character — confirm whether a literal "." was intended.
noise_pattern = re.compile(
    '<[^<]+?>|\n|\r|\t|© Reuters.|Investing.com — |  +'
)
news.text = news.text.apply(
    lambda raw_text: noise_pattern.sub('', raw_text)
)

# Keep only the first "<digits>.<digits>.<digits> <digits>:<digits>" match
# (date and time) found in the datetime column.
news.datetime = news.datetime.str.extract(
    r'(\d+\.\d+\.\d+ \d+:\d+)'
)

In [None]:
# Persist the cleaned news (without the row index) so the cleaning step does
# not have to be repeated.
news.to_csv(
    'data/full_news_cleaned.csv', 
    index=False
)

#### Loading cleaned news

In [None]:
# Reload the cleaned news with the `datetime` column as a parsed, day-first
# datetime index so it can be joined with the price index.
news = pd.read_csv(
    'data/full_news_cleaned.csv',
    dayfirst=True,
    parse_dates=True,
    index_col='datetime'
)

In [None]:
# Quick look at the cleaned, datetime-indexed news.
news.head()

# Merging prices and news

In [None]:
# Join prices and news on their datetime indexes. `merge` performs an inner
# join by default, so only timestamps present in both frames are kept.
# NOTE(review): if several news items share one timestamp, the resulting index
# is not unique, and the label-based `.loc[X_train]` / `.loc[X_val]` lookups
# used later would return every row with that timestamp, mixing train and
# validation rows — verify that the index is unique.
merged = data.merge(
    news, left_index=True, right_index=True
)

In [None]:
# Each row pairs a price observation (and its label) with a news item.
merged.head()

Unnamed: 0_level_0,close,returns_per_hour,target,title,text
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-09-06 11:08:00,82.13,-0.00621,down,Азиатские биржи в понедельник ушли в рост,Оптимистичные данные по занятости в США поднял...
2010-09-07 11:09:00,80.78,-0.002847,down,Азиатские фондовые биржи разделились по динамике,Во вторник азиатские фондовые биржи разделилис...
2010-09-08 10:39:00,79.68,0.013052,up,Азиатские биржи упали из-за сильной иены,В среду азиатские фондовые биржи упали из-за с...
2010-09-13 11:58:00,84.58,0.00402,up,Азиатские фондовые рынки растут на фоне оптими...,Азиатские фондовые рынки растут на фоне оптими...
2010-09-15 11:41:00,84.1,-0.00321,down,Азиатские фондовые биржи разделились по динами...,В среду азиатские фондовые биржи разделились п...


In [None]:
# Size of the merged frame and the number of rows per target label.
print('Merged data shape: ', merged.shape)
for label_name in ('up', 'down', 'nothing'):
    rows_with_label = merged[merged.target == label_name]
    print(f'Num of "{label_name}s": ', len(rows_with_label))

Merged data shape:  (52020, 5)
Num of "ups":  21960
Num of "downs":  22061
Num of "nothings":  7999


In [None]:
# Persist the merged frame (datetime index included); at this point the
# `target` column still holds the string labels.
merged.to_csv(
    'data/merged.csv'
)

# Splitting the merged data into train and validation sets

In [None]:
# Encode the string labels as integer class ids so they can be fed to a model.
label_dict = {'up': 0, 'down': 1, 'nothing': 2}

merged['target'] = merged['target'].replace(label_dict)
merged.head(1)

Unnamed: 0_level_0,close,returns_per_hour,target,title,text
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-09-06 11:08:00,82.13,-0.00621,1,Азиатские биржи в понедельник ушли в рост,Оптимистичные данные по занятости в США поднял...


In [None]:
# Stratified 85/15 split. Features are the index values (timestamps) of
# `merged`, targets are the encoded labels.
X_train, X_val, y_train, y_val = train_test_split(
    merged.index.values,
    merged.target.values,
    test_size=0.15,
    stratify=merged.target.values,
    random_state=1,
)

# Sanity check of the stratification: the val/train size ratio should be
# (almost) the same for every class.
for class_name, class_id in label_dict.items():
    n_val = len(y_val[y_val == class_id])
    n_train = len(y_train[y_train == class_id])
    print(f"Ratio of '{class_name}s' in val and train", n_val / n_train)

Ratio of 'ups' in val and train 0.17647058823529413
Ratio of 'downs' in val and train 0.17646117747440274
Ratio of 'nothings' in val and train 0.17649654360935432


In [None]:
# Write every part of the split to its own text file so the model section can
# reuse exactly the same split.
split_files = {
    'data/X_train.txt': X_train,
    'data/X_val.txt': X_val,
    'data/y_train.txt': y_train,
    'data/y_val.txt': y_val,
}
for split_path, split_values in split_files.items():
    np.savetxt(split_path, split_values, fmt='%s')

In [None]:
# Read the split back under separate names to verify the round trip.
# Timestamps are parsed as nanosecond datetimes; the label files are read with
# np.loadtxt's default dtype.
X_train1 = np.loadtxt('data/X_train.txt', dtype='<M8[ns]')
X_val1 = np.loadtxt('data/X_val.txt', dtype='<M8[ns]')
y_train1 = np.loadtxt('data/y_train.txt')
y_val1 = np.loadtxt('data/y_val.txt')

In [None]:
# Each line prints True when the reloaded array equals the in-memory one
# element by element.
roundtrip_pairs = (
    (X_train, X_train1),
    (X_val, X_val1),
    (y_train, y_train1),
    (y_val, y_val1),
)
for in_memory, reloaded in roundtrip_pairs:
    print(all(in_memory == reloaded))

# Building the model

In [None]:
# Reload the merged frame from disk with the first column as a parsed datetime
# index. NOTE: the file was written before the labels were encoded, so the
# `target` column holds the string labels again ('up' / 'down' / 'nothing').
merged = pd.read_csv(
    'data/merged.csv',
    parse_dates=True,
    index_col=0
)

merged.head(1)

Unnamed: 0_level_0,close,returns_per_hour,target,title,text
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-09-06 11:08:00,82.13,-0.00621,down,Азиатские биржи в понедельник ушли в рост,Оптимистичные данные по занятости в США поднял...


In [None]:
# Class balance of the reloaded frame.
merged.target.value_counts()

down       22061
up         21960
nothing     7999
Name: target, dtype: int64

In [None]:
# Restore the saved split: timestamps as nanosecond datetimes, labels with
# np.loadtxt's default dtype.
X_train = np.loadtxt('data/X_train.txt', dtype='<M8[ns]')
X_val = np.loadtxt('data/X_val.txt', dtype='<M8[ns]')
y_train = np.loadtxt('data/y_train.txt')
y_val = np.loadtxt('data/y_val.txt')

In [None]:
# Tag every row with the split it belongs to. Rows start as 'not_set'; the
# 'val' assignment runs last, so a timestamp present in both splits ends up
# tagged 'val'.
merged['data_type'] = 'not_set'

for split_name, split_index in (('train', X_train), ('val', X_val)):
    merged.loc[split_index, 'data_type'] = split_name

In [None]:
# NOTE(review): the sample rows shown above contain Russian text, while
# 'bert-base-uncased' is an English, lower-casing checkpoint — confirm this is
# the intended vocabulary (changing it would invalidate saved fine-tuned weights).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Both splits are encoded with identical settings: special tokens added, every
# sequence padded/truncated to 450 tokens, PyTorch tensors returned.
encoding_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=450,
    return_tensors='pt',
    truncation=True,
)

train_texts = merged.loc[X_train, 'text'].values
val_texts = merged.loc[X_val, 'text'].values

encoded_data_train = tokenizer.batch_encode_plus(train_texts, **encoding_options)
encoded_data_val = tokenizer.batch_encode_plus(val_texts, **encoding_options)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Token ids and attention masks come straight from the tokenizer output.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# `merged` is reloaded from CSV in this section, where `target` holds the
# string labels; torch.tensor cannot be built from an array of strings.
# Translate labels through `label_dict` first. Values that are already integer
# ids pass through unchanged, so the cell also works on an encoded frame.
encode_label = lambda value: label_dict.get(value, value)

labels_train = torch.tensor(
    merged.loc[X_train, 'target'].map(encode_label).astype('int64').values
)
labels_val = torch.tensor(
    merged.loc[X_val, 'target'].map(encode_label).astype('int64').values
)

In [None]:
# Bundle (input_ids, attention_mask, labels) per split; the training and
# evaluation loops read batch elements in exactly this order.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# target classes; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Continue from a previously saved fine-tuned checkpoint. The weights are
# deserialized onto the CPU and then copied into the model's parameters.
checkpoint_path = 'drive/MyDrive/models/finetuned_BERT_epoch_5.model'
checkpoint_state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [None]:
# Number of samples per batch for both training and validation.
batch_size = 16

# Training batches are drawn with a random sampler.
dataloader_train = DataLoader(
    dataset_train, 
    sampler=RandomSampler(dataset_train), 
    batch_size=batch_size
)

# Validation batches are drawn with a sequential sampler (fixed order).
dataloader_validation = DataLoader(
    dataset_val, 
    sampler=SequentialSampler(dataset_val), 
    batch_size=batch_size
)

In [None]:
# AdamW over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=1e-5, 
    eps=1e-8
)
                  
# Upper bound on training epochs; the training loop saves a checkpoint after
# every epoch.
epochs = 100

# Linear learning-rate schedule without warmup, spanning every optimizer step
# of the run (batches per epoch * epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs
)

In [None]:
# Fine-tuning loop: one pass over the training data per epoch, followed by a
# checkpoint save and an evaluation on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(
        dataloader_train,
        desc='Epoch {:1d}'.format(epoch),
        leave=False,
        disable=False
    )
    for batch in progress_bar:

        # Clear gradients accumulated by the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss for the batch.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It was previously divided by len(batch),
        # which is the number of tensors in the batch tuple (3), not the number
        # of samples, so the displayed value was off by a constant factor.
        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(loss.item())}
        )

    # Checkpoint after every epoch so any epoch can be restored later.
    torch.save(
        model.state_dict(),
        f'drive/MyDrive/models/finetuned_BERT_epoch_{epoch}.model'
    )

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=3758.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 0.1953293911115469
Validation loss: 0.47743884819833093
F1 Score (Weighted): 0.8801168409495067


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=3758.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 0.17777668235632935
Validation loss: 0.5546916382863333
F1 Score (Weighted): 0.8790837795010547


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=3758.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 0.16727517063055322
Validation loss: 0.6365506259117149
F1 Score (Weighted): 0.8707768340682781


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=3758.0, style=ProgressStyle(description_wid…

In [None]:
# Rebuild the classifier and restore one saved epoch for evaluation.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# NOTE(review): this reads from 'saved_models', while the training loop writes
# its checkpoints under 'drive/MyDrive/models' — confirm the intended directory.
checkpoint_path = 'drive/MyDrive/saved_models/finetuned_BERT_epoch_1.model'
checkpoint_state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)

# Per-class hit counts on the validation set; the average loss is not needed.
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: up
Accuracy: 469/752

Class: down
Accuracy: 255/753

Class: nothing
Accuracy: 8743/9066

