<a href="https://colab.research.google.com/github/Quotermain/sber_stock_analysis/blob/summarized/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

# Mount Google Drive at /content/drive; the data and model paths used in this
# notebook are given relative to 'drive/MyDrive/'.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
import datetime

import pickle
  
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10)

import random

# Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device)
# with one shared value.
seed_val = 17
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_val)


In [4]:
def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradients.

    Every batch is indexed as (input ids, attention mask, labels) and moved
    to the global ``device`` before being passed to the model.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the sum of the
        batch losses divided by the number of batches, the logits of all
        batches concatenated along axis 0, and the labels of all batches
        concatenated along axis 0 (both as NumPy arrays).
    """
    model.eval()

    batch_losses = []
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )

        # The model output is indexed positionally: loss first, logits second.
        batch_losses.append(outputs[0].item())
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)

    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals
    
def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max class of ``preds`` against ``labels``.

    ``preds`` is reduced with ``argmax`` over axis 1; both arrays are
    flattened before being handed to ``f1_score``.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, correct/total counts.

    Class names come from the global ``label_dict`` (name -> id), inverted
    here to id -> name. ``preds`` is reduced with ``argmax`` over axis 1.
    """
    id_to_name = dict(zip(label_dict.values(), label_dict.keys()))

    predicted = np.argmax(preds, axis=1).ravel()
    actual = labels.ravel()

    for class_id in np.unique(actual):
        # Keep only the samples whose true label is the current class:
        # the prediction slice still contains the wrong predictions,
        # the label slice contains only the true class.
        in_class = actual == class_id
        class_preds = predicted[in_class]
        n_correct = len(class_preds[class_preds == class_id])
        n_total = len(actual[in_class])

        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [7]:
"""
The time-based index is reset to plain numbering, since this is now just a
classification task. Unneeded columns are dropped as well.
"""

# Load the summarized texts; the first CSV column becomes the index.
merged = pd.read_csv('drive/MyDrive/data/summarized_texts.csv', index_col=0)

merged.head()

Unnamed: 0,target,text
0,1,Оптимистичные данные по занятости в США поднял...
1,1,Во вторник азиатские фондовые биржи разделилис...
2,0,В среду азиатские фондовые биржи упали из-за с...
3,0,Азиатские фондовые рынки растут на фоне оптими...
4,1,В среду азиатские фондовые биржи разделились п...


In [8]:
"""Encodes labels to nums to put those in a model"""
# Class name -> integer id used as the model label.
label_dict = dict(up=0, down=1, nothing=2)
label_dict

{'down': 1, 'nothing': 2, 'up': 0}

# Loading prices

In [None]:
# Read the date, time and close columns of the SBER quotes file; the date
# and time columns are parsed together into a single datetime column,
# which then becomes the index.
data = pd.read_csv(
    'drive/MyDrive/data/SBER.txt',
    dayfirst=True,
    parse_dates=[['<DATE>', '<TIME>']],
    usecols=['<DATE>', '<TIME>', '<CLOSE>'],
)
data.columns = ['datetime', 'close']
data = data.set_index('datetime')

data.head(1)

Unnamed: 0_level_0,close
datetime,Unnamed: 1_level_1
2009-01-11 10:30:00,23.01


###### Plots raw prices against time (per minute)

In [None]:
# Plot the last 200 closing prices.
fig, ax = plt.subplots()
ax.plot(data.close[-200:])
ax.set_title('Closing price of SBERBANK')
ax.set_ylabel('Closing price')
ax.set_xlabel('Date_time')
ax.grid(False)
plt.show()

###### Creates target: returns after 1 hour

In [None]:
# Forward return: relative change of the close 60 rows ahead of each row.
# NOTE(review): the horizon is counted in rows, so it equals one hour only
# where 60 consecutive rows are 60 consecutive minutes — confirm the series
# has no gaps (e.g. between trading sessions).
horizon = 60
data['returns_per_hour'] = (
    data['close'].pct_change(periods=horizon).shift(-horizon)
)
data.head(6)

Unnamed: 0_level_0,close,returns_per_hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-11 10:30:00,23.01,0.005215
2009-01-11 10:31:00,23.02,0.005647
2009-01-11 10:32:00,23.2,-0.002155
2009-01-11 10:33:00,23.22,-0.003876
2009-01-11 10:34:00,23.24,-0.003873
2009-01-11 10:35:00,23.14,0.000432


###### Plots a histogram of the 1-hour returns and plots them against time

In [None]:
# Distribution of the forward returns, 400 bins.
data['returns_per_hour'].hist(bins=400)

In [None]:
# Forward returns plotted against the datetime index.
data['returns_per_hour'].plot()

###### Explores returns

In [None]:
# Mean of the forward returns.
mean_returns = data['returns_per_hour'].mean()
mean_returns

0.00011119852638197501

In [None]:
# Standard deviation of the forward returns.
std_returns = data['returns_per_hour'].std()
std_returns

0.007516514598578941

In [None]:
# Label each row by its forward return: above +0.0007 is "up", below
# -0.0007 is "down", everything else is 'nothing'.
# NOTE(review): rows whose return is NaN satisfy neither condition and are
# therefore labelled 'nothing' as well.
choices = ["up", "down"]
conditions = [
    data['returns_per_hour'].gt(0.0007),
    data['returns_per_hour'].lt(-0.0007),
]

data['target'] = np.select(conditions, choices, default='nothing')

In [None]:
# Number of rows per target label.
data['target'].value_counts()

up         672272
down       659455
nothing    230470
Name: target, dtype: int64

# Working with news

#### Cleaning parsed news

In [None]:
# Load the parsed (uncleaned) news.
news = pd.read_csv(filepath_or_buffer='drive/MyDrive/data/full_news.csv')

news.head(5)

In [None]:
"""
Cleans the text column of HTML markup, the news source,
\n, \r, \t and excess spaces
"""

# Clean the news text in two passes:
#   1. drop HTML tags and the "© Reuters." / "Investing.com — " source marks.
#      The dots are escaped so they match a literal '.' only; previously a
#      bare '.' matched any character.
#   2. collapse every run of whitespace (newlines, carriage returns, tabs,
#      repeated spaces) into a single space. Deleting those characters
#      outright, as before, glued the neighbouring words together.
# The vectorised `.str` methods leave missing texts as NaN, whereas `re.sub`
# inside `apply` raises on a non-string value.
_MARKUP_PATTERN = r'<[^<]+?>|© Reuters\.?|Investing\.com — '
_WHITESPACE_PATTERN = r'\s+'

news['text'] = (
    news['text']
    .str.replace(_MARKUP_PATTERN, '', regex=True)
    .str.replace(_WHITESPACE_PATTERN, ' ', regex=True)
    .str.strip()
)

# Keep only the first "<digits>.<digits>.<digits> <digits>:<digits>" match of
# the datetime column; `expand=False` returns a Series rather than a
# one-column DataFrame.
news['datetime'] = news['datetime'].str.extract(
    r'(\d+\.\d+\.\d+ \d+:\d+)', expand=False
)

In [None]:
# Persist the cleaned news without writing the index.
news.to_csv('drive/MyDrive/data/full_news_cleaned.csv', index=False)

#### Loading cleaned news

In [None]:
# Reload the cleaned news with the 'datetime' column as a parsed,
# day-first index.
news = pd.read_csv(
    'drive/MyDrive/data/full_news_cleaned.csv',
    index_col='datetime',
    parse_dates=True,
    dayfirst=True,
)

In [None]:
# Preview the first rows of the cleaned news.
news.head(5)

# Merging prices and news

In [None]:
# Join prices and news on their indices; only index values present in both
# frames are kept (the default join type).
merged = pd.merge(data, news, left_index=True, right_index=True)

In [None]:
# Preview the first rows of the merged frame.
merged.head(5)

Unnamed: 0_level_0,close,returns_per_hour,target,title,text
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-09-06 11:08:00,82.13,-0.00621,down,Азиатские биржи в понедельник ушли в рост,Оптимистичные данные по занятости в США поднял...
2010-09-07 11:09:00,80.78,-0.002847,down,Азиатские фондовые биржи разделились по динамике,Во вторник азиатские фондовые биржи разделилис...
2010-09-08 10:39:00,79.68,0.013052,up,Азиатские биржи упали из-за сильной иены,В среду азиатские фондовые биржи упали из-за с...
2010-09-13 11:58:00,84.58,0.00402,up,Азиатские фондовые рынки растут на фоне оптими...,Азиатские фондовые рынки растут на фоне оптими...
2010-09-15 11:41:00,84.1,-0.00321,down,Азиатские фондовые биржи разделились по динами...,В среду азиатские фондовые биржи разделились п...


In [None]:
# Report the merged frame's shape and the number of rows per target label.
n_up = len(merged.loc[merged['target'] == 'up'])
n_down = len(merged.loc[merged['target'] == 'down'])
n_nothing = len(merged.loc[merged['target'] == 'nothing'])

print('Merged data shape: ', merged.shape)
print('Num of "ups": ', n_up)
print('Num of "downs": ', n_down)
print('Num of "nothings": ', n_nothing)

Merged data shape:  (52020, 5)
Num of "ups":  21960
Num of "downs":  22061
Num of "nothings":  7999


In [None]:
# Replace the label names in 'target' with their integer ids from label_dict.
encoded_target = merged['target'].replace(label_dict)
merged['target'] = encoded_target
merged.head(1)

In [None]:
# Persist the merged frame (index included).
merged.to_csv(path_or_buf='drive/MyDrive/data/merged.csv')

# Text summarization

In [None]:
pip install bert-extractive-summarizer

In [11]:
from summarizer import Summarizer

# Summarizer instance, called below as model(text, num_sentences=25).
# NOTE(review): the global name `model` is rebound later in this notebook to
# the BertForSequenceClassification instance that `evaluate` and the training
# loop read, so the cells depend on execution order.
model = Summarizer()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [22]:
# Replace every article with the summarizer's output, requesting 25 sentences.
merged['text'] = merged['text'].map(
    lambda article: model(article, num_sentences=25)
)

  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)
  model = self.__get_model(k).fit(self.features)


In [23]:
# Persist the summarized texts (index included).
merged.to_csv(path_or_buf='drive/MyDrive/data/summarized_texts.csv')

In [24]:
# Largest number of whitespace-separated words in any text.
merged['text'].str.split().map(len).max()

851

# Splitting the merged data into train and validation sets

In [25]:
# Stratified 90/10 split of the row index (X) and the target ids (y).
X_train, X_val, y_train, y_val = train_test_split(
    merged.index.values,
    merged.target.values,
    test_size=0.1,
    random_state=1,
    stratify=merged.target.values,
)

# Per class: number of validation rows divided by number of training rows.
for key, value in label_dict.items():
    n_val = np.count_nonzero(y_val == value)
    n_train = np.count_nonzero(y_train == value)
    print(f"Ratio of '{key}s' in val and train", n_val / n_train)

Ratio of 'ups' in val and train 0.1111111111111111
Ratio of 'downs' in val and train 0.11110551498363133
Ratio of 'nothings' in val and train 0.11112654535352132


In [26]:
# Write the four split arrays to text files.
for _path, _array in (
    ('drive/MyDrive/data/summarized_text/X_train.txt', X_train),
    ('drive/MyDrive/data/summarized_text/X_val.txt', X_val),
    ('drive/MyDrive/data/summarized_text/y_train.txt', y_train),
    ('drive/MyDrive/data/summarized_text/y_val.txt', y_val),
):
    np.savetxt(_path, _array)

In [27]:
# Read the four split arrays back under separate names for a round-trip check.
X_train1, X_val1, y_train1, y_val1 = (
    np.loadtxt(_path)
    for _path in (
        'drive/MyDrive/data/summarized_text/X_train.txt',
        'drive/MyDrive/data/summarized_text/X_val.txt',
        'drive/MyDrive/data/summarized_text/y_train.txt',
        'drive/MyDrive/data/summarized_text/y_val.txt',
    )
)

In [28]:
# Each in-memory array must equal its reloaded copy element by element.
for _saved, _reloaded in (
    (X_train, X_train1),
    (X_val, X_val1),
    (y_train, y_train1),
    (y_val, y_val1),
):
    print(all(_saved == _reloaded))

True
True
True
True


In [29]:
# The targets looked up through the reloaded index arrays must equal the
# reloaded label arrays.
for _index, _labels in ((X_train1, y_train1), (X_val1, y_val1)):
    print(all(merged.loc[_index, 'target'].values == _labels))

True
True


# Preparing the data for the model

In [30]:
# Number of rows per encoded target id.
merged['target'].value_counts()

1    22061
0    21960
2     7999
Name: target, dtype: int64

In [31]:
# Load the saved split arrays from disk.
X_train, X_val, y_train, y_val = (
    np.loadtxt(_path)
    for _path in (
        'drive/MyDrive/data/summarized_text/X_train.txt',
        'drive/MyDrive/data/summarized_text/X_val.txt',
        'drive/MyDrive/data/summarized_text/y_train.txt',
        'drive/MyDrive/data/summarized_text/y_val.txt',
    )
)

In [32]:
# Tag every row with the split it belongs to; rows in neither index array
# keep the 'not_set' placeholder.
merged['data_type'] = 'not_set'

for _split_name, _split_index in (('train', X_train), ('val', X_val)):
    merged.loc[_split_index, 'data_type'] = _split_name

In [52]:
# Tokenizer for the same checkpoint the classifier is initialised from.
# 'bert-base-multilingual-cased' is a cased checkpoint, so the input must
# not be lower-cased: `do_lower_case=True` changed the text before it was
# matched against the cased vocabulary.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False
)


def _encode_texts(texts, max_length=450):
    """Tokenize ``texts`` into PyTorch tensors of fixed length.

    Args:
        texts: iterable of strings to encode.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer's batch encoding, including the attention mask.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
        truncation=True
    )


encoded_data_train = _encode_texts(merged.loc[X_train, 'text'].values)
encoded_data_val = _encode_texts(merged.loc[X_val, 'text'].values)

In [53]:
# Pickle both encodings to Drive.
for _path, _encoded in (
    (
        'drive/MyDrive/data/summarized_text/encoded_data_train.pickle',
        encoded_data_train,
    ),
    (
        'drive/MyDrive/data/summarized_text/encoded_data_val.pickle',
        encoded_data_val,
    ),
):
    with open(_path, 'wb') as f:
        pickle.dump(_encoded, f)

# Building the model

In [9]:
# Restore the pickled encodings.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# files written by this notebook should be read here.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


encoded_data_train = _read_pickle(
    'drive/MyDrive/data/summarized_text/encoded_data_train.pickle'
)
encoded_data_val = _read_pickle(
    'drive/MyDrive/data/summarized_text/encoded_data_val.pickle'
)

In [10]:
# Load the saved label arrays.
y_train, y_val = (
    np.loadtxt(_path)
    for _path in (
        'drive/MyDrive/data/summarized_text/y_train.txt',
        'drive/MyDrive/data/summarized_text/y_val.txt',
    )
)

In [11]:
# Unpack the encodings and turn the label arrays into int64 tensors.
input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
)
input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)

labels_train = torch.tensor(y_train, dtype=torch.long)
labels_val = torch.tensor(y_val, dtype=torch.long)

In [12]:
# Bundle (input ids, attention mask, labels) per split.
dataset_train, dataset_val = (
    TensorDataset(_ids, _masks, _labels)
    for _ids, _masks, _labels in (
        (input_ids_train, attention_masks_train, labels_train),
        (input_ids_val, attention_masks_val, labels_val),
    )
)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [14]:
# REPLACED THE MODEL!
# Pretrained multilingual cased BERT with a classification head that has one
# output per entry of label_dict; attentions and hidden states are not returned.

model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

model.to(device)

# Disabled checkpoint restore, kept as a bare string; being the last
# expression of the cell, it is echoed as the cell's output.
'''model.load_state_dict(
    torch.load('drive/MyDrive/models/finetuned_BERT_epoch_3.model', 
    map_location=torch.device('cpu'))
)'''

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

"model.load_state_dict(\n    torch.load('drive/MyDrive/models/finetuned_BERT_epoch_3.model', \n    map_location=torch.device('cpu'))\n)"

In [24]:
# Training batches are drawn in random order, validation batches in
# dataset order; both loaders use the same batch size.
batch_size = 4

_train_sampler = RandomSampler(dataset_train)
_val_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(
    dataset_train, batch_size=batch_size, sampler=_train_sampler
)
dataloader_validation = DataLoader(
    dataset_val, batch_size=batch_size, sampler=_val_sampler
)

In [25]:
# AdamW over all model parameters, with a linear learning-rate schedule
# (no warm-up) spanning every optimisation step of the run.
# NOTE(review): `AdamW` here is the one imported from `transformers`; newer
# releases are said to favour `torch.optim.AdamW` — confirm against the
# installed version.
epochs = 100

optimizer = AdamW(model.parameters(), eps=1e-8, lr=1e-5)

_total_training_steps = len(dataloader_train)*epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=_total_training_steps,
    num_warmup_steps=0,
)

# Training loop


In [None]:
# Fine-tuning loop. Per epoch: one pass over dataloader_train with gradient
# clipping and one optimizer/scheduler step per batch, then the weights are
# saved to Drive and the validation loss and weighted F1 are reported.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(
        dataloader_train,
        desc='Epoch {:1d}'.format(epoch),
        leave=False,
        disable=False
    )
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as is. `batch` is the (input_ids,
        # attention_mask, labels) tuple, so the former division by
        # len(batch) always divided by 3 and made the displayed value
        # inconsistent with the epoch average computed below.
        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(batch_loss)}
        )

    # One checkpoint file is written per epoch.
    torch.save(
        model.state_dict(),
        f'drive/MyDrive/models/finetuned_BERT_epoch_{epoch}.model'
    )

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=11705.0, style=ProgressStyle(description_wi…