In [1]:
# Keras in detail
# http://www.100byte.ru/python/factors/factors.html

# https://github.com/sismetanin/sentiment-analysis-of-tweets-in-russian/blob/master/Sentiment%20Analysis%20of%20Tweets%20in%20Russian%20using%20Convolutional%20Neural%20Networks.ipynb
# https://habr.com/ru/company/mailru/blog/417767/ cnn example
# https://realpython.com/sentiment-analysis-python/ for some information

In [2]:
# https://www.kaggle.com/wordcards/stock-market-tweets-wordcloud

In [1]:
# %load _header_import.py
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 20)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

import os
for dirname, _, filenames in os.walk('df_tweets_wordcloud/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

import warnings
warnings.simplefilter('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from pprint import pprint
from datetime import datetime
import collections
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud
from collections import Counter        

# for autoreload modules
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to /home/sergey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sergey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sergey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# %load _load_target_data.py
# Folder with the labelled datasets (absolute, machine-specific path).
path_data = '/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict_stock_quotes/data/'

# Original dataset; every column is read as a string.
file_old = path_data + '21K-predict.csv'
df_old = pd.read_csv(file_old, dtype=str)
print(df_old.shape)
# (21129, 8)

# New dataset (Excel workbook), also read as strings.
file_new = path_data + 'data-2021-06-10/trainingset _1_.xlsx'
df_new = pd.read_excel(file_new, dtype=str)
print(df_new.shape)
# (29006, 6)

# Keep only the title and its raw sentiment code, under common column names.
df_old_sub = df_old[['title', 'Unnamed: 2']].rename(
    columns={'title': 'text', 'Unnamed: 2': 'SENTIMENT'}
)
df_new_sub = df_new[['title', 'znak']].rename(
    columns={'title': 'text', 'znak': 'SENTIMENT'}
)

# Rows of the new dataset whose text does not occur in the original dataset.
mask = df_new_sub['text'].isin(df_old_sub['text'])
df_unique = df_new_sub.loc[~mask].copy()
print(len(df_unique))

# df_new_sub - dataset with the new data
# df_old_sub - the original dataset
# df_unique  - dataset with the selected new rows (text not present in df_old_sub)

(21129, 8)
(29006, 6)
8078


### Prepare data

In [3]:
# %load _prepare_sentiment_data.py
# Human-readable sentiment names.
_positive, _negative, _neutral = 'positive', 'negative', 'neutral'

sentiment_list = [_neutral, _positive, _negative]
# Raw codes found in the SENTIMENT column -> sentiment names.
replaced_dic = {'0': _neutral, '1': _positive, '2': _negative}

# --- df_old_sub: decode the raw codes, then keep only rows with a known sentiment
df_old_sub['sentiment'] = df_old_sub['SENTIMENT'].replace(replaced_dic)
print('df_old_sub.shape= ', len(df_old_sub))

mask = df_old_sub['sentiment'].isin(sentiment_list)
df_old_sub = df_old_sub.loc[mask].copy()
print('df_old_sub.shape with correct sentiment= ', len(df_old_sub))

# --- df_new_sub: same treatment
df_new_sub['sentiment'] = df_new_sub['SENTIMENT'].replace(replaced_dic)
print('df_new_sub.shape =', len(df_new_sub))

mask = df_new_sub['sentiment'].isin(sentiment_list)
df_new_sub = df_new_sub.loc[mask].copy()
print('df_new_sub.shape =', len(df_new_sub))

# --- df_unique: rebuilt from the filtered frames (new rows whose text is not in df_old_sub)
mask = df_new_sub['text'].isin(df_old_sub['text'])
df_unique = df_new_sub.loc[~mask].copy()

print('df_unique.shape with correct sentiment= ', len(df_unique))

# Matches http/https URLs. Written as a raw string so that the regex escapes \( and \)
# are not parsed as (invalid) Python string escapes; the pattern itself is unchanged.
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def url(phrase):
    """Return ``phrase`` with every substring matched by ``url_pattern`` removed.

    Only the matched URL is cut out; the surrounding characters (including
    whitespace on either side) are left untouched.
    """
    return url_pattern.sub('', phrase)

def prepare_data(dt: pd.DataFrame):
    """Drop rows without text, strip URLs from the text and de-duplicate.

    The work is done on a copy, so the frame passed in is not modified.
    Prints the ``sentiment`` distribution of the remaining rows and returns them.
    """
    has_text = dt.text.notnull()
    cleaned = dt.loc[has_text].copy()

    cleaned['text'] = cleaned['text'].apply(url)

    # Duplicates are detected after URL removal; the first occurrence is kept.
    cleaned = cleaned.drop_duplicates(subset=['text'], keep='first')
    print(Counter(cleaned['sentiment']))

    return cleaned

# -- remove duplicate

print('df_old_sub.shape = ', df_old_sub.shape[0])
df_old_sub = prepare_data(df_old_sub)
print('df_old_sub.shape without duplicate = ', df_old_sub.shape[0])

print('df_unique.shape= ', df_unique.shape[0])
df_unique = prepare_data(df_unique)
print('df_unique.shape without duplicate = ', df_unique.shape[0])

df_old_sub.shape=  21129
df_old_sub.shape with correct sentiment=  20910
df_new_sub.shape = 29006
df_new_sub.shape = 28827
df_unique.shape with correct sentiment=  7988
df_old_sub.shape =  20910
Counter({'positive': 9305, 'neutral': 8452, 'negative': 2937})
df_old_sub.shape without duplicate =  20694
df_unique.shape=  7988
Counter({'neutral': 3545, 'positive': 3421, 'negative': 976})
df_unique.shape without duplicate =  7942


### Preprocess Text -------

In [4]:
from _arrange import arrange_text
# arrange_text creates a new, cleaned column `text2`!

[nltk_data] Downloading package stopwords to /home/sergey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sergey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sergey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
arrange_text(df_old_sub)

In [6]:
# df_old_sub - исходный dataset
# df_unique - dataset с отобранными новыми данными

In [7]:
# Replace `text` with the `text2` column and renumber the rows from 0.
df_old_sub = (
    df_old_sub
    .drop(columns='text')
    .rename(columns={'text2': 'text'})
    .reset_index(drop=True)
)

In [8]:
arrange_text(df_unique)

In [9]:
# Replace `text` with the `text2` column and renumber the rows from 0.
df_unique = (
    df_unique
    .drop(columns='text')
    .rename(columns={'text2': 'text'})
    .reset_index(drop=True)
)

In [10]:
df = df_old_sub 

### Quick view of preprocessed tweets

In [11]:
# %load _word_cloud.py

In [12]:
# NOTE(review): `stop` is not defined in any visible cell, so this cell raises NameError —
# presumably a deliberate way to halt "Run All" at this point; confirm.
stop

NameError: name 'stop' is not defined

In [13]:
# Count how often each whitespace-separated token occurs across all texts.
word_dic = collections.defaultdict(int)
for token in (word for text in df['text'] for word in text.split()):
    word_dic[token] += 1

# One row per token, most frequent first.
word_df = pd.DataFrame.from_dict(word_dic, orient='index')
word_df = word_df.rename(columns={0: 'count'})
word_df = word_df.sort_values('count', ascending=False)

# Tokens whose count is at or above the 75th percentile of all counts.
q = word_df['count'].quantile(0.75)
mask = word_df['count'] >= q

COUNT_WORDS_DEFAULT = len(word_df[mask])
# The data-driven value above is immediately replaced by a fixed number.
COUNT_WORDS_DEFAULT = 1000


#### Particularly noteworthy are the words in small fonts, such as "volatility", "risk", "short interest", "covid",...

### ------------------------------ SPLIT TRAIN / TEST ------------------------------------

In [None]:
# df_old_sub - исходный dataset
# df_unique - dataset с отобранными новыми данными

In [None]:
#### --------------- constants for CNN 

In [14]:
research_sentiment = 'multiclass'

# Each setting below is picked by position from its tuple of alternatives.

# Number of units in the final Dense layer.
last_dence = (1, 3)[-1]
print('last_dence - ', last_dence)

# Loss passed to model.compile().
loss_func = (
    'binary_crossentropy',
    'sparse_categorical_crossentropy',
    'categorical_crossentropy',
)[1]
print('loss_func - ', loss_func)

# Activation applied after the final Dense layer.
last_activation = ('sigmoid', 'softmax')[-1]
print('last_activation - ', last_activation)

# Epoch counts for the first and the second training stage.
epochs1, epochs2 = 10, 100

split_only_df = True

last_dence -  3
loss_func -  sparse_categorical_crossentropy
last_activation -  softmax


In [15]:
# Integer class ids: positive -> 0, negative -> 1, neutral -> 2.
replaced_finbert = {_positive: 0, _negative: 1, _neutral: 2}

# Add the integer `label` column to both datasets.
for frame in (df_old_sub, df_unique):
    frame['label'] = frame['sentiment'].replace(replaced_finbert)

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the original dataset for testing (fixed random_state).
dt_train, dt_test = train_test_split(df_old_sub, test_size=0.2, random_state=2)

# The new unique rows are added to the training part only.
dt_train = pd.concat([dt_train, df_unique])#,df_new_body - experiment with body

# Weight of a class = number of training rows / number of rows with that label.
n_train = len(dt_train)
label_counts = Counter(dt_train['label'])
class_weight = {label: n_train / count for label, count in label_counts.items()}

x_train, y_train = dt_train['text'].values, dt_train['label'].values
x_test, y_test = dt_test['text'].values, dt_test['label'].values


In [17]:
print('class_weight - ', class_weight)
print(Counter(y_train))
print(Counter(dt_train['sentiment']))

print(Counter(y_test))
print(Counter(dt_test['sentiment']))

class_weight -  {1: 7.4391132705739444, 0: 2.2563323201621075, 2: 2.3675461486421185}
Counter({0: 10857, 2: 10347, 1: 3293})
Counter({'positive': 10857, 'neutral': 10347, 'negative': 3293})
Counter({0: 1869, 2: 1650, 1: 620})
Counter({'positive': 1869, 'neutral': 1650, 'negative': 620})


In [None]:
#### SPLIT TEACH TEST by body

In [None]:
# df_new_body = df_new[['body', 'znak']].copy()
# print(df_new_body.shape[0])

# df_new_body.columns = ['text', 'SENTIMENT']
# mask = df_new_body.text.isnull()

# df_new_body = df_new_body[~mask]
# print(df_new_body.shape[0])

# df_new_body['sentiment'] = df_new_body['SENTIMENT'].replace(replaced_dic)
# mask = df_new_body.sentiment.isin(sentiment_list)

# df_new_body = df_new_body[mask].copy()
# print(df_new_body.shape[0])

# # -- remove duplicate
# df_new_body = prepare_data(df_new_body)
# print(df_new_body.shape[0])

# arrange_text(df_new_body)

# df_new_body.drop(columns='text', inplace=True)
# df_new_body.rename(columns={'text2':'text'},inplace=True)
# df_new_body.reset_index(drop=True, inplace=True)

# replaced_finbert = {_positive:0, _negative:1,_neutral:2}
# df_new_body['label'] = df_new_body['sentiment'].replace(replaced_finbert)

# dt_train, dt_test = train_test_split(df_new_body, test_size=0.2, random_state=2)


# n_train = dt_train.shape[0]
# class_weight = { k:n_train/v for k,v in Counter(dt_train['label']).items() }

# x_train = dt_train['text'].values
# x_test = dt_test['text'].values

# y_train = dt_train['label'].values
# y_test = dt_test['label'].values

### 2. Defining metrics¶
#### The F-measure, precision, and recall metrics were removed in Keras 2.0, so the following code was taken from the history of the repo.

In [18]:
from keras import backend as K
from keras import metrics

from _keras_metrics import precision, recall, f1, matthews_correlation
acc = metrics.Accuracy()


### 3. Preparing weights for the embedding layer
#### I used Word2Vec embeddings, which were obtained at the previous step. It's a computationally efficient model for learning word embeddings developed
#### by Google. The detailed guide to preparing the embedding layer is available at https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [19]:
# Longest text in df_old_sub, measured in space-separated tokens.
max_sentence_len = max(len(text.split(' ')) for text in df_old_sub['text'])
max_sentence_len

43

In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# SENTENCE_LENGTH =  max_sentence_len
SENTENCE_LENGTH = 43
# NUM = 100000
# NUM = COUNT_WORDS_DEFAULT
NUM = 1500

def get_sequences(tokenizer, x):
    """Encode texts as integer sequences of a fixed length.

    ``x`` is passed to ``tokenizer.texts_to_sequences`` and the result is handed
    to ``pad_sequences`` with ``maxlen=SENTENCE_LENGTH``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(x), maxlen=SENTENCE_LENGTH)

tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(x_train)

x_train_seq = get_sequences(tokenizer, x_train)
x_test_seq = get_sequences(tokenizer, x_test)

In [None]:
# x_test_seq

In [22]:
from datetime import date
# Today's date as a YYYY-MM-DD string.
today = date.today().strftime("%Y-%m-%d")
today

'2021-06-15'

In [23]:
model_tokenizer_path = 'models/cnn_for_prod/tokenizer_model_{}'.format(today)
print(model_tokenizer_path)

models/cnn_for_prod/tokenizer_model_2021-06-15


In [24]:
import joblib
joblib.dump(tokenizer,model_tokenizer_path)

tokenizer2 = joblib.load(model_tokenizer_path)

x_test_seq2 = get_sequences(tokenizer2, x_test)
x_test_seq2

array([[  0,   0,   0, ...,   4,  26, 119],
       [  0,   0,   0, ...,  14,  64,  35],
       [  0,   0,   0, ...,  54, 446, 491],
       ...,
       [  0,   0,   0, ..., 582, 342, 449],
       [  0,   0,   0, ...,   9,   9,  26],
       [  0,   0,   0, ...,   9,   9,  26]], dtype=int32)

### Create model for Word2vec

In [25]:
model_Word2Vec_path = 'models/cnn_for_prod/word2vec_{}'.format(today)
print(model_Word2Vec_path)

tweets_path = 'models/cnn_for_prod/tweets.txt'
print(tweets_path)

models/cnn_for_prod/word2vec_2021-06-15
models/cnn_for_prod/tweets.txt


In [26]:
# Write the preprocessed training texts to the corpus file, one text per line.
with open(tweets_path, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(
        row + '\n' for row in dt_train['text'].astype('str').values
    )

In [27]:
import logging
import multiprocessing
import gensim
from gensim.models import Word2Vec

# Show INFO-level log records in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Read the file with the preprocessed tweets
data = gensim.models.word2vec.LineSentence(tweets_path)

# Train the model
# `size` is the dimensionality of the word vectors (the trailing `#200` looks like an earlier setting).
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it `vector_size`) — confirm the installed version.
# NOTE(review): with more than one worker the result is presumably not fully reproducible despite `seed` — confirm.
size = 20#200
model = Word2Vec(data, size=size, window=5, min_count=3, workers=multiprocessing.cpu_count(), seed=123)

# Persist the trained model to disk.
model.save(model_Word2Vec_path)

2021-06-15 11:43:25,158 : INFO : collecting all words and their counts
2021-06-15 11:43:25,158 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-06-15 11:43:25,188 : INFO : PROGRESS: at sentence #10000, processed 97440 words, keeping 14237 word types
2021-06-15 11:43:25,214 : INFO : PROGRESS: at sentence #20000, processed 197500 words, keeping 21044 word types
2021-06-15 11:43:25,228 : INFO : collected 23715 word types from a corpus of 241862 raw words and 24497 sentences
2021-06-15 11:43:25,228 : INFO : Loading a fresh vocabulary
2021-06-15 11:43:25,239 : INFO : effective_min_count=3 retains 8833 unique words (37% of original 23715, drops 14882)
2021-06-15 11:43:25,240 : INFO : effective_min_count=3 leaves 223236 word corpus (92% of original 241862, drops 18626)
2021-06-15 11:43:25,254 : INFO : deleting the raw counts dictionary of 23715 items
2021-06-15 11:43:25,254 : INFO : sample=0.001 downsamples 52 most-common words
2021-06-15 11:43:25,255 : INFO : d

### Load Word2Vec model

In [28]:
# Load the trained Word2Vec model; its vector size becomes the embedding width
w2v_model = Word2Vec.load(model_Word2Vec_path)
DIM = w2v_model.vector_size

# Initialise the embedding-layer matrix with zeros
embedding_matrix = np.zeros(shape=(NUM, DIM))

# Copy the Word2Vec vector of every tokenizer word with index below NUM into its row;
# words that Word2Vec does not know keep their zero row.
known_words = w2v_model.wv.vocab
for word, i in tokenizer.word_index.items():
    if i >= NUM:
        # stop at the first index that does not fit into the matrix
        break
    if word not in known_words:
        continue
    embedding_matrix[i] = w2v_model.wv[word]

2021-06-15 11:43:33,604 : INFO : loading Word2Vec object from models/cnn_for_prod/word2vec_2021-06-15
2021-06-15 11:43:33,697 : INFO : loading wv recursively from models/cnn_for_prod/word2vec_2021-06-15.wv.* with mmap=None
2021-06-15 11:43:33,698 : INFO : setting ignored attribute vectors_norm to None
2021-06-15 11:43:33,699 : INFO : loading vocabulary recursively from models/cnn_for_prod/word2vec_2021-06-15.vocabulary.* with mmap=None
2021-06-15 11:43:33,699 : INFO : loading trainables recursively from models/cnn_for_prod/word2vec_2021-06-15.trainables.* with mmap=None
2021-06-15 11:43:33,699 : INFO : setting ignored attribute cum_table to None
2021-06-15 11:43:33,700 : INFO : loaded models/cnn_for_prod/word2vec_2021-06-15


### 4. Building the CNN¶

In [29]:
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D
from keras.layers import concatenate

tweet_input = Input(shape=(SENTENCE_LENGTH,), dtype='int32')
tweet_encoder = Embedding(NUM, DIM, input_length=SENTENCE_LENGTH, weights=[embedding_matrix], trainable=False)(tweet_input)

В разработанной архитектуре использованы фильтры с высотой h=(2, 3, 4, 5), которые предназначены для параллельной обработки биграмм, триграмм, 4-грамм и 5-грамм соответственно. 
Добавил в нейронную сеть по 10 свёрточных слоев для каждой высоты фильтра, функция активации — ReLU. С рекомендациями по поиску оптимальной высоты и количества фильтров можно ознакомиться в работе [2].

После обработки слоями свертки, карты признаков поступали на слои субдискретизации, где к ним применялась операция 1-max-pooling, тем самым извлекая наиболее значимые n-граммы из текста. 
На следующем этапе происходило объединение в общий вектор признаков (слой объединения), который подавался в скрытый полносвязный слой с 30 нейронами.
На последнем этапе итоговая карта признаков подавалась на выходной слой нейронной сети с сигмоидальной функцией активации.

Поскольку нейронные сети склонны к переобучению, после embedding-слоя и перед скрытым полносвязным слоем я добавил dropout-регуляризацию c вероятностью выброса вершины p=0.2.

In [30]:
from keras.layers import Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras.utils import plot_model

# Parallel convolution branches, all fed by the embedded input after dropout.
branches = []
x = Dropout(0.2)(tweet_encoder)

# For every kernel size 2..7, create `filters_count` separate single-filter Conv1D layers,
# each followed by global max pooling.
# NOTE(review): the loop variable `size` rebinds the module-level `size` set in the Word2Vec cell,
# and `i` stays bound after the loop (a later checkpoint cell formats `i` into its file name).
for size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10), (6,10), (7,10)]: 
    for i in range(filters_count):
        # Add a convolution layer
        branch = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(x)
        branch = GlobalMaxPooling1D()(branch)
        branches.append(branch)

# Concatenate all pooled branches into one feature vector, then classify.
x = concatenate(branches, axis=1)
x = Dropout(0.2)(x)
x = Dense(30, activation='relu')(x)
x = Dense(last_dence)(x)
output = Activation(last_activation)(x) 


# NOTE: this rebinds `model`, which previously held the Word2Vec model.
model = Model(inputs=[tweet_input], outputs=[output])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[precision, recall, f1, matthews_correlation])
model.compile(loss=loss_func, optimizer='adam', metrics=['accuracy']) # matthews_correlation adam
# `model.name` is interpolated into the checkpoint file names in the training cells.
model._name = research_sentiment
# model.summary()
# model.summary()

### 5. Training and evaluating the CNN

##### The dataset was divided into three parts: train dataset (60% of the entire dataset), validation dataset (20% of the entire dataset), and test dataset
##### (20% of the entire dataset). The loss function was minimized using the Adam optimizer with a learning rate of 0.001. The embedding layer, which was 
##### initialized with Word2Vec word embeddings, was frozen for the first 10 epochs. Then we train model from the previous step with best validation scores 
##### for additional 5 epochs with unfrozen embeddings and a learning rate of 0.0001. The best results in terms of F-measure was 77.67%.

На первом этапе обучения заморозил embedding-слой, все остальные слои обучались в течение 10 эпох:

Размер группы примеров, используемых для обучения: 32.
Размер валидационной выборки: 25%

In [31]:
y_train = K.constant(y_train)
y_test = K.constant(y_test)

In [32]:
class_weight

{1: 7.4391132705739444, 0: 2.2563323201621075, 2: 2.3675461486421185}

In [None]:
# class_weight
# {1: 7.145015105740181, 0: 2.2263313609467454, 2: 2.43384298735666}

In [None]:
# sample_weight = np.ones(shape=(len(y_train),))
# sample_weight[y_train == 1] = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

In [33]:
checkpoint = ModelCheckpoint("models/cnn/cnn-frozen-embed-{}".format(model.name) + "-{epoch:02d}-{val_accuracy:.4f}.hdf5",
                             monitor='val_accuracy', save_best_only=True, mode='auto', period=1)

# history = model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs1, validation_split=0.25, sample_weight=sample_weight, callbacks = [checkpoint])

history = model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs1, validation_split=0.25, class_weight=class_weight, callbacks = [checkpoint])





Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


###################################################

Затем выбрал модель с наивысшими показателями F-меры на валидационном наборе данных, т.е. модель, полученную на восьмой эпохе обучения (F1=0.7791). 

У модели разморозил embedding-слой, после чего запустил еще пять эпох обучения.

In [34]:
model.layers[1].trainable = True

adam = optimizers.Adam(lr=0.0001)
# rmsprop = optimizers.RMSprop(lr=0.001)


model.compile(loss=loss_func, optimizer=adam, metrics=['accuracy']) # matthews_correlation  precision, recall, f1, acc
# model.summary()

In [36]:
print(class_weight)
print('epoche2', epochs2)

{1: 7.4391132705739444, 0: 2.2563323201621075, 2: 2.3675461486421185}
epoche2 100


In [37]:
# Training run with `epochs2` epochs; the best model by validation accuracy is checkpointed.
# NOTE(review): `i` is not assigned in this cell (the loop below is commented out), so the
# checkpoint file name uses whatever value an earlier loop left in `i` — consider setting it explicitly.
# for i in range(5):
checkpoint = ModelCheckpoint("models/cnn/loop-{}-cnn-frozen-embed-{}".format(i, model.name) + "-{epoch:02d}-{val_accuracy:.4f}.hdf5", 
                             monitor='val_accuracy', save_best_only=True, mode='auto', period=1)

# history = model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs2, validation_split=0.25, shuffle=True, 
#                     sample_weight=sample_weight, callbacks = [checkpoint])  
# 25% of the training data is held out for validation; classes are weighted by `class_weight`.
history = model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs2, validation_split=0.25, class_weight=class_weight, 
                    shuffle=True, callbacks = [checkpoint])





Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [38]:
def get_predict_cnn(x):
    """Turn a 3-element score vector into a class id using a margin rule.

    Returns 0 when ``x[0]`` exceeds ``x[1]`` by more than ``x[2]``, 1 when
    ``x[1]`` exceeds ``x[0]`` by more than ``x[2]``, and 2 otherwise.
    """
    first, second, threshold = x[0], x[1], x[2]
    if first - second > threshold:
        return 0
    if second - first > threshold:
        return 1
    return 2

In [39]:
predict_proba = model.predict(x_test_seq)
dt_test['predict_proba'] = predict_proba.tolist()
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: get_predict_cnn(x))
print('prediction like finbert')
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

prediction like finbert
              precision    recall  f1-score   support

         0.0    0.87418   0.78812   0.82893      1869
         1.0    0.71664   0.77097   0.74281       620
         2.0    0.77616   0.84061   0.80710      1650

    accuracy                        0.80647      4139
   macro avg    0.78900   0.79990   0.79295      4139
weighted avg    0.81151   0.80647   0.80733      4139



In [None]:
# prediction like finbert
#               precision    recall  f1-score   support

#          0.0    0.86520   0.77956   0.82015      1869
#          1.0    0.72457   0.74677   0.73550       620
#          2.0    0.76432   0.84121   0.80092      1650

#     accuracy                        0.79923      4139
#    macro avg    0.78470   0.78918   0.78553      4139
# weighted avg    0.80392   0.79923   0.79981      4139

In [40]:
predict_proba = model.predict(x_test_seq)
dt_test['predict_proba'] = predict_proba.tolist()
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: np.argmax(x,axis=0))
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

              precision    recall  f1-score   support

         0.0    0.85706   0.81808   0.83712      1869
         1.0    0.67175   0.78226   0.72280       620
         2.0    0.81629   0.80788   0.81206      1650

    accuracy                        0.80865      4139
   macro avg    0.78170   0.80274   0.79066      4139
weighted avg    0.81305   0.80865   0.81001      4139



In [None]:
#            precision    recall  f1-score   support

#          0.0    0.85166   0.80792   0.82921      1869
#          1.0    0.66855   0.76452   0.71332       620
#          2.0    0.80688   0.81030   0.80859      1650

#     accuracy                        0.80237      4139
#    macro avg    0.77570   0.79425   0.78371      4139
# weighted avg    0.80638   0.80237   0.80363      4139

### SAVE MODEL 

In [None]:
cnn_path = 'models/cnn_for_prod/keras_cnn_{}_model_{}.hdf5'.format(research_sentiment,today)
print(cnn_path)


In [None]:
# model.save(cnn_path) 

In [None]:
stop

### Model_load

In [None]:
from keras.models import load_model
model2 = load_model(cnn_path)

In [None]:
predict_proba = model2.predict(x_test_seq)
dt_test['predict_proba'] = predict_proba.tolist()
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: np.argmax(x,axis=0))
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

In [None]:
### Retrain -----------------

In [None]:
x_valid = df_unique['text'].values
y_valid = df_unique['label'].values

x_valid_seq = get_sequences(tokenizer2, x_valid)

y_valid = K.constant(y_valid)

In [None]:
# Class weights for retraining on df_unique:
# weight of a class = number of rows in df_unique / number of df_unique rows with that label.
# The numerator is n_unique (the size of df_unique itself), not the size of the training set.
n_unique = df_unique.shape[0]
class_weight_valid = { k:n_unique/v for k,v in Counter(df_unique['label']).items() }
class_weight_valid

In [None]:
model2_name = 'multiclass_v2'

###  Заморозить первый слой

In [None]:
# Mark the layer at index 1 as non-trainable, then fit the loaded model on the df_unique sequences.
# NOTE(review): `model2` is not re-compiled after `trainable` is changed; the Keras guidance is to
# call `compile()` again for such a change to be taken into account — confirm the freeze is applied.
model2.layers[1].trainable = False

# Keep only the checkpoints that improve validation accuracy.
checkpoint = ModelCheckpoint("models/cnn/cnn-frozen-embed-{}".format(model2_name) + "-{epoch:02d}-{val_accuracy:.4f}.hdf5",
                             monitor='val_accuracy', save_best_only=True, mode='auto', period=1)

history = model2.fit(x_valid_seq, y_valid, batch_size=32, epochs=epochs1, validation_split=0.25, class_weight=class_weight_valid, callbacks = [checkpoint])

In [None]:
### Разморозить первый слой

In [None]:
model2.layers[1].trainable = True

adam = optimizers.Adam(lr=0.0001)

model2.compile(loss=loss_func, optimizer=adam, metrics=['accuracy']) 

In [None]:
# `i` is only used as a tag in the checkpoint file name.
i = 2
epochs3 = 25

checkpoint = ModelCheckpoint("models/cnn/loop-{}-cnn-frozen-embed-{}".format(i, model2_name) + "-{epoch:02d}-{val_accuracy:.4f}.hdf5", 
                             monitor='val_accuracy', save_best_only=True, mode='auto', period=1)
 
# NOTE(review): this stage fits on x_train_seq / y_train but passes class_weight_valid, which is
# computed from df_unique, while the previous retrain stage fitted on x_valid_seq / y_valid —
# confirm which data was intended here.
history = model2.fit(x_train_seq, y_train, batch_size=32, epochs=epochs3, validation_split=0.25, class_weight=class_weight_valid, 
                    shuffle=True, callbacks = [checkpoint])

In [None]:
#          precision    recall  f1-score   support

#          0.0    0.85166   0.80792   0.82921      1869
#          1.0    0.66855   0.76452   0.71332       620
#          2.0    0.80688   0.81030   0.80859      1650

#     accuracy                        0.80237      4139
#    macro avg    0.77570   0.79425   0.78371      4139
# weighted avg    0.80638   0.80237   0.80363      4139

In [None]:
predict_proba = model2.predict(x_test_seq)
dt_test['predict_proba'] = predict_proba.tolist()
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: np.argmax(x,axis=0))
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

In [None]:
predict_proba = model.predict(x_test_seq)
dt_test['predict_proba'] = predict_proba.tolist()
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: get_predict_cnn(x))
print('prediction like finbert')
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

In [None]:
#### Remove code 

In [None]:
# Evaluate the model on the test sequences by calling it directly.
# The predictions are stored on `dt_test`, the test frame created by the split cell;
# the name `test` previously used here is not defined anywhere in the notebook.
predict_proba = model(x_test_seq).numpy()
dt_test['predict_proba'] = predict_proba.tolist()
# Predicted label = index of the largest score in each row.
dt_test['predict_label'] = dt_test.predict_proba.apply(lambda x: np.argmax(x,axis=0))
print(classification_report(y_test, dt_test['predict_label'] , digits=5))

In [None]:
path_data = '/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/data/' 
file_path = path_data + 'news_dump_predicted.csv'
news_dump_predicted = pd.read_csv(file_path,  dtype=str)

In [None]:
print(news_dump_predicted.shape[0])
news_dump_predicted.head(1)

In [None]:
news_dump_predicted['text2'] = news_dump_predicted['text2'].astype('str')
x_test2 = news_dump_predicted['text2'].values
x_test_seq2 = get_sequences(tokenizer, x_test2)

In [None]:
predict_proba_tensor = model(x_test_seq2)
predict_proba = predict_proba_tensor.numpy()

In [None]:
news_dump_predicted['proba_cnn'] = predict_proba.tolist()

In [None]:
news_dump_predicted['label_cnn'] = news_dump_predicted['proba_cnn'].apply(lambda x: np.argmax(x,axis=0))
news_dump_predicted['sentiment_cnn'] =news_dump_predicted['label_cnn'].replace(replaced_label_finbert)

In [None]:
news_dump_predicted.drop(columns=['proba_cnn'],inplace=True)

In [None]:
news_dump_predicted.head(2)

In [None]:
import csv
path_data = '/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/data/' 
file_path = path_data + 'news_dump_predicted.csv'
# news_dump_predicted.to_csv(file_path, index=False, quoting=csv.QUOTE_ALL)

In [None]:
# compare results models
print(classification_report(news_dump_predicted['label_sgd'].astype(int), news_dump_predicted['label_cnn'] , digits=5))

In [None]:
path_data = '/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/data/' 
file_path = path_data + 'news_dump_predicted.csv'
news_dump_predicted = pd.read_csv(file_path,  dtype=str)

In [None]:
news_dump_predicted.text2 = np.where(news_dump_predicted.text2.isnull(), ' ', news_dump_predicted.text2)

In [None]:
news_dump_predicted.text2 = news_dump_predicted.text2.apply(lambda x: x[:500])      

In [None]:
col_name = '\ttext\tlabel'
news_dump_predicted[col_name] = news_dump_predicted.index.astype('str') + '\t' + news_dump_predicted.text2 + '\t' + 'neutral'
news_dump_predicted[col_name] = news_dump_predicted[col_name].astype('str')

In [None]:
import csv
path_data_test = '/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/data/' 
file_path = path_data_test + 'test_news_for_finbert.csv'
news_dump_predicted[[col_name]].to_csv(file_path,index=False, quoting=csv.QUOTE_NONE)

In [None]:
# name_best_sentiment_model = 'keras_cnn_' + research_sentiment+ '_model'
# model.save('models/tmp/' + name_best_sentiment_model) 

In [None]:
name_best_sentiment_model

In [None]:
from keras.models import load_model

model2 = load_model('models/tmp/' + name_best_sentiment_model)

In [None]:
predict_proba = model2.predict(x_test_seq)
test['predict_proba'] = predict_proba.tolist()
test['predict_label'] = test.predict_proba.apply(lambda x: np.argmax(x,axis=0))
print(classification_report(y_test, test['predict_label'] , digits=5))

In [None]:
stop

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

print('Research sentiment {}'.format(research_sentiment))

# threshhold = 50
# threshhold = 100 - Counter(train['label'])[1] / len( train['label']) * 100
predict_proba = model.predict(x_test_seq)

# predicted = np.where(predict_proba > np.percentile(predict_proba, threshhold), 1, 0)
print(classification_report(y_test, predicted, digits=5))

mcc = matthews_corrcoef(y_test, predicted)
print('matthews_corrcoef =  {:04.4f}'.format(mcc))

In [None]:
stop

In [None]:
def cross_val_check(count_segmentations=5):
    """Train the CNN sentiment model several times and average its test metric.

    Every run builds a fresh network on top of ``embedding_matrix``, trains it with
    the embedding layer frozen, then marks ``model.layers[1]`` trainable, recompiles
    with Adam (lr=0.0001) and trains again. After each run the model is evaluated on
    ``x_test_seq`` / ``y_test`` and the second value returned by ``evaluate`` (the
    compiled ``matthews_correlation`` metric) is collected.

    The function reads these notebook-level names: ``SENTENCE_LENGTH``, ``NUM``,
    ``DIM``, ``embedding_matrix``, ``last_dence``, ``last_activation``, ``loss_func``,
    ``research_sentiment``, ``class_weight``, ``epochs1``, ``x_train_seq``,
    ``y_train``, ``x_test_seq``, ``y_test`` and ``matthews_correlation``.

    Args:
        count_segmentations: number of independent build/train/evaluate runs.

    Returns:
        The mean of the per-run metric values (it is also printed).

    Raises:
        ValueError: if ``count_segmentations`` is smaller than 1 (there would be
            nothing to average).
    """
    if count_segmentations < 1:
        raise ValueError('count_segmentations must be at least 1, got {}'.format(count_segmentations))

    evaluate_results = []
    for run_idx in range(count_segmentations):
        tweet_input = Input(shape=(SENTENCE_LENGTH,), dtype='int32')
        tweet_encoder = Embedding(NUM, DIM, input_length=SENTENCE_LENGTH, weights=[embedding_matrix], trainable=False)(tweet_input)

        branches = []
        x = Dropout(0.2)(tweet_encoder)

        for size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10), (6, 10), (7, 10)]:
            for _ in range(filters_count):
                # Add a single-filter convolution layer followed by global max pooling.
                branch = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(x)
                branch = GlobalMaxPooling1D()(branch)
                branches.append(branch)

        x = concatenate(branches, axis=1)
        x = Dropout(0.2)(x)
        x = Dense(30, activation='relu')(x)
        x = Dense(last_dence)(x)
        output = Activation(last_activation)(x)

        model = Model(inputs=[tweet_input], outputs=[output])
        model.compile(loss=loss_func, optimizer='adam', metrics=[matthews_correlation])
        model._name = research_sentiment

        # Phase 1: train with the embedding frozen (trainable=False above).
        checkpoint = ModelCheckpoint("models/cnn/cnn-frozen-embed-{}".format(model.name) + "-{epoch:02d}-{val_matthews_correlation:.4f}.hdf5",
                                     monitor='val_matthews_correlation', save_best_only=True, mode='max', period=1)

        model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs1, validation_split=0.25, class_weight=class_weight,
                  callbacks=[checkpoint], verbose=0)

        # Phase 2: unfreeze layers[1] (presumably the Embedding layer — confirm) and
        # continue training with a smaller learning rate.
        model.layers[1].trainable = True
        adam = optimizers.Adam(lr=0.0001)
        model.compile(loss=loss_func, optimizer=adam, metrics=[matthews_correlation])

        # The file name carries the index of the current run. It used to be formatted
        # with the leftover inner-loop variable `i`, which is always 9 here, so every
        # run wrote checkpoints under the same "loop-9" prefix.
        checkpoint = ModelCheckpoint("models/cnn/loop-{}-cnn-frozen-embed-{}".format(run_idx, model.name) + "-{epoch:02d}-{val_matthews_correlation:.4f}.hdf5",
                                     monitor='val_matthews_correlation', save_best_only=True, mode='max', period=1)

        # NOTE(review): this phase also trains for `epochs1` epochs; an earlier,
        # commented-out variant used `epochs2` — confirm which one is intended.
        model.fit(x_train_seq, y_train, batch_size=32, epochs=epochs1, validation_split=0.25, class_weight=class_weight, shuffle=True,
                  callbacks=[checkpoint], verbose=0)

        # evaluate() returns [loss, metric]; keep the metric only.
        result = model.evaluate(x_test_seq, y_test, verbose=0)
        evaluate_results.append(result[1])

    mean_result = sum(evaluate_results) / len(evaluate_results)

    print('==================================================')
    print('mean_matthews_correlation = {}'.format(mean_result))

    return mean_result

In [None]:
# NOTE: despite its plural name this holds the single mean metric value returned by
# cross_val_check(), not the per-run list.
evaluate_results = cross_val_check()

In [None]:
evaluate_results

In [None]:
# evaluate_results - 0.701 epoch2 - 10

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

print('Research sentiment {}'.format(research_sentiment))

# Percentile cut-off: 100 minus the percentage of label 1 in the training data
# (a fixed threshhold of 50 was an earlier alternative).
label_counts = Counter(train['label'])
threshhold = 100 - label_counts[1] / len(train['label']) * 100
predict_proba = model.predict(x_test_seq)

# Scores above the cut-off become class 1, all others class 0.
score_cutoff = np.percentile(predict_proba, threshhold)
predicted = np.where(predict_proba > score_cutoff, 1, 0)
print(classification_report(y_test, predicted, digits=5))

mcc = matthews_corrcoef(y_test, predicted)
print('matthews_corrcoef =  {:04.4f}'.format(mcc))

In [None]:
# Summary statistics of the first score column next to the true test labels.
data = dict(pred=predict_proba[:, 0], label=y_test.numpy())
df_data = pd.DataFrame(data)
df_data.describe()

In [None]:
# NOTE(review): `stop` is not assigned in any visible cell; presumably a deliberate
# NameError that halts "Run All" at this point — confirm.
stop

### Validation testing

### Load best model

In [None]:
model.name

In [None]:
# Persist the current model as models/keras_cnn_<model name>_model.hdf5.
name_best_sentiment_model = 'keras_cnn_{}_model.hdf5'.format(model.name)
model.save('models/' + name_best_sentiment_model)

##### Primary model, matthews_corrcoef = 0.5310 with positive label
##### Primary model, matthews_corrcoef = 0.2746 with negative label
##### Primary model, matthews_corrcoef = 0.5801 with neutral label

In [None]:
from keras.models import load_model

# Reload the model written by `model.save('models/' + name_best_sentiment_model)`.
# The directory has to match the one used when saving; this cell used to read from
# 'models/cnn/', where the visible cells only write epoch checkpoints under other names.
# The custom metric functions must be supplied so the saved training config can be restored.
model = load_model(
    'models/' + name_best_sentiment_model,
    custom_objects={'precision': precision, 'recall': recall, 'f1': f1,
                    'matthews_correlation': matthews_correlation},
)
# Recompile so that evaluate() reports all four metrics.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[precision, recall, f1, matthews_correlation])

In [None]:
# Score the test set with the current model and print per-class metrics.
predict_proba = model.predict(x_test_seq)
test['predict_proba'] = predict_proba.tolist()
# Predicted label = position of the largest score within each row of scores.
test['predict_label'] = [np.argmax(row_scores, axis=0) for row_scores in test['predict_proba']]
print(classification_report(y_test, test['predict_label'], digits=5))

In [None]:
# threshhold = 100 - Counter(train['label'])[1] / len( train['label']) * 100
# predicted = model.predict(x_test_seq)

# predicted = np.where(predicted > np.percentile(predicted, threshhold) , 1, 0)
# print(classification_report(y_test, predicted, digits=5))

# print(Counter(predicted[:,0]), '\n')
# print('matthews_corrcoef =  {:04.4f}'.format(matthews_corrcoef(y_test, predicted)))

### Keras set up hyperparameters - talos

In [None]:
# Drop the current model reference before the hyper-parameter experiments.
# NOTE(review): later cells read `model` again (e.g. as the default argument of
# reserch_model_hyperparams), so it has to be re-created before they run.
del model

### Hyperparameter research

In [None]:
import talos
from talos.utils import hidden_layers

In [None]:
# Fine-tuning
# Freeze all layers of the pre-trained model. Add your own layers on top of the trained model.
# Train the added layers. Unfreeze a few of the top layers. Train those layers together with the added part.

In [None]:
def sentiment_model(x_train, y_train, params, checkpoint):
    """Build, compile and fit one CNN sentiment classifier from a parameter grid.

    ``params`` maps each hyper-parameter name to a sequence of candidate values; one
    candidate is picked by position: index 1 for ``optimizer`` and ``batch_size``,
    index 0 for ``dropout``, ``first_neuron``, ``activation``, ``last_activation``,
    ``losses`` and ``epochs``. The network is attached to the notebook-level
    ``tweet_input`` / ``tweet_encoder`` tensors.

    Args:
        x_train: training inputs passed to ``fit``.
        y_train: training targets passed to ``fit``.
        params: hyper-parameter grid as described above.
        checkpoint: callback handed to ``fit``.

    Returns:
        Tuple ``(history, model)``: the object returned by ``fit`` and the model.
    """
    conv_input = Dropout(params['dropout'][0])(tweet_encoder)

    # Ten single-filter convolutions for each kernel size 2..7, each followed by
    # global max pooling.
    kernel_plan = [(2, 10), (3, 10), (4, 10), (5, 10), (6, 10), (7, 10)]
    pooled_branches = []
    for kernel_width, n_filters in kernel_plan:
        for _ in range(n_filters):
            conv_out = Conv1D(filters=1, kernel_size=kernel_width, padding='valid', activation='relu')(conv_input)
            pooled_branches.append(GlobalMaxPooling1D()(conv_out))

    merged = concatenate(pooled_branches, axis=1)
    merged = Dropout(params['dropout'][0])(merged)
    hidden = Dense(params['first_neuron'][0], activation=params['activation'][0])(merged)
    single_unit = Dense(1)(hidden)
    output = Activation(params['last_activation'][0])(single_unit)

    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss=params['losses'][0], optimizer=params['optimizer'][1], metrics=[matthews_correlation])

    # A quarter of the training data is held out for validation.
    fit_options = dict(
        validation_split=0.25,
        batch_size=params['batch_size'][1],
        epochs=params['epochs'][0],
        verbose=1,
        callbacks=[checkpoint],
    )
    history = model.fit(x_train, y_train, **fit_options)

    return history, model

In [None]:
# Hyper-parameter grid: each key maps to the candidate values to try.
# Keys marked "required" were flagged as such by the author (presumably required by
# talos.utils.hidden_layers — confirm).
p = dict(
    lr=[0.0001],
    activation=['relu', 'elu'],
    optimizer=['Nadam', 'Adam'],
    losses=['binary_crossentropy', 'logcosh'],
    shapes=['brick', 'long_funnel'],  # required
    first_neuron=[32, 64],            # required
    hidden_layers=[1, 2, 3],          # required
    dropout=[.2, .3],                 # required
    batch_size=[20, 30, 40],
    epochs=(10, 40, 10),              # tuple, unlike the lists above
    last_activation=['sigmoid'],
)

In [None]:
# p = {'lr': (0.8, 1.2, 3),
#      'first_neuron':[4, 8, 16, 32, 64],
#      'hidden_layers':[0, 1, 2],
#      'batch_size': (1, 5, 5),
#      'epochs': [50, 100, 150],
#      'dropout': (0, 0.2, 3),
#      'weight_regulizer':[None],
#      'emb_output_dims': [None],
#      'shape':['brick','long_funnel'],
#      'kernel_initializer': ['uniform','normal'],
#      'optimizer': [Adam, Nadam, RMSprop],
#      'losses': [binary_crossentropy],
#      'activation':[relu, elu],
#      'last_activation': [sigmoid]}

In [None]:
# Checkpoint callback: only epochs that improve the validation Matthews correlation
# (mode='max', save_best_only=True) are written to disk.
talos_checkpoint = ModelCheckpoint(
    "models/cnn/for-talos-cnn-frozen-embed-{}".format(research_sentiment) + "-{epoch:02d}-{val_matthews_correlation:.4f}.hdf5",
    monitor='val_matthews_correlation',
    save_best_only=True,
    mode='max',
    period=1,
)

# sentiment_model returns a (history, model) pair.
history_talos = sentiment_model(x_train_seq, y_train, p, talos_checkpoint)

In [None]:
model = history_talos[1]

In [None]:
# Percentile cut-off: 100 minus the percentage of label 1 in the training data.
label_counts = Counter(train['label'])
threshhold = 100 - label_counts[1] / len(train['label']) * 100

# Raw scores first, then binarized in place: above the cut-off -> 1, otherwise 0.
predicted = model.predict(x_test_seq)
score_cutoff = np.percentile(predicted, threshhold)
predicted = np.where(predicted > score_cutoff, 1, 0)
print(classification_report(y_test, predicted, digits=5))

# Distribution of the predicted classes, followed by the Matthews correlation.
print(Counter(predicted[:,0]), '\n')
print('matthews_corrcoef =  {:04.4f}'.format(matthews_corrcoef(y_test, predicted)))

In [None]:
# model.summary()

In [None]:
# plot_model(model)

In [None]:
# NOTE(review): this cell was an unrunnable scratch copy of part of sentiment_model().
# Its indentation was inconsistent (IndentationError), and it referenced `params` and
# `branches`, which the visible cells only create inside functions. It is kept commented
# out for reference; call sentiment_model(...) to build the model instead.
#
# x = Dropout(params['dropout'][0])(tweet_encoder)
#
# x = concatenate(branches, axis=1)
# x = Dropout(params['dropout'][0])(x)
# x = Dense(params['first_neuron'][0], activation=params['activation'][0])(x)
# x = Dense(1)(x)
#
# output = Activation(params['last_activation'][0])(x)
#
# model = Model(inputs=[tweet_input], outputs=[output])
#
# model.compile(loss=params['losses'][0], optimizer=params['optimizer'][1], metrics=[matthews_correlation])

In [None]:
# NOTE(review): incomplete attribute access — looks like an interrupted lookup of
# `model.add...`; it fails with AttributeError unless `model` has an `ad` attribute.
# Confirm and remove.
model.ad

In [None]:
def reserch_model_hyperparams(x_train, y_train, x_val, y_val, params, model=model):
    """Model function for the Talos scan: fine-tune ``model`` with one parameter set.

    Args:
        x_train: training inputs passed to ``fit``.
        y_train: training targets passed to ``fit``.
        x_val: accepted but not used; validation comes from ``validation_split=0.25``.
        y_val: accepted but not used (see ``x_val``).
        params: one hyper-parameter combination; this function reads ``'lr'``,
            ``'optimizer'``, ``'losses'``, ``'batch_size'`` and ``'epochs'`` and hands
            the whole dict to ``hidden_layers``.
        model: model to fine-tune. The default is bound to the notebook-level
            ``model`` at the moment this function is defined.

    Returns:
        Tuple ``(history, model)``: the object returned by ``fit`` and the model.
    """
    # NOTE(review): with the default argument the same model object is reused by every
    # call, so trained weights carry over between parameter combinations — confirm.
    model.layers[1].trainable = True

    # NOTE(review): presumably appends hidden layers to `model` according to `params`;
    # confirm that talos.utils.hidden_layers supports this kind of model.
    hidden_layers(model, params, 1)

    # Instantiate the optimizer class named in the grid with the grid's learning rate.
    # An Adam instance with this learning rate used to be created and then dropped,
    # while compile() received only the optimizer name — so 'lr' had no effect.
    optimizer = getattr(optimizers, params['optimizer'])(lr=params['lr'])

    model.compile(loss=params['losses'], optimizer=optimizer, metrics=[matthews_correlation])

    history = model.fit(x_train, y_train,
                        validation_split=0.25,
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        verbose=0,
                        )

    return history, model

In [None]:
# for i in  range(10):
#     checkpoint = ModelCheckpoint("models/cnn/talos_best_model-{}".format(i, model.name) + "-{epoch:02d}-{val_matthews_correlation:.4f}.hdf5", 
#                                  monitor='val_matthews_correlation', save_best_only=True, mode='max', period=1)
#     history = model.fit(x_train_seq, y_train, batch_size=, epochs=EPOCHS, validation_split=0.25, callbacks = [checkpoint])

In [None]:
#  ta_params = dict()  
#  'kernel_initializer': ['uniform','normal']}
#   optimizer - 'ftrl' works best with sparse data such as language input.

In [None]:
# Run the Talos scan over grid `p` with reserch_model_hyperparams as the model function.
# fraction_limit=0.05 presumably restricts the scan to 5% of the combinations — confirm.
talos_scan = talos.Scan(
    x=x_train_seq,
    y=y_train.numpy(),
    model=reserch_model_hyperparams,
    params=p,
    reduction_metric='val_matthews_correlation',
    experiment_name='neutral_sentiment',
    fraction_limit=0.05,
    seed=123,
)

In [None]:
talos_scan.data.sort_values(by='val_matthews_correlation', ascending=False).head(10)

In [None]:
talos_scan.details

In [None]:
# Evaluate the 10 best models of the scan with 10 shuffled folds.
# NOTE(review): the evaluation data passed here is the training set
# (x_train_seq / y_train), not a held-out set — confirm this is intended.
talos_scan.evaluate_models(x_val=x_train_seq,
                            y_val=y_train.numpy(),
                            n_models=10,
                            metric='matthews_correlation',
                            folds=10,
                            shuffle=True,
                            task='binary',
                            asc=False)

In [None]:
talos_scan.data.sort_values(by='eval_f1score_std', ascending=True).head(10)

In [None]:
# Pick the model ranked first by 'eval_f1score_std' with asc=True.
# NOTE(review): presumably this selects the smallest F1 standard deviation, i.e. the
# most stable model rather than the best-scoring one — confirm.
talos_best_model = talos_scan.best_model(metric='eval_f1score_std', asc=True)

In [None]:
# Percentile cut-off: 100 minus the percentage of label 1 in the training data.
label_counts = Counter(train['label'])
threshhold = 100 - label_counts[1] / len(train['label']) * 100

# Raw scores first, then binarized in place: above the cut-off -> 1, otherwise 0.
predicted = talos_best_model.predict(x_test_seq)
score_cutoff = np.percentile(predicted, threshhold)
predicted = np.where(predicted > score_cutoff, 1, 0)
print(classification_report(y_test, predicted, digits=5))

# Distribution of the predicted classes, followed by the Matthews correlation.
print(Counter(predicted[:,0]), '\n')
print('matthews_corrcoef =  {:04.4f}'.format(matthews_corrcoef(y_test, predicted)))

In [None]:
# Visualize results distribution
sns.jointplot(x="val_acc", y="val_categorical_crossentropy", data=df_results);
sns.jointplot(x="categorical_crossentropy", y="val_categorical_crossentropy", data=df_results);