In [None]:
import nltk
import openpyxl
import pymorphy3
import json
import numpy as np
import pandas as pd
import datetime
import sklearn

In [None]:
def getDateOnly(date):
  """Return a plain ``datetime.date`` built from the year, month and day of ``date``."""
  year, month, day = date.year, date.month, date.day
  return datetime.date(year, month, day)
def getDateOnlyFromString(date_string):
  """Parse a ``%Y-%m-%d`` value (stringified first) and return it through ``getDateOnly``."""
  parsed = datetime.datetime.strptime(str(date_string), '%Y-%m-%d')
  return getDateOnly(parsed.date())
def getDateDiffs(dates):
  """Count how often each day gap occurs between consecutive entries of ``dates``.

  Args:
    dates: ordered collection (for example a pandas Series or a list) whose
      elements expose ``year``, ``month`` and ``day``.

  Returns:
    A pandas Series indexed by ``DATE_DIFF`` (gap in whole days) whose values
    are the number of consecutive pairs separated by that gap.
  """
  # Walk the values positionally. The earlier version threw away the result of
  # ``dates.values.tolist()`` and then read ``dates[i]``, which is a label
  # lookup on a Series and fails once the index is no longer 0..n-1.
  days = [getDateOnly(d) for d in dates]
  gaps = [(later - earlier).days for earlier, later in zip(days, days[1:])]
  df = pd.DataFrame(gaps, columns=['DATE_DIFF'])
  return df.groupby(['DATE_DIFF'])['DATE_DIFF'].count()

In [None]:
# Path of the market-data CSV that is read into `sber` with pd.read_csv.
market_path = 'market_data/sber_FULL.csv'

In [None]:
# Load the market data and convert every DATE entry to a datetime.date.
sber = pd.read_csv(market_path)
sber['DATE'] = sber['DATE'].apply(getDateOnlyFromString)
sber

Unnamed: 0,TICKER,DATE,OPEN,HIGH,MID,LOW,CLOSE,VOL
0,SBER,2000-04-03,1422.000000,1422.000000,1396.000000,1370.000000,1385.00,16938
1,SBER,2000-04-04,1390.000000,1546.000000,1460.500000,1375.000000,1521.00,84058
2,SBER,2000-04-05,1475.100000,1510.000000,1448.750000,1387.500000,1398.00,48853
3,SBER,2000-04-06,1410.000000,1462.990000,1436.495000,1410.000000,1425.01,23364
4,SBER,2000-04-07,1416.000000,1430.000000,1407.500000,1385.000000,1416.00,19315
...,...,...,...,...,...,...,...,...
8404,SBER,2023-04-07,214.500000,216.390000,215.020000,213.650000,216.27,29157060
8405,SBER,2023-04-08,215.366667,218.346667,216.491667,214.636667,218.25,47757683
8406,SBER,2023-04-09,216.233333,220.303333,217.963333,215.623333,220.23,66358306
8407,SBER,2023-04-10,217.100000,222.260000,219.435000,216.610000,222.21,84958930


In [None]:
# Load the target table; the displayed frame has a DATE column and a numeric Y column.
sber_Y = pd.read_csv('market_data/sber_Y.csv')
sber_Y

Unnamed: 0,DATE,Y
0,2000-04-04,0.046203
1,2000-04-05,-0.008045
2,2000-04-06,-0.008459
3,2000-04-07,-0.020185
4,2000-04-08,0.015986
...,...,...
8403,2023-04-07,-0.003268
8404,2023-04-08,0.006844
8405,2023-04-09,0.006798
8406,2023-04-10,0.006752


In [None]:
# Turn Y into a boolean label: True where the value is strictly positive.
sber_Y['Y'] = sber_Y['Y'] > 0

In [None]:
#sber_Y['Y'] = sber_Y['Y'].apply(lambda x: x*100)

In [None]:
# Feature table: remove the DATE and TICKER columns and the row labelled 0,
# then renumber the remaining rows from 0.
# NOTE(review): dropping the first row presumably lines the features up with
# sber_Y, whose first displayed DATE is one day later — confirm.
sber_no_Date = (
    sber
    .drop(columns=['DATE', 'TICKER'])
    .drop(index=[0])
    .reset_index(drop=True)
)
sber_no_Date

Unnamed: 0,OPEN,HIGH,MID,LOW,CLOSE,VOL
0,1390.000000,1546.000000,1460.500000,1375.000000,1521.000000,84058
1,1475.100000,1510.000000,1448.750000,1387.500000,1398.000000,48853
2,1410.000000,1462.990000,1436.495000,1410.000000,1425.010000,23364
3,1416.000000,1430.000000,1407.500000,1385.000000,1416.000000,19315
4,1410.666667,1470.000000,1430.000000,1390.000000,1460.666667,26589
...,...,...,...,...,...,...
8403,214.500000,216.390000,215.020000,213.650000,216.270000,29157060
8404,215.366667,218.346667,216.491667,214.636667,218.250000,47757683
8405,216.233333,220.303333,217.963333,215.623333,220.230000,66358306
8406,217.100000,222.260000,219.435000,216.610000,222.210000,84958930


In [None]:
# Bounds of the positional row window (used as .iloc[start:end]) that defines the modelling dataset.
start = 4000
end = 6000

In [None]:
# Take the same positional row window from the feature table and from the Y column.
# NOTE(review): this relies on sber_no_Date and sber_Y being row-aligned — confirm.
dataset_X = sber_no_Date.iloc[start:end]
dataset_Y = sber_Y['Y'].iloc[start:end]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the windowed rows as a test set; random_state pins the split.
# NOTE(review): train_test_split shuffles rows by default. For date-ordered
# market data a chronological split (shuffle=False) may be what is wanted —
# confirm the intent before trusting the validation numbers.
X_train, X_test, y_train, y_test = train_test_split(dataset_X, 
                                                    dataset_Y, 
                                                    test_size=0.2, 
                                                    random_state=777)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier
from sklearn import metrics

In [None]:
models = [LogisticRegression(), SVR(), SVC(), XGBClassifier()]


def _ranking_scores(estimator, X):
  """Return continuous scores for ``X`` suitable for ``roc_auc_score``.

  Prefers the probability of the second class, then the decision function,
  and falls back to ``predict`` for estimators that offer neither (such as
  the SVR regressor in the list above).
  """
  if hasattr(estimator, 'predict_proba'):
    return estimator.predict_proba(X)[:, 1]
  if hasattr(estimator, 'decision_function'):
    return estimator.decision_function(X)
  return estimator.predict(X)


# Only the first three entries are fitted, exactly as before.
# NOTE(review): XGBClassifier is instantiated but never trained — confirm
# whether it was excluded on purpose.
for estimator in models[:3]:
  estimator.fit(X_train, y_train)

  print(f'{estimator} : ')
  # roc_auc_score ranks samples by score; feeding it thresholded labels from
  # predict() collapses the curve to a single point. The printed quantity is
  # ROC AUC, so the labels say so instead of "Accuracy".
  print('Training ROC AUC : ', metrics.roc_auc_score(
    y_train, _ranking_scores(estimator, X_train)))
  print('Validation ROC AUC : ', metrics.roc_auc_score(
    y_test, _ranking_scores(estimator, X_test)))
  print()

LogisticRegression() : 
Training Accuracy :  0.5
Validation Accuracy :  0.5

SVR() : 
Training Accuracy :  0.5398812901450811
Validation Accuracy :  0.47002200220022006

SVC() : 
Training Accuracy :  0.5096770260320675
Validation Accuracy :  0.47174717471747185



In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall from Keras backend ops.

    Rounded, [0, 1]-clipped ``y_true * y_pred`` is summed as the true-positive
    count and divided by the summed rounded, clipped ``y_true`` plus
    ``K.epsilon()``.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision from Keras backend ops.

    Rounded, [0, 1]-clipped ``y_true * y_pred`` is summed as the true-positive
    count and divided by the summed rounded, clipped ``y_pred`` plus
    ``K.epsilon()``.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    flagged_count = K.sum(flagged_mask)
    return hit_count / (flagged_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: twice ``p * r / (p + r + K.epsilon())`` for ``precision_m`` and ``recall_m``."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Reshape
from tensorflow.keras.layers import LSTM
from keras.layers import LeakyReLU

In [None]:
# Show the training matrix dimensions as (rows, feature columns).
X_train.shape

(1600, 6)

In [None]:
# from keras.layers import LeakyReLU
# model = Sequential()
# model.add(Dense(6, kernel_initializer ='glorot_uniform',input_dim=X_train.shape[1]))
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(0.20))
# model.add(Dense(8, kernel_initializer ='glorot_uniform'))
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(0.20))
# # model.add(LSTM(4))
# model.add(Dropout(0.20))
# model.add(Dense(4))

# model.add(Dense(1, kernel_initializer ='glorot_uniform', activation = 'sigmoid'))
# model.compile(loss='binary_crossentropy',
#               optimizer='adamax',
#               metrics=['accuracy',f1_m,precision_m, recall_m])

In [None]:
from keras.layers import LeakyReLU
# Small feed-forward binary classifier: one 6-unit ReLU layer followed by a
# single sigmoid output, sized from the number of columns in X_train.
model = Sequential()
#model.add(Dense(6,input_dim=X_train.shape[1], activation='relu'))
model.add(Input(shape=(X_train.shape[1],)))
#model.add(Reshape((1,X_train.shape[1])))
#model.add(LSTM(2, input_shape=(1, X_train.shape[1]), return_sequences=False))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',f1_m,precision_m, recall_m])
# F1 is a higher-is-better metric. mode='auto' cannot infer that from the name
# 'val_f1_m' (older Keras versions fall back to lower-is-better), so the
# direction is stated explicitly.
# NOTE(review): 'val_f1_m' is only produced when fit() is given validation
# data (validation_split / validation_data); without it this callback has
# nothing to monitor — confirm the fit call supplies some.
es = tf.keras.callbacks.EarlyStopping(monitor='val_f1_m', min_delta=0, patience=3, verbose=0, mode='max', baseline=None, restore_best_weights=False)

In [None]:
# model = Sequential()
# model.add(Dense(6, kernel_initializer ='glorot_uniform',input_dim=X_train.shape[1]))
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(0.20))
# model.add(Dense(1, kernel_initializer ='glorot_uniform'))
# model.add(LeakyReLU(alpha=0.01))
# model.add(Dropout(0.20))
# model.add(Dense(1, kernel_initializer ='glorot_uniform', activation = 'sigmoid'))
# model.compile(loss='binary_crossentropy',
#               optimizer='adamax',
#               metrics=['acc',f1_m,precision_m, recall_m])

#es = tf.keras.callbacks.EarlyStopping(monitor='val_f1_m', min_delta=0, patience=3, verbose=0, mode='auto', baseline=None, restore_best_weights=False)

In [None]:
# Print the layer-by-layer structure and parameter counts of the compiled model.
model.summary()

Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_53 (Dense)            (None, 6)                 42        
                                                                 
 dense_54 (Dense)            (None, 1)                 7         
                                                                 
Total params: 49
Trainable params: 49
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for up to 40 epochs in mini-batches of 4 with the early-stopping callback attached.
# NOTE(review): no validation_split / validation_data is passed here, while `es`
# monitors 'val_f1_m'; presumably that metric is never produced, so early
# stopping cannot trigger — confirm and supply validation data if intended.
model.fit(X_train, y_train, batch_size = 4, epochs = 40, callbacks=[es])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1f48004e7f0>

In [None]:
# Raw outputs of the sigmoid output layer for the held-out rows.
y_pred = model.predict(X_test)



In [None]:
# Flatten the model outputs and threshold them at 0.5 to get boolean class predictions.
y_pred = y_pred.flatten() > 0.5
y_pred

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [None]:
from sklearn.metrics import f1_score, precision_score,recall_score, precision_recall_curve

In [None]:
# F1 of the thresholded predictions against the held-out labels.
f1_score(y_test, y_pred)

0.6622073578595317

In [None]:
# Precision of the thresholded predictions against the held-out labels.
precision_score(y_test, y_pred)

0.495

In [None]:
# Recall of the thresholded predictions against the held-out labels.
# NOTE(review): the recorded recall of 1.0 alongside an all-True y_pred display
# suggests the network predicts the positive class for every row — verify
# before reading these scores as evidence of skill.
recall_score(y_test, y_pred)

1.0