In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import math

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Activation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.callbacks import ModelCheckpoint, EarlyStopping

%matplotlib inline

Using TensorFlow backend.


# Loading Dataset

In [2]:
stock_exchange_mk = pd.read_csv('data_mse_mse_historic_data_active.csv', sep='\t', parse_dates=['Date'])

In [3]:
df = pd.DataFrame(stock_exchange_mk)
df["stock_id"] = df["stock_id"].astype('category')
df.columns = df.columns.str.lower()

# Missing Dates

In [4]:
stocks = df.stock_id.unique()
dates_all = df.set_index('date').index
idx = pd.MultiIndex.from_product((dates_all, stocks), names=['date', 'stock_id'])
df_new = df.set_index(['date', 'stock_id']).reindex(idx, fill_value=0).reset_index()

DROP UNNECESSARY DATES

In [6]:
df_list = []
for stock in df.stock_id.unique():
    each_df = df[df.stock_id == stock]
    groupby_stock = each_df.groupby([pd.PeriodIndex(data=each_df.date, freq='D'), 'stock_id'])
    df_list.append(groupby_stock)


In [15]:
stock_name = []
stock_start_date = []
for i in df_list:
    name = min(i.date)[0][1]
    stock_name.append(name)
    date = min(i.date)[0][0]
    stock_start_date.append(date)
    
start_date = dict(zip(stock_name, stock_start_date))

In [17]:
start_date_df = pd.DataFrame.from_dict(start_date, orient='index', dtype='datetime64[ns]', columns=['start_date'])

In [19]:
start_date_df.head()

Unnamed: 0,start_date
ALK,1997-01-09
BESK,2002-10-10
FERS,1999-04-15
GRNT,1999-10-28
KMB,1997-05-15


In [20]:
df_clean = df_new.set_index('stock_id').join(start_date_df, lsuffix='_filter', on='stock_id')

In [29]:
print(df_clean.shape, df_new.shape, df.shape)

(955702, 19) (955702, 19) (43441, 19)


In [32]:
df_drop = df_clean[df_clean['date'] > df_clean['start_date']].reset_index()

In [33]:
df_drop

Unnamed: 0,stock_id,date,open,high,low,close,volume,adj close,quantity,average,change %,volume total,ratio,ratio 1m,ratio 3m,ratio 6m,ratio 1y,ratio 2y,ratio 3y,start_date
0,ALK,2020-08-25,12290.0,12222.0,12200.0,12200.0,3660440,12200.0,300,12201.47,-0.72,3660440,0.878,0.971,0.971,0.937,0.916,0.916,0.916,1997-01-09
1,BESK,2020-08-25,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2002-10-10
2,FERS,2020-08-25,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1999-04-15
3,GRNT,2020-08-25,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1999-10-28
4,KMB,2020-08-25,6250.0,6300.0,6250.0,6300.0,771074,6300.0,123,6268.89,0.46,771074,0.702,1.002,1.002,0.802,0.802,0.802,0.802,1997-05-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933295,SBT,1998-10-13,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1998-05-26
933296,STIL,1998-10-13,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1997-12-16
933297,TNB,1998-10-13,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1998-06-30
933298,UNI,1998-10-13,0.0,0.0,0.0,0.0,0,0.0,0,0.00,0.00,0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1998-01-29
