<a href="https://colab.research.google.com/github/Tahahaha7/Trading_WallStreetBets_Sentiments/blob/main/Quantitative_Trading_with_WSB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## $\text{Dependencies}$

In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# changing the working directory
# (the CSV files below are read and written with relative paths, so they land in this folder)
%cd /content/gdrive/My Drive/QT_final

In [None]:
%%capture
# Install the third-party packages used below; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so a fresh run may pull releases whose APIs
# differ from the ones this notebook was written against.
!pip install psaw
!pip install bayesian-optimization
!pip install yfinance
!pip install --upgrade nltk
!pip install stanza

In [None]:
# API, Data, Utilities packages
import psaw; from psaw import PushshiftAPI
from wordcloud import WordCloud, STOPWORDS
import datetime
import pandas as pd
import numpy as np
import regex as re
import networkx as nx
import yfinance as yf

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Sentiment Analysis Packages
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Plotting Packages
import matplotlib
import seaborn as sns
from matplotlib import rc
import matplotlib.pyplot as plt
%matplotlib inline

# Render all figure text through LaTeX (the TeX toolchain is installed by the apt-get cells below).
rc('text', usetex=True)
# The preamble must be a single string: list values were deprecated in Matplotlib 3.3
# and are rejected by later releases.
matplotlib.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'

In [None]:
%%capture
# Install a LaTeX toolchain and build/register the type1cm package (steps #1-#8).
# NOTE(review): presumably this is what the usetex=True setting above relies on — confirm.
# NOTE(review): step #6 uses plain `mkdir`, which looks like it would error if the
# directory already exists on a re-run; any such message is hidden by %%capture.
! sudo apt-get install texlive-latex-recommended #1
! sudo apt-get install dvipng texlive-fonts-recommended #2
! wget http://mirrors.ctan.org/macros/latex/contrib/type1cm.zip #3
! unzip type1cm.zip -d /tmp/type1cm #4
! cd /tmp/type1cm/type1cm/ && sudo latex type1cm.ins  #5
! sudo mkdir /usr/share/texmf/tex/latex/type1cm #6
! sudo cp /tmp/type1cm/type1cm/type1cm.sty /usr/share/texmf/tex/latex/type1cm #7
! sudo texhash #8

In [None]:
%%capture
# Extra TeX font packages plus dvipng; --fix-missing lets apt continue past unavailable packages.
!apt install texlive-fonts-recommended texlive-fonts-extra cm-super dvipng --fix-missing

## $\text{Scraping Reddit Group}$

In [None]:
# Pushshift client (psaw) used for the submission and comment queries below.
api = PushshiftAPI()

### $\text{r/wallstreetbets submissions}$

In [None]:
'''
THE LIST OF ATTRIBUTES RETURNED BY THE POSTS API CALL

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_metadata',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'suggested_sort', 'thumbnail', 'thumbnail_height',
       'thumbnail_width', 'title', 'total_awards_received', 'treatment_tags',
       'upvote_ratio', 'url', 'whitelist_status', 'wls', 'created', 'd_'],
      dtype='object')
''';

# Fetch every r/wallstreetbets submission created since 2021-01-01 and store it as CSV.
# The cutoff is pinned to UTC: .timestamp() on a naive datetime is interpreted in the
# machine's local timezone, so the window would shift on any non-UTC host.
start_time = int(datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc).timestamp())

submissions = api.search_submissions(after=start_time,
                                     subreddit='wallstreetbets',
                                     filter=['url', 'author', 'title', 'upvote_ratio', 'num_comments', 'score', 'subreddit'])

# COLUMNS TO KEEP: ['url', 'author', 'title', 'upvote_ratio', 'num_comments', 'score', 'subreddit']
pd.DataFrame(list(submissions)).to_csv('wallstreetbets.csv', index=False)

In [None]:
# Reload the stored submissions and add the parsed creation time as the third column.
data = pd.read_csv('wallstreetbets.csv')
submission_time = pd.to_datetime(data['created_utc'], unit='s')
data.insert(loc=2, column='time', value=submission_time)

In [None]:
# One long string containing every submission title, separated by spaces.
content = ' '.join(str(title) for title in data.title)

In [None]:
# Pull every "$"-prefixed token out of the titles, then reduce it to upper-case letter runs.
regex = re.compile('[^a-zA-Z]')
dollar_tokens = re.findall(r'[$][A-Za-z][\S]*', content)
# Tokens are glued together without a separator; each one starts with "$", which the
# substitution below turns into a space, so they still split apart.
letters_only = regex.sub(' ', ''.join(dollar_tokens))
tickers = letters_only.upper().split()

In [None]:
# Rank tickers by number of mentions (most-mentioned first).
ticker_counts = pd.DataFrame(tickers).value_counts()
top_stocks = ticker_counts.rename_axis('Stock').reset_index(name='Counts')
top_stocks.head(11)

### $\text{r/wallstreetbets comments}$

In [None]:
'''
Based on the mentions of stock tickers in the submissions above,
this loop looks up all the comments mentioning each selected ticker.
Each stock has its own dataset of compiled comments.
The output is stored in the working directory as <ticker>_comments.csv.
'''

# NOTE(review): the slice starts at rank 1, so the most-mentioned ticker (rank 0) is
# not fetched here — confirm that its comments file is produced elsewhere.
for stock in top_stocks.Stock[1:11].values:
    print('Starting {}'.format(stock))
    gen = api.search_comments(q=stock, subreddit='wallstreetbets')
    # Consume the generator in a single pass. The earlier `next(gen)` call pulled the
    # first comment off the generator and threw it away, and raised StopIteration
    # whenever a query returned no results.
    comments = pd.DataFrame([thing.d_ for thing in gen])
    comments.to_csv(stock.lower()+'_comments.csv', index=False)
    print('Finishing {}'.format(stock))

In [None]:
'''
THE LIST OF ATTRIBUTES RETURNED BY THE COMMENTS API CALL

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'id', 'is_submitter', 'link_id', 'locked',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'created', 'author_cakeday', 'media_metadata', 'distinguished',
       'edited', 'steward_reports', 'updated_utc', 'author_created_utc',
       'can_gild', 'collapsed', 'collapsed_reason', 'controversiality',
       'gilded', 'nest_level', 'reply_delay', 'subreddit_name_prefixed',
       'subreddit_type', 'score_hidden', 'rte_mode'],
       dtype='object')
''';

# COLUMNS TO KEEP: ['created_utc', 'author', 'is_submitter', 'body', 'score']
comment_columns = ['created_utc', 'author', 'is_submitter', 'body', 'score']
gme_comments = pd.read_csv('gme_comments.csv')[comment_columns]
# Parsed timestamp goes right after created_utc.
gme_timestamps = pd.to_datetime(gme_comments['created_utc'], unit='s')
gme_comments.insert(loc=1, column='time', value=gme_timestamps)

In [None]:
# Tag every row with its ticker (a scalar is broadcast to the whole column).
gme_comments['stock'] = 'GME'
gme_comments

In [None]:

def _load_stock_comments(ticker):
    """Read `<ticker>_comments.csv` and shape it for the analysis.

    Keeps created_utc, author, is_submitter, body and score, inserts a `time`
    column (created_utc parsed as epoch seconds) at position 1, and appends a
    `stock` column holding `ticker`. The file name uses the lower-cased ticker.
    """
    kept = ['created_utc', 'author', 'is_submitter', 'body', 'score']
    frame = pd.read_csv(ticker.lower() + '_comments.csv')[kept]
    frame.insert(1, 'time', pd.to_datetime(frame.created_utc, unit='s'))
    frame['stock'] = [ticker] * len(frame)
    return frame


amc_comments = _load_stock_comments('AMC')
bb_comments = _load_stock_comments('BB')
nok_comments = _load_stock_comments('NOK')
sndl_comments = _load_stock_comments('SNDL')
nakd_comments = _load_stock_comments('NAKD')
slv_comments = _load_stock_comments('SLV')
pltr_comments = _load_stock_comments('PLTR')
doge_comments = _load_stock_comments('DOGE')
rkt_comments = _load_stock_comments('RKT')

In [None]:
# Stack the per-ticker comment tables and order them by creation time.
per_stock_comments = [gme_comments, amc_comments, bb_comments, nok_comments, sndl_comments,
                      nakd_comments, slv_comments, pltr_comments, doge_comments, rkt_comments]

all_comments = (pd.concat(per_stock_comments)
                .sort_values(by='created_utc')
                .reset_index(drop=True))

In [None]:
all_comments

In [None]:
all_comments.isna().sum()

In [None]:
import matplotlib as mpl
# Show the is_submitter flags of the time-ordered comments as a square image.
flags = all_comments.is_submitter.dropna().astype(int).to_numpy()
# Use the largest square that fits the available flags instead of a hard-coded
# 1340 x 1340 grid, which raised whenever fewer than 1,795,600 flags were present.
side = int(np.sqrt(len(flags)))
x = flags[:side * side].reshape(side, side)
plt.figure(figsize=(6, 6))
plt.imshow(x, interpolation='none', cmap='cividis'); plt.axis('off');
# The ticker for being a submitter seems fairly spaced.

## $\text{Exploratory Data Analysis}$

### $\text{r/wallstreetbets submissions}$

In [None]:
# Create a word cloud for the group
# Identify the authors with most influence
# Classify the tickers for each hour

# Ten most frequent submission authors, shown as a one-column frame of post counts.
author_post_counts = data.author.value_counts()
author_post_counts.head(10).to_frame()

In [None]:
keep = ['num_comments', 'score', 'upvote_ratio']
# Select the numeric columns *before* aggregating. Summing the whole frame also tries
# to sum the text and datetime columns, which is slow and raises TypeError on
# pandas >= 2.0 (datetime columns cannot be summed).
data.groupby('author')[keep].sum().sort_values(by='num_comments', ascending=False).head(10)

In [None]:
# All submission titles joined into one space-separated string (input for the word cloud).
content = ' '.join([str(title) for title in data.title])

In [None]:
# WordCloud of all publications

title_cloud_options = dict(font_path='Arsenal-Regular.ttf',
                           width=600, height=400,
                           background_color='white',
                           stopwords=set(STOPWORDS),
                           collocations=False,
                           min_font_size=10)
wordcloud = WordCloud(**title_cloud_options).generate(content)

# Draw the rendered cloud without axes or padding.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0);

In [None]:
# Word cloud of the extracted tickers only (at most 80 words).
ticker_cloud_options = dict(font_path='Arsenal-Regular.ttf',
                            width=600, height=400, max_words=80,
                            background_color='white',
                            collocations=False, contour_width=50,
                            min_font_size=10)
wordcloud = WordCloud(**ticker_cloud_options).generate(' '.join(tickers))

fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0);

### $\text{r/wallstreetbets comments}$

In [None]:
'''
THE LOGIC: If the comment's author is a submitter, then the next comments are considered to be related to that submission
as long as they're not submitters. Once we encounter an author who's a submitter, then we switch to the new submission
(following the arrow of time)
''';
# Missing is_submitter values count as "not a submitter": NaN is truthy in Python, so
# a bare `if flag:` would silently promote those rows to submitters.
submitter_flags = all_comments.is_submitter.eq(True)

edges = []
current_author = np.nan
# Iterate positionally over both columns instead of doing a label lookup per row.
for author, is_submitter in zip(all_comments.author, submitter_flags):
    if is_submitter:
        current_author = author
    # Comments seen before the first submitter have no submission to attach to;
    # skipping them avoids linking authors to a spurious NaN node.
    if pd.isna(current_author):
        continue
    edges.append((author, current_author))

In [None]:
# Directed graph with one edge per (commenter, submitter) pair.
graph = nx.from_edgelist(edges, create_using=nx.DiGraph)
# nx.info() was removed in NetworkX 3.0; report the node and edge counts directly.
print('DiGraph with {} nodes and {} edges'.format(graph.number_of_nodes(), graph.number_of_edges()))

In [None]:
# PageRank score of every author node, as a two-column table.
pagerank_scores = nx.pagerank(graph)
PageRank = pd.DataFrame(pagerank_scores.items(), columns=['author', 'rank'])

In [None]:
# Clustering coefficient per node, assigned positionally (relies on nx.clustering and
# nx.pagerank returning nodes in the same order).
clustering_values = list(nx.clustering(graph).values())
PageRank['clustering'] = clustering_values

In [None]:
PageRank.sort_values(by='rank', ascending=False).head(10)

## $\text{Sentiment Analysis}$

In [None]:
import stanza
stanza.download('en') # download English model

nlp = stanza.Pipeline('en') # initialize English neural pipeline
#nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Smoke test: run the pipeline on a short phrase and print each token with its NER tag.
# NOTE(review): `nlp` and `doc` are not referenced again in the visible part of this notebook.
doc = nlp("buy AAPL stock")
print(*[f'token: {token.text}\tner: {token.ner}' for sent in doc.sentences for token in sent.tokens], sep='\n')

In [None]:
'''
Adding new words widely used in r/wallstreetbets to improve vader's sentiment analysis
Source: Julien Klepatch @jklepatch (https://github.com/jklepatch)
''';

# Extra lexicon entries: word -> valence score (negative = bearish, positive = bullish).
wsb_words = {
    'citron': -4.0,
    # Original (misspelled) key kept for backward compatibility; the correctly spelled
    # name is added next to it so that it is actually scored.
    'hidenburg': -4.0,
    'hindenburg': -4.0,
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
    'tendies': 2.0,
    'town': 2.0,
    'overvalued': -3.0,
    'undervalued': 3.0,
    'buy': 4.0,
    'sell': -4.0,
    'gone': -1.0,
    'gtfo': -1.7,
    'paper': -1.7,
    'bullish': 3.7,
    'bearish': -3.7,
    'bagholder': -1.7,
    'stonk': 1.9,
    'green': 1.9,
    'money': 1.2,
    'print': 2.2,
    'rocket': 2.2,
    'bull': 2.9,
    'bear': -2.9,
    'pumping': -1.0,
    'sus': -3.0,
    'offering': -2.3,
    'rip': -4.0,
    'downgrade': -3.0,
    'upgrade': 3.0,
    'maintain': 1.0,
    'pump': 1.9,
    'hot': 1.5,
    'drop': -2.5,
    'rebound': 1.5,
    'crack': 2.5,}

In [None]:
# Create a sentiment for GME for every hour of 2021 (make it generalizable to other stocks)
vader = SentimentIntensityAnalyzer()
# Add (or override) the r/wallstreetbets vocabulary in VADER's lexicon.
for word, valence in wsb_words.items():
    vader.lexicon[word] = valence

In [None]:
# Keep only the comment from 2021
is_2021 = all_comments['time'].dt.year.eq(2021)
all_comments_2021 = all_comments.loc[is_2021]
all_comments_2021

In [None]:
# Score each ticker's comments in 30-minute bins and write <ticker>_sentiment.csv.
# NOTE(review): the slice covers ranks 1-9 only, while the comment scrape uses [1:11]
# and the price download uses [:10] — confirm which tickers are intended here.
for stock in top_stocks.Stock[1:10]:
    print('Starting {}'.format(stock))
    # Pick the stock of interest
    store = all_comments_2021[all_comments_2021.stock == stock]

    # Combining all the comments on WSB for every half an hour since Jan 1st 2021 to Apr 4th 2021.
    # Missing bodies are dropped and the rest coerced to str first, because ' '.join
    # raises TypeError on any non-string element (e.g. NaN).
    agg_comments_2021 = (store[['time', 'body']]
                         .groupby(pd.Grouper(key='time', freq='30min'))['body']
                         .apply(lambda x: ' '.join(x.dropna().astype(str)))
                         .reset_index())

    # Sentiment analysis of the comments: one dict of scores per 30-minute bin
    sentiment = agg_comments_2021['body'].apply(vader.polarity_scores)
    processed_data = pd.DataFrame(list(sentiment))
    processed_data['time'] = agg_comments_2021['time']
    processed_data['stock'] = stock
    processed_data.to_csv(stock.lower()+'_sentiment.csv', index=False)
    print('Finished {}'.format(stock))

In [None]:
# GME: the stock label and the 30-minute bin timestamps are (re)assigned here rather
# than taken from the CSV; the bins are recomputed from the 2021 GME comments.
gme_sentiment = pd.read_csv('gme_sentiment.csv')
gme_sentiment['stock'] = 'GME'
gme_bins = (all_comments_2021.loc[all_comments_2021.stock == 'GME', ['time', 'body']]
            .groupby(pd.Grouper(key='time', freq='30min'))['body']
            .apply(lambda x: ' '.join(x))
            .reset_index())
gme_sentiment['time'] = gme_bins['time']

# The remaining tickers are read from their CSV files unchanged.
amc_sentiment = pd.read_csv('amc_sentiment.csv')
bb_sentiment = pd.read_csv('bb_sentiment.csv')
nok_sentiment = pd.read_csv('nok_sentiment.csv')
sndl_sentiment = pd.read_csv('sndl_sentiment.csv')
nakd_sentiment = pd.read_csv('nakd_sentiment.csv')
slv_sentiment = pd.read_csv('slv_sentiment.csv')
pltr_sentiment = pd.read_csv('pltr_sentiment.csv')
doge_sentiment = pd.read_csv('doge_sentiment.csv')
rkt_sentiment = pd.read_csv('rkt_sentiment.csv')

In [None]:
# Stack the per-ticker sentiment tables under a fresh 0..n-1 index.
per_stock_sentiment = [gme_sentiment, amc_sentiment, bb_sentiment, nok_sentiment,
                       sndl_sentiment, nakd_sentiment, slv_sentiment, pltr_sentiment,
                       doge_sentiment, rkt_sentiment]
all_sentiment = pd.concat(per_stock_sentiment, ignore_index=True)

all_sentiment

In [None]:
all_sentiment.isna().sum()

In [None]:
# Histogram of the non-zero values of the first three columns, one panel per column.
titles = ['Negative sentiment', 'Neutral sentiment', 'Positive sentiment']
plt.figure(figsize=(7, 15))
for position, (column, title) in enumerate(zip(all_sentiment.columns[:3], titles), start=1):
    plt.subplot(3, 1, position)
    plt.title(title, fontsize=20)
    nonzero_scores = all_sentiment.loc[all_sentiment[column] != 0, column]
    plt.hist(nonzero_scores, bins='auto', alpha=.7, density=False)
plt.tight_layout()

## $\text{Hourly Stock Market Data}$

In [None]:
# Get hourly data for top 10 stocks mentioned in the group
# Each ticker is downloaded in two windows: 60-minute bars first, then 30-minute bars.

price_frames = []
for idx, stock in enumerate(top_stocks.Stock[:10]):
    print(idx, stock)
    # yfinance treats `end` as exclusive, so the first window ends on Feb 10 — the day
    # the second window starts. Ending it on Feb 9 silently dropped that day.
    hourly_data_1 = yf.download(tickers=stock,
                                start=datetime.datetime(2021, 1, 1),
                                end=datetime.datetime(2021, 2, 10),
                                interval="60m")
    hourly_data_1['stock'] = stock
    hourly_data_2 = yf.download(tickers=stock,
                                start=datetime.datetime(2021, 2, 10),
                                end=datetime.datetime(2021, 4, 2),
                                interval="30m")
    hourly_data_2['stock'] = stock
    price_frames.extend([hourly_data_1, hourly_data_2])

# Concatenate once instead of growing the frame inside the loop (repeated copying).
hourly_data = pd.concat(price_frames)

In [None]:
hourly_data

In [None]:
# Scatter of every adjusted close against its bar timestamp.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(hourly_data['Adj Close'], 'k.');

In [None]:
plt.figure(figsize=(12, 5))
# `sentiment` is the Series of per-bin score dicts left over from the last iteration of
# the scoring loop, so `sentiment['compound']` is a failing label lookup. Plot the
# compound column of the combined sentiment table instead.
plt.plot(all_sentiment['compound'], 'k.');

In [None]:
# Index the sentiment table by its time column, under the index name 'DateTime'.
all_sentiment.index = pd.Index(all_sentiment['time'], name='DateTime')

In [None]:
all_sentiment

In [None]:
# Prob keep only the sentiment during trading hours

# Guarded so the cell can be re-run: after the first run the index is a plain
# RangeIndex and tz_convert would raise.
if isinstance(hourly_data.index, pd.DatetimeIndex):
    # Express bar timestamps as naive UTC, like the sentiment bins (built from created_utc).
    utc_index = hourly_data.index.tz_convert('UTC').tz_convert(None)
    # Name the index 'time' before resetting it. reset_index() names the new column
    # after the index and only falls back to 'index' when the index is unnamed, so the
    # old rename({'index': 'time'}) did nothing for a named index.
    hourly_data.index = utc_index.rename('time')
    hourly_data = hourly_data.reset_index()

hourly_data

In [None]:
# Parse the time column to datetimes so it can be merged with the price timestamps.
all_sentiment = all_sentiment.assign(time=pd.to_datetime(all_sentiment['time']))

In [None]:
# Test for correlation between sentiment and stock data
#full_data = pd.merge(sentiment, hourly_data, left_index=True, right_index=True, how='inner')

# Keep only (time, stock) pairs present in both the sentiment and the price table.
full_data = all_sentiment.merge(hourly_data, on=['time', 'stock'], how='inner')

In [None]:
# The data starts with a 1h interval, then switches to a 30min interval for the last 2 months
# The sentiment can be lagged by one time step to test its predictability
full_data

In [None]:
plt.figure(figsize=(17, 6))

# Bins with a zero compound score are dropped; the x axis is the remaining row number.
all_sentiment_plot = all_sentiment[all_sentiment.compound != 0].reset_index(drop=True)

# Labels are listed in the same order as the stacked series (pos, neu, neg). They were
# previously reversed, so the legend swapped Positive and Negative.
plt.stackplot(all_sentiment_plot.index, all_sentiment_plot.pos, all_sentiment_plot.neu, all_sentiment_plot.neg,
              labels=['Positive', 'Neutral', 'Negative'])
plt.legend(loc='best', fontsize=16); plt.xlabel('Time Step', fontsize=20)
# NOTE(review): the x-limit 22800 is hard-coded — presumably the row count of the original run.
plt.ylabel('Sentiment', fontsize=20); plt.xlim(0, 22800); plt.ylim(-.1, 1.1);

## $\text{Trading strategies}$

### $\text{Gradient Boosting: Nested Cross-Validation}$

In [None]:
# Store the ticker column as a categorical so integer codes can be derived from it.
full_data = full_data.astype({'stock': 'category'})

In [None]:
full_data.dtypes

In [None]:
# Map every ticker to the integer code of its category.
stock_codes = full_data['stock'].cat.codes
encoding = dict(zip(full_data['stock'], stock_codes))

# Add the per-row code as its own categorical column.
encoded_rows = [encoding.get(label) for label in full_data.stock]
full_data['stock_encoding'] = pd.Series(encoded_rows, index=full_data.index).astype('category')

In [None]:
full_data.stock_encoding.value_counts()

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Features: every column except the compound score, the price columns and the identifiers.
data_X = full_data.drop(labels=['compound', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'time', 'stock'], axis=1).values
# Label: 1 when the adjusted close rose versus the previous bar *of the same stock*.
# full_data stacks several tickers, so an ungrouped pct_change would compare the first
# bar of one ticker with the last bar of the previous one.
data_Y = (full_data.groupby('stock', observed=True)['Adj Close']
          .pct_change(1)
          .apply(lambda x : 1 if x > 0 else 0)
          .values)

In [None]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, auc
from sklearn.metrics import confusion_matrix, roc_curve, plot_confusion_matrix

lgb_model = lgb.LGBMClassifier()
tscv = TimeSeriesSplit(n_splits = 7)

# Fit on each training window and report accuracy on the following test window.
# X_train / X_test / y_train / y_test / y_pred from the LAST fold stay in scope and
# are reused by later cells.
# NOTE(review): rows appear to be ordered ticker by ticker, so these splits are
# presumably not chronological across tickers — confirm the ordering of full_data.
for train_index, test_index in tscv.split(data_X):
    X_train, X_test = data_X[train_index], data_X[test_index]
    y_train, y_test = data_Y[train_index], data_Y[test_index]

    # `verbose` is no longer accepted by fit() in LightGBM >= 4.0 (it only controlled
    # evaluation logging, and no eval_set is passed here), so it is dropped.
    lgb_model.fit(X_train, y_train, categorical_feature='auto')
    y_pred = lgb_model.predict(X_test)
    print('Training set score: {:.4f}'.format(lgb_model.score(X_train, y_train)))
    print('Test set score: {:.4f}\n'.format(lgb_model.score(X_test, y_test)))

In [None]:
def plot_cm(cm):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    Every cell is annotated with its role (e.g. 'True Neg'), its raw count and
    its share of all observations. `cm` must hold exactly four entries, because
    the annotations are reshaped to 2x2.
    """
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    flat_counts = cm.flatten()
    shares = flat_counts / np.sum(cm)

    annotations = []
    for name, count, share in zip(group_names, flat_counts, shares):
        annotations.append(f'{name}\n{count:0.0f}\n{share:.2%}')
    annotations = np.asarray(annotations).reshape(2, 2)

    # Note: sns.set changes the global seaborn/matplotlib figure size.
    sns.set(rc={'figure.figsize':(8, 6)})
    sns.heatmap(cm, annot=annotations, fmt='', cmap='Blues', alpha=.75)

In [None]:
# y_test / y_pred are the values left in scope by the final TimeSeriesSplit fold above.
plot_cm(confusion_matrix(y_test, y_pred));

### $\text{Gradient Boosting: Hyperparameter Tuning}$

In [None]:
def lgbm_cv(max_depth, learning_rate, 
            n_estimators, reg_alpha,
            bagging_fraction, 
            min_child_weight, min_split_gain,
            colsample_bytree):
    """Objective for the Bayesian search: ROC AUC on X_test for one hyperparameter draw.

    `max_depth` and `n_estimators` are truncated to int; all other arguments are
    passed to LGBMClassifier unchanged. The model is fitted on the module-level
    X_train / y_train and scored on X_test / y_test.
    """
    params = dict(max_depth=int(max_depth),
                  learning_rate=learning_rate,
                  bagging_fraction=bagging_fraction,
                  min_child_weight=min_child_weight,
                  min_split_gain=min_split_gain,
                  colsample_bytree=colsample_bytree,
                  n_estimators=int(n_estimators),
                  reg_alpha=reg_alpha, nthread=-1,
                  objective='binary', seed=42)
    classifier = lgb.LGBMClassifier(**params)

    classifier.fit(X_train, y_train.reshape(len(y_train),))
    # Probability of the positive class (second column of predict_proba).
    positive_probs = classifier.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_probs)

In [None]:
from bayes_opt import BayesianOptimization

# Keyword arguments forwarded to maximize() in the next cell.
gp_params = dict(alpha=1e-10)

# Search bounds (low, high) for every argument of lgbm_cv.
hp_space = dict(
    max_depth=(5, 30),
    learning_rate=(.001, 1),
    n_estimators=(10, 250),
    reg_alpha=(0, 1),
    bagging_fraction=(.8, 1),
    min_child_weight=(1, 25),
    min_split_gain=(.001, .1),
    colsample_bytree=(.1, 1),
)

lgbcBO = BayesianOptimization(f=lgbm_cv, pbounds=hp_space, random_state=42, verbose=10)

In [None]:
# Run the search with init_points=5 and n_iter=45, using the 'ucb' acquisition with kappa=3.
# NOTE(review): bayesian-optimization is installed unpinned above; presumably newer
# releases no longer accept acq / kappa / GP keyword arguments in maximize() — confirm
# against the installed version.
lgbcBO.maximize(init_points=5, n_iter=45, acq='ucb', kappa= 3, **gp_params)

In [None]:
# Parameters of the probed point with the highest target (first one on ties).
best_trial = max(lgbcBO.res, key=lambda trial: trial['target'])
optimal_params = best_trial['params']

print(optimal_params)

In [None]:
# Tuned classifier. The hyperparameters are hard-coded rather than read from
# optimal_params — presumably copied from an earlier search run.
gbm = lgb.LGBMClassifier(**{'bagging_fraction': 0.8783378533433686, 
                            'colsample_bytree': 0.6728458365247224, 
                            'learning_rate': 0.7991630583627533, 
                            'max_depth': 29, 
                            'min_child_weight': 24.799522961470622, 
                            'min_split_gain': 0.04761293422236514, 
                            'n_estimators': 64, 
                            'reg_alpha': 0.7738117598769145})

# `verbose` is no longer accepted by fit() in LightGBM >= 4.0 (it only controlled
# evaluation logging, and no eval_set is passed here), so it is dropped.
gbm.fit(X_train, y_train)

print('Training set score: {:.4f}'.format(gbm.score(X_train, y_train)))
print('Test set score: {:.4f}\n'.format(gbm.score(X_test, y_test)))

In [None]:
# Score the tuned model itself. `y_pred` still holds the predictions of the untuned
# lgb_model from the cross-validation loop, so reusing it would redraw the earlier matrix.
plot_cm(confusion_matrix(y_test, gbm.predict(X_test)));

### $\text{LSTM}$

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization

In [None]:
# Three stacked LSTM blocks (each followed by dropout and batch normalisation), then a
# small dense head ending in a 2-way softmax. Input: sequences of 5 steps with 1 feature.
model = Sequential([
    LSTM(64, input_shape=(5, 1), return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    LSTM(32, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    LSTM(16),
    Dropout(0.2),
    BatchNormalization(),

    Dense(8, activation='relu'),
    Dropout(0.2),

    Dense(2, activation='softmax'),
])

In [None]:
# `lr=` and `decay=` are legacy Keras arguments; recent TensorFlow releases reject
# `decay` on the default optimizers. InverseTimeDecay with decay_steps=1 applies the
# same rule the legacy argument did: lr_t = 0.002 / (1 + 1e-5 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.002, decay_steps=1, decay_rate=1e-5)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'])

In [None]:
X_test.shape

In [None]:
EPOCHS = 100
BATCH_SIZE = 64

# Reshape to (samples, features, 1) using the actual number of rows instead of the
# hard-coded 5058, which only matched one particular train split. The explicit float32
# cast also covers feature arrays that come out of pandas with object dtype.
train_x = np.asarray(X_train, dtype='float32').reshape(len(X_train), -1, 1)
train_y = y_train

history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS)

In [None]:
# Reshape with the actual number of test rows instead of the hard-coded 722, and cast
# to float32 so object-dtype feature arrays are accepted.
test_x = np.asarray(X_test, dtype='float32').reshape(len(X_test), -1, 1)
score = model.evaluate(test_x, y_test, verbose=True)
print('Test loss:', score[0])
print('Test accuracy:', score[1])