## Implementasi Metode VADER-LSTM dalam Pengujian Pengaruh Sentimen Investor terhadap Prediksi Harga Saham

In [1]:
import math
import nltk
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as dt
from matplotlib.dates import DateFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from keras import layers
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ravie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
company = "TSLA"

### Importing Tweet Data

In [3]:
all_tweets = pd.read_csv("stock_tweets.csv")
print(all_tweets.shape)
all_tweets

(80793, 4)


Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."
...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.


In [4]:
tweet_df = all_tweets[all_tweets['Stock Name'] == company]
tweet_df = tweet_df.drop(['Company Name', 'Stock Name'], axis=1)
tweet_df['Date'] = pd.to_datetime(tweet_df['Date'])
tweet_df['Date'] = tweet_df['Date'].dt.date
print(tweet_df.shape)
tweet_df.head()

(37422, 2)


Unnamed: 0,Date,Tweet
0,2022-09-29,Mainstream media has done an amazing job at br...
1,2022-09-29,Tesla delivery estimates are at around 364k fr...
2,2022-09-29,3/ Even if I include 63.0M unvested RSUs as of...
3,2022-09-29,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
4,2022-09-29,"@RealDanODowd @Tesla Stop trying to kill kids,..."


##### Labeling without pre-processing

In [None]:
unprocessed_df = tweet_df.copy()
unprocessed_df["Compound"] = pd.Series(dtype='float64')
unprocessed_df.head()

In [None]:
sentiment_analyzer = SentimentIntensityAnalyzer()
for indx, row in unprocessed_df.T.items():
    try:
        sentence_i = unicodedata.normalize('NFKD', unprocessed_df.loc[indx, 'Tweet'])
        sentence_sentiment = sentiment_analyzer.polarity_scores(sentence_i)
        unprocessed_df.at[indx, 'Compound'] = sentence_sentiment['compound']
    except TypeError:
        print (unprocessed_df.loc[indx, 'Tweet'])
        print (indx)
        break

In [None]:
unprocessed_df.to_csv('tweet_unprocessed_labeled.csv', index=False)

In [None]:
unprocessed_df = pd.read_csv('tweet_unprocessed_labeled.csv')
unprocessed_df['Date'] = pd.to_datetime(unprocessed_df['Date'])
unprocessed_df['Date'] = unprocessed_df['Date'].dt.date
unprocessed_df

#### Alternate: Pre-process the tweets then labeling

In [5]:
processed_df = tweet_df.copy()
processed_df

Unnamed: 0,Date,Tweet
0,2022-09-29,Mainstream media has done an amazing job at br...
1,2022-09-29,Tesla delivery estimates are at around 364k fr...
2,2022-09-29,3/ Even if I include 63.0M unvested RSUs as of...
3,2022-09-29,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
4,2022-09-29,"@RealDanODowd @Tesla Stop trying to kill kids,..."
...,...,...
37417,2021-09-30,Playing in the dirt and #chasingsunsets\n@tesl...
37418,2021-09-30,I agree with @freshjiva that $TSLA ‘s EV busin...
37419,2021-09-30,Hold. On. Tight. $TSLA
37420,2021-09-30,Get ready for a $TSLA _ _ _ _ _ _ Q3 delivery...


In [6]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def cleaning(text):
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    text = text.encode('ascii', 'replace').decode('ascii')
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    return text.replace("http://", " ".replace("https://", " "))

def removeStopword(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

def stemming(text):
    porter = PorterStemmer()
    return porter.stem(text)

In [8]:
def preprocess_tweet(tweet):
    '''
    Takes a tweet as an input and output the list of tokens.
    '''
    
    import emoji
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    
    # Initialization
    new_tweet = tweet
    
    ## Changes on string
    
    # Remove urls
    new_tweet = re.sub(r'https?://[^ ]+', '', new_tweet)
    
    # Remove usernames
    new_tweet = re.sub(r'@[^ ]+', '', new_tweet)
    
    # Remove hashtags
    new_tweet = re.sub(r'#', '', new_tweet)
    
    # Character normalization
    new_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', new_tweet)
    
    # Emoji transformation
    new_tweet = emoji.demojize(new_tweet)
    
    # Punctuation and special characters
    new_tweet = re.sub(r' 0 ', 'zero', new_tweet)
    new_tweet = re.sub(r'[^A-Za-z ]', '', new_tweet)
    
    # Lower casing
    new_tweet = new_tweet.lower()
    
    
    ## Changes on tokens
    
    # Tokenization
    tokens = word_tokenize(new_tweet)
    
    porter = PorterStemmer()
    
    for token in tokens:
        # Stopwords removal
        if token in stopwords.words('english'):
            tokens.remove(token)
        # Stemming
            token = porter.stem(token)
    
    return ' '.join(tokens)

In [9]:
for indx, row in processed_df.T.items():
    try:
        processed_df.at[indx, 'Tweet'] = preprocess_tweet(processed_df.at[indx, 'Tweet'])
    except TypeError:
        break

In [15]:
sentiment_analyzer = SentimentIntensityAnalyzer()
for indx, row in processed_df.T.items():
    try:
        sentence_sentiment = sentiment_analyzer.polarity_scores(processed_df.loc[indx, 'Tweet'])
        processed_df.at[indx, 'Compound'] = sentence_sentiment['compound']
    except TypeError:
        print (processed_df.loc[indx, 'Tweet'])
        print (indx)
        break

AttributeError: 'float' object has no attribute 'encode'

In [11]:
processed_df.to_csv('tweet_processed_labeled.csv', index=False)

In [12]:
processed_df = pd.read_csv('tweet_processed_labeled.csv')
processed_df['Date'] = pd.to_datetime(processed_df['Date'])
processed_df['Date'] = processed_df['Date'].dt.date
processed_df

Unnamed: 0,Date,Tweet,Compound
0,2022-09-29,mainstream media done amazing job brainwashing...,0.0772
1,2022-09-29,tesla delivery estimates at around k the analy...,0.0000
2,2022-09-29,even i include unvested rsus of additional equ...,0.2960
3,2022-09-29,hahaha are still trying stop tesla fsd bro get...,-0.7096
4,2022-09-29,stop trying kill kids sad deranged old man,-0.8750
...,...,...,...
37417,2021-09-30,playing the dirt chasingsunsets,-0.1531
37418,2021-09-30,agree tsla ev business alone worth gt sh wo fs...,0.7003
37419,2021-09-30,hold tight tsla,0.0000
37420,2021-09-30,get ready a tsla q delivery numberhave ur answ...,0.3612


#### Comparing between unprocessed and pre-processed labeled data

In [None]:
fig, ax = plt.subplots(figsize=(15,8))
ax.plot(processed_df.groupby([processed_df['Date']]).mean(numeric_only=True))
ax.plot(unprocessed_df.groupby([unprocessed_df['Date']]).mean(numeric_only=True))
ax.xaxis.set_major_locator(dt.MonthLocator())
ax.xaxis.set_major_formatter(DateFormatter("%Y-%m"))

#### Grouping sentiments by day

In [None]:
daily_sentiments_df = processed_df.groupby([processed_df['Date']]).mean(numeric_only=True)
daily_sentiments_df

In [None]:
fig, ax = plt.subplots(figsize=(15,8))
ax.plot(daily_sentiments_df['Compound'], color='#008B8B')
ax.set(xlabel="Date", ylabel="USD", title=f"{company} Daily Sentiment")
ax.xaxis.set_major_locator(dt.MonthLocator())
ax.xaxis.set_major_formatter(DateFormatter("%Y-%m"))
plt.show()

### Importing Stock Data

In [None]:
all_stocks = pd.read_csv("stock_yfinance_data.csv")
all_stocks

In [None]:
stock_df = all_stocks[all_stocks['Stock Name'] == company]
stock_df = stock_df.drop('Stock Name', axis=1)
stock_df['Date'] = pd.to_datetime(stock_df['Date'])
stock_df['Date'] = stock_df['Date'].dt.date
stock_df = stock_df.set_index("Date")
print(stock_df.shape)
stock_df.head()

In [None]:
fig, ax = plt.subplots(figsize=(15,8))
ax.plot(stock_df['Close'], color='#008B8B')
ax.set(xlabel="Date", ylabel="USD", title=f"{company} Stock Price")
ax.xaxis.set_major_formatter(DateFormatter("%Y-%m"))
plt.show()

In [None]:
dataset_df = stock_df.copy()
dataset_df = dataset_df.join(daily_sentiments_df, how="left", on="Date")
print(dataset_df.shape)
dataset_df.head()

### Functions

In [None]:
def SplitData(data, train_size, timestep):
    training_data_len = math.ceil(len(data)* train_size)
    
    train_data = data[0: training_data_len, :]
    test_data = data[training_data_len-timestep: , : ]

    return train_data, test_data    

In [None]:
def SplitDataNew(data, train_size, timestep):
    data_values = data.values
    training_data_len = math.ceil(len(data)* train_size)

    scaler = MinMaxScaler(feature_range=(0,1))
    if (len(data_values.shape) == 1):
        scaled_data = scaler.fit_transform(data_values.reshape(-1,1))
    else:
        scaled_data = scaler.fit_transform(data_values)
        scaled_index = scaler.fit_transform(data_values[:, 0:1].flatten().reshape(-1,1))

    train_data = scaled_data[0: training_data_len, :]
    test_data = scaled_data[training_data_len-timestep: , :]

    train_data_x = train_data[0: training_data_len, :]
    train_data_y = train_data[0: training_data_len, 0:1]

    x_train = []
    y_train = []

    for i in range(timestep, len(train_data_x)):
        x_train.append(train_data_x[i-timestep:i])
        y_train.append(train_data_y[i][0])

    x_train, y_train = np.array(x_train), np.array(y_train)
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], x_train.shape[2]))

    test_data = scaled_data[training_data_len-timestep: , : ]
    x_test = []
    if (len(data_values.shape) == 1):
        y_test = data_values[training_data_len:]
    else:
        y_test = data_values[training_data_len: , 0]

    for i in range(timestep, len(test_data)):
        x_test.append(test_data[i-timestep:i])

    x_test = np.array(x_test)
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], x_test.shape[2]))

    return x_train, y_train, x_test, y_test, scaler

In [None]:
def TrainModel(x_data, y_data, epoch):

    model = keras.Sequential()
    model.add(layers.LSTM(100, return_sequences=True, input_shape=(x_data.shape[1], x_data.shape[2])))
    model.add(layers.LSTM(100, return_sequences=False))
    model.add(layers.Dense(25))
    model.add(layers.Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])
    history = model.fit(x_data, y_data, batch_size= 1, epochs=epoch, validation_split=0.2)

    return model, history

In [None]:
def PlotTrainingMetrics(history):
    history_data = history.history
    
    loss_values = history_data['loss']
    val_loss_values = history_data['val_loss']
    mae_values = history_data['mae']
    val_mae_values = history_data['val_mae']
    rmse_values = np.sqrt(history_data['loss'])
    val_rmse_values = np.sqrt(history_data['val_loss'])
    epochs = range(1, len(loss_values) + 1)

    fig = plt.figure(figsize=(16, 8))
    gs = fig.add_gridspec(1, 3, wspace=0)
    (ax1, ax2, ax3) = gs.subplots(sharey=True)
    fig.suptitle('Training and validation metrics')
    fig.supxlabel('epochs')

    ax1.plot(epochs, mae_values, color = 'blue', label='Training MAE')
    ax1.plot(epochs, val_mae_values, color='red', label='Validation MAE')
    ax1.set_title('MAE')
    ax1.set_xticks(epochs)
    ax1.xaxis.set_major_locator(ticker.MultipleLocator(len(loss_values)/5))
    ax1.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax1.xaxis.grid(True, which='both', alpha=0.5)
    ax1.yaxis.grid(True, alpha=0.5)
    ax1.set_ylabel('value')
    ax1.legend()

    ax2.plot(epochs, loss_values, color = 'blue', label='Training loss')
    ax2.plot(epochs, val_loss_values, color='red', label='Validation loss')
    ax2.set_title('Loss (MSE)')
    ax2.set_xticks(epochs)
    ax2.xaxis.set_major_locator(ticker.MultipleLocator(len(loss_values)/5))
    ax2.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax2.xaxis.grid(True, which='both', alpha=0.5)
    ax2.yaxis.grid(True, alpha=0.5)
    ax2.legend()

    ax3.plot(epochs, rmse_values, color = 'blue', label='Training RMSE')
    ax3.plot(epochs, val_rmse_values, color='red', label='Validation RMSE')
    ax3.set_title('RMSE')
    ax3.set_xticks(epochs)
    ax3.xaxis.set_major_locator(ticker.MultipleLocator(len(loss_values)/5))
    ax3.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax3.xaxis.grid(True, which='both', alpha=0.5)
    ax3.yaxis.grid(True, alpha=0.5)
    ax3.legend()

In [None]:
def PlotPredictions(data, prediction):
    train_plot = data[:len(data)-len(prediction)]
    validation_plot = data[len(data)-len(prediction):].copy()
    validation_plot['Predictions'] = prediction

    fig, ax = plt.subplots(figsize=(16, 8))
    ax.set_title('Model and Predictions')
    ax.set_ylabel('Closing price (USD)')
    ax.set_xlabel('Date')
    ax.plot(train_plot)
    ax.plot(validation_plot[['Close', 'Predictions']])
    ax.xaxis.set_major_locator(dt.MonthLocator())
    ax.xaxis.set_minor_locator(dt.MonthLocator(bymonthday=15))
    ax.xaxis.set_major_formatter(dt.DateFormatter('%b'))
    ax.grid(alpha=0.5, which='both')
    ax.legend(['Train', 'Validation', 'Predictions'])

### LSTM without Sentiment Score

In [None]:
close_prices = dataset_df['Close']
stock_values = close_prices.values

train_portion = 0.8
timestep = 60

scaler = MinMaxScaler(feature_range=(0,1))
stock_data_scaled = scaler.fit_transform(stock_values.reshape(-1,1))

train_lstm_only, test_lstm_only = SplitData(stock_data_scaled, train_portion, timestep)

In [None]:
x_train = []
y_train = []

for i in range(timestep, len(train_lstm_only)):
    x_train.append(train_lstm_only[i-timestep:i, 0])
    y_train.append(train_lstm_only[i, 0])

x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

In [None]:
x_test = []
y_test = stock_values[len(train_lstm_only):]

for i in range(timestep, len(test_lstm_only)):
  x_test.append(test_lstm_only[i-timestep:i, 0])

x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [None]:
singleModel, history = TrainModel(x_train, y_train, 10)

In [None]:
predictions = singleModel.predict(x_test)
predictions = scaler.inverse_transform(predictions)

mae = np.mean(np.abs(predictions - y_test))
mse = np.mean(predictions - y_test)**2
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y_test - predictions)/y_test)) * 100
mae, mse, rmse

In [None]:
PlotTrainingMetrics(history)

In [None]:
PlotPredictions(dataset_df.filter(['Close']), predictions)

### LSTM with User Sentiment Score

In [None]:
combined_data = dataset_df[['Close', 'Compound']].copy()
combined_values = combined_data.values

close_prices = dataset_df['Close']
stock_values = close_prices.values

train_portion = 0.8
timestep = 60

training_data_len = math.ceil(len(combined_values)* train_portion)

scaler = MinMaxScaler(feature_range=(0,1))
combined_data_scaled_x = scaler.fit_transform(combined_values)
combined_data_scaled_y = scaler.fit_transform(stock_values.reshape(-1,1))

In [None]:
train_data_x = combined_data_scaled_x[0: training_data_len, :]
train_data_y = combined_data_scaled_y[0: training_data_len, :]

x_train = []
y_train = []

for i in range(timestep, len(train_data_x)):
    x_train.append(train_data_x[i-timestep:i])
    y_train.append(train_data_y[i][0])

x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], x_train.shape[2]))

In [None]:
test_data = combined_data_scaled_x[training_data_len-timestep: , : ]
x_test = []
y_test = stock_values[training_data_len:]

for i in range(timestep, len(test_data)):
  x_test.append(test_data[i-timestep:i])

x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], x_test.shape[2]))

In [None]:
sentimentModel, history_sentiment = TrainModel(x_train, y_train, 3)

In [None]:
predictions = sentimentModel.predict(x_test)

In [None]:
predictions = scaler.inverse_transform(predictions)

mae = np.mean(np.abs(predictions - y_test))
mse = np.mean(predictions - y_test)**2
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y_test - predictions)/y_test)) * 100
mae, mse, rmse, mape

In [None]:
PlotTrainingMetrics(history_sentiment)

In [None]:
PlotPredictions(dataset_df.filter(['Close']), predictions)

### TESTING

In [None]:
close_prices = dataset_df['Close']

train_portion = 0.8
timestep = 60

a, b, c, d, s = SplitDataNew(close_prices, train_portion, timestep)

In [None]:
sentimentModel, history_sentiment = TrainModel(a, b, 3)

In [None]:
dictions = sentimentModel.predict(c)

dictions = s.inverse_transform(dictions)

mae = np.mean(np.abs(dictions - d))
mse = np.mean(dictions - d)**2
rmse = np.sqrt(mse)
mape = np.mean(np.abs((d - dictions)/d)) * 100
mae, mse, rmse, mape

In [None]:
PlotTrainingMetrics(history_sentiment)

In [None]:
PlotPredictions(dataset_df.filter(['Close']), dictions)