In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, r2_score
import hvplot.pandas

In [2]:

# Set the random seed for reproducibility
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

In [3]:

# Read APPL.csv contains open, high, low, close, Adj close, Volume of Apple stock with twitter polarity scores and twitter volume
df = pd.read_csv('../data/AAPL.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)

# Drop null values
df.dropna(inplace=True)

# pct change based on Adj close value
df["Pct_change"] = df["Adj Close"].pct_change()

# Drop null values
df.dropna(inplace=True)

df.head()


  df = pd.read_csv('../data/AAPL.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,ts_polarity,twitter_volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-01-05,26.44,26.46,25.6,25.68,23.83,223164000,0.133635,1430.0,-0.024959
2016-01-06,25.14,25.59,24.97,25.17,23.36,273829600,0.072042,1949.0,-0.019723
2016-01-07,24.67,25.03,24.11,24.11,22.38,324377600,0.074369,2289.0,-0.041952
2016-01-08,24.64,24.78,24.19,24.24,22.5,283192000,0.051595,2235.0,0.005362
2016-01-11,24.74,24.76,24.33,24.63,22.86,198957600,0.019443,1222.0,0.016


In [4]:

# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df[['Adj Close', 'ts_polarity', 'Volume', 'Pct_change']])

# Convert the scaled data back to a DataFrame for easier handling
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns=['Adj Close', 'ts_polarity', 'Volume', 'Pct_change'])
scaled_df


Unnamed: 0_level_0,Adj Close,ts_polarity,Volume,Pct_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-05,0.074357,0.581540,0.364149,0.439185
2016-01-06,0.061069,0.398128,0.467966,0.469968
2016-01-07,0.033362,0.405057,0.571541,0.339284
2016-01-08,0.036754,0.337240,0.487150,0.617441
2016-01-11,0.046932,0.241498,0.314549,0.679982
...,...,...,...,...
2019-08-26,0.845915,0.399015,0.120333,0.698435
2019-08-27,0.829517,0.533615,0.118938,0.519217
2019-08-28,0.839129,0.366667,0.037512,0.625469
2019-08-29,0.863444,0.351727,0.078917,0.685288


In [15]:

# Function to create sequences
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[i:i+seq_length]
        y = data[i+seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# Set sequence length
seq_length = 50

# Create sequences
X, y = create_sequences(scaled_df.values, seq_length)


In [16]:

# Split the data into training and testing sets (80% training, 20% testing)
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [17]:

# Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(seq_length, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train[:, 0], epochs=20, batch_size=32, validation_split=0.2)

# Make predictions
y_pred = model.predict(X_test)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:

# Inverse transform predictions and actual values
y_pred_inv = scaler.inverse_transform(np.concatenate((y_pred, np.zeros((y_pred.shape[0], scaled_df.shape[1]-1))), axis=1))[:, 0]
y_test_inv = scaler.inverse_transform(np.concatenate((y_test[:, 0].reshape(-1, 1), np.zeros((y_test.shape[0], scaled_df.shape[1]-1))), axis=1))[:, 0]

# Calculate evaluation metrics
mse = mean_squared_error(y_test_inv, y_pred_inv)
r2 = r2_score(y_test_inv, y_pred_inv)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Create a DataFrame for plotting
results_df = pd.DataFrame({
    'Date': df.index[-len(y_test):],
    'Actual': y_test_inv,
    'Predicted': y_pred_inv
})

# Plot the results
results_df.hvplot.line(x='Date', y=['Actual', 'Predicted'], title="Actual vs Predicted Stock Prices")


Mean Squared Error: 3.1223511641271906
R^2 Score: 0.8591551242433106


In [19]:
# Get the most recent data from scaled_df
recent_data = scaled_df[-seq_length:].values.reshape(1, seq_length, scaled_df.shape[1])


# Predict future values
future_predictions = model.predict(recent_data)

# Inverse transform predictions
future_predictions_inv = scaler.inverse_transform(np.concatenate((future_predictions, np.zeros((future_predictions.shape[0], scaled_df.shape[1]-1))), axis=1))[:, 0]

# Print the future predictions
print("Future Predictions:")
print(future_predictions_inv)



Future Predictions:
[50.35735857]


In [20]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

# Inverse transform predictions and actual values
y_pred_inv = scaler.inverse_transform(np.concatenate((y_pred, np.zeros((y_pred.shape[0], scaled_df.shape[1]-1))), axis=1))[:, 0]
y_test_inv = scaler.inverse_transform(np.concatenate((y_test[:, 0].reshape(-1, 1), np.zeros((y_test.shape[0], scaled_df.shape[1]-1))), axis=1))[:, 0]

# Calculate evaluation metrics
mse = mean_squared_error(y_test_inv, y_pred_inv)
r2 = r2_score(y_test_inv, y_pred_inv)
mae = mean_absolute_error(y_test_inv, y_pred_inv)
mape = mean_absolute_percentage_error(y_test_inv, y_pred_inv)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")

# Calculate confidence intervals (simple approach using standard deviation)
confidence_interval = 1.96 * np.std(y_pred_inv - y_test_inv) / np.sqrt(len(y_pred_inv))

# Create a DataFrame for plotting
results_df = pd.DataFrame({
    'Date': df.index[-len(y_test):],
    'Actual': y_test_inv,
    'Predicted': y_pred_inv
})

# Plot the results with confidence intervals
fig = go.Figure()

# Add actual values trace
fig.add_trace(go.Scatter(x=results_df['Date'], y=results_df['Actual'], mode='lines', name='Actual'))

# Add predicted values trace
fig.add_trace(go.Scatter(x=results_df['Date'], y=results_df['Predicted'], mode='lines', name='Predicted'))

# Add upper and lower bounds for the confidence interval
fig.add_trace(go.Scatter(
    x=results_df['Date'], 
    y=results_df['Predicted'] + confidence_interval, 
    mode='lines', 
    name='Upper Bound', 
    line=dict(width=0),
    showlegend=False
))

fig.add_trace(go.Scatter(
    x=results_df['Date'], 
    y=results_df['Predicted'] - confidence_interval, 
    mode='lines', 
    name='Lower Bound', 
    fill='tonexty', 
    line=dict(width=0),
    fillcolor='rgba(0,100,80,0.2)',
    showlegend=False
))

# Customize the layout
fig.update_layout(
    title="Actual vs Predicted Stock Prices with Confidence Interval",
    xaxis_title="Date",
    yaxis_title="Price",
    legend_title="Legend",
    hovermode="x unified"
)

fig.show()

# Anomaly detection
results_df['Difference'] = np.abs(results_df['Actual'] - results_df['Predicted'])
anomalies = results_df[results_df['Difference'] > 2 * confidence_interval]

# Summary statistics
print("Summary Statistics:")
print(results_df.describe())

if not anomalies.empty:
    print("Anomalies detected:")
    print(anomalies)
else:
    print("No significant anomalies detected.")


Mean Squared Error: 3.1223511641271906
R^2 Score: 0.8591551242433106
Mean Absolute Error: 1.5313573751483598
Mean Absolute Percentage Error: 0.03324258400488401


Summary Statistics:
                                Date      Actual   Predicted  Difference
count                            175  175.000000  175.000000  175.000000
mean   2019-04-27 20:17:49.714285824   46.021543   45.149528    1.531357
min              2018-12-20 00:00:00   34.780000   37.791266    0.048815
25%              2019-02-25 12:00:00   42.575000   41.912089    0.762823
50%              2019-04-29 00:00:00   47.490000   46.283682    1.539844
75%              2019-06-29 12:00:00   49.975000   48.819345    2.142664
max              2019-08-30 00:00:00   52.640000   50.441405    4.328414
std                              NaN    4.721878    4.015391    0.884174
Anomalies detected:
          Date  Actual  Predicted  Difference
0   2018-12-20   38.36  41.351706    2.991706
1   2018-12-21   36.87  40.838532    3.968532
2   2018-12-24   35.92  40.064912    4.144912
3   2018-12-26   38.45  39.423835    0.973835
4   2018-12-27   38.20  39.213886    1.013886
..         ...     ...     