In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Price (OHLCV) table.
file_path = '/path/to/ohlcv_data.csv'  # Update this to the actual path
df = pd.read_csv(file_path)

# Sentiment table.
sentiment_file_path = '/path/to/sentiment_data.csv'  # Update this to the actual path
sentiment_df = pd.read_csv(sentiment_file_path)

# Join both tables on their shared 'date' column.
# NOTE(review): the '_x' suffixes used below are what pandas adds to the left
# table's columns when both tables share a non-key column name -- presumably
# the sentiment file also carries OHLCV columns; confirm against the data.
df = df.merge(sentiment_df, on='date')

# Model inputs (price/volume columns plus the three sentiment scores) ...
features = [
    'open_x', 'high_x', 'low_x',
    'volumefrom_x', 'volumeto_x',
    'sentiment_positive', 'sentiment_negative', 'sentiment_neutral',
]
# ... and the value to predict (closing price).
target = ['close_x']

# Min-max scale every model column; the target ends up as the LAST column of
# `df_scaled`, which the dataset and the inverse transform rely on.
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df.loc[:, features + target])

# Define a custom dataset for time series prediction
class StockDataset(Dataset):
    """Sliding-window dataset for next-step prediction.

    Sample ``i`` pairs the feature columns of rows ``i .. i + seq_length - 1``
    with the target value of row ``i + seq_length``. The target is read from
    the last column of ``data``; every other column is treated as a feature.
    """

    def __init__(self, data, seq_length):
        """Store the data matrix and the window length.

        Args:
            data: 2-D array supporting NumPy-style ``data[a:b, :-1]`` indexing,
                one row per time step, target in the last column.
            seq_length: Number of consecutive rows in each input window.

        Raises:
            ValueError: If ``seq_length`` is not positive.
        """
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of complete (window, next-row target) pairs."""
        # Clamp at zero: with fewer rows than `seq_length` the subtraction is
        # negative, and len() raises ValueError for negative results.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(window, target)`` as float32 tensors.

        Returns:
            A tuple of a ``(seq_length, n_features)`` tensor and a ``(1,)``
            tensor holding the target that follows the window.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples  # Python-style negative indexing
        if not 0 <= idx < n_samples:
            # Without this check an out-of-range index would silently yield a
            # truncated window or a target taken from the wrong row.
            raise IndexError(f"index out of range for dataset of size {n_samples}")

        window_end = idx + self.seq_length
        seq_x = self.data[idx:window_end, :-1]  # All features except the target (closing price)
        seq_y = self.data[window_end, -1]  # The target (closing price)
        return (
            torch.tensor(seq_x, dtype=torch.float32),
            torch.tensor([seq_y], dtype=torch.float32),
        )

# Number of consecutive rows fed to the model for each prediction.
seq_length = 10

# No hold-out split: every scaled row is used for training.
train_data = df_scaled
dataset = StockDataset(data=train_data, seq_length=seq_length)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Define the Transformer model
class StockTransformer(nn.Module):
    """Transformer regressor mapping a window of feature rows to one value.

    Each time step is projected to ``d_model`` dimensions, a learned positional
    term is added, the sequence is passed through ``nn.Transformer`` (as both
    source and target, with no masks), and the output at the last time step is
    projected to a single scalar.
    """

    def __init__(self, input_size, d_model, nhead, num_layers, seq_length):
        """Build the layers.

        Args:
            input_size: Number of features per time step.
            d_model: Embedding width used inside the transformer.
            nhead: Number of attention heads.
            num_layers: Number of encoder layers (the decoder depth is left at
                the ``nn.Transformer`` default).
            seq_length: Maximum window length; sizes the positional parameter.
        """
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        # Learned positional term, initialised to zero; shaped
        # (seq_length, 1, d_model) so it broadcasts over the batch dimension.
        self.pos_encoder = nn.Parameter(torch.zeros(seq_length, 1, d_model))
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers)
        self.fc_out = nn.Linear(d_model, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` with
                ``seq_len`` no larger than the configured ``seq_length``.

        Returns:
            Tensor of shape ``(batch, 1)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional-encoding length.
        """
        seq_len = x.size(1)
        max_len = self.pos_encoder.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the configured maximum {max_len}"
            )

        x = self.embedding(x)   # (batch, seq_len, d_model)
        x = x.permute(1, 0, 2)  # (seq_len, batch, d_model): nn.Transformer is sequence-first by default
        # Slice so windows shorter than `seq_length` still line up with the
        # positional term instead of failing to broadcast.
        x = x + self.pos_encoder[:seq_len]
        x = self.transformer(x, x)
        out = self.fc_out(x[-1])  # last time step only -> (batch, 1)
        return out

# Initialize model, loss function, and optimizer
# Model hyper-parameters.
input_size = len(features)  # one input per feature column (OHLCV + sentiments)
d_model = 64                # embedding width inside the transformer
nhead = 8                   # attention heads
num_layers = 4              # encoder layers

# Network, mean-squared-error loss, and Adam optimizer over all model weights.
model = StockTransformer(
    input_size=input_size,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    seq_length=seq_length,
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Function to train the model
def train_model(model, dataloader, epochs, loss_fn=None, optim=None):
    """Train ``model`` and return the mean batch loss of every epoch.

    Args:
        model: Module called as ``model(batch_x)``.
        dataloader: Iterable yielding ``(batch_x, batch_y)`` pairs; it is
            iterated once per epoch.
        epochs: Number of passes over ``dataloader``.
        loss_fn: Callable ``loss_fn(output, batch_y)`` returning a scalar
            tensor. Defaults to the module-level ``criterion``.
        optim: Optimizer exposing ``zero_grad()`` and ``step()``. Defaults to
            the module-level ``optimizer``.

    Returns:
        List with one float per epoch: the average of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    # Fall back to the script-level objects so existing three-argument calls
    # keep working; passing them explicitly avoids the hidden dependency.
    if loss_fn is None:
        loss_fn = criterion
    if optim is None:
        optim = optimizer

    model.train()
    train_losses = []

    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0

        for batch_x, batch_y in dataloader:
            optim.zero_grad()
            loss = loss_fn(model(batch_x), batch_y)
            loss.backward()
            optim.step()
            running_loss += loss.item()
            num_batches += 1

        # Count batches while iterating rather than calling len(): this also
        # works for loaders without a length and gives a clear error instead
        # of a ZeroDivisionError when there is nothing to train on.
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        train_losses.append(epoch_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}')

    return train_losses

# Run the full training loop; keep the per-epoch mean losses for inspection.
epochs = 30
train_losses = train_model(model=model, dataloader=dataloader, epochs=epochs)


def predict_next_10_days(model, last_sequence, future_sentiment_data, n_predictions=10):
    """Forecast ``n_predictions`` steps ahead by rolling the input window forward.

    The window contains feature columns only (no target column), with the
    sentiment columns last. After each prediction the oldest row is dropped
    and a synthetic row for the predicted day is appended:

    * its sentiment columns come from ``future_sentiment_data[step]``;
    * its remaining columns are carried forward from the previous row, because
      future values for them are unknown (naive persistence assumption).

    Observed rows are never modified, so the first prediction is made from
    exactly the data in ``last_sequence``.

    Args:
        model: Module mapping a ``(1, seq_len, n_features)`` float tensor to a
            single-element tensor.
        last_sequence: 2-D array ``(seq_len, n_features)`` of already-scaled
            feature rows; it is not modified.
        future_sentiment_data: 2-D array ``(>= n_predictions, n_sentiment)``
            of sentiment values on the same scale as the window's trailing
            ``n_sentiment`` columns.
        n_predictions: Number of steps to forecast.

    Returns:
        1-D float32 array of length ``n_predictions`` with the model outputs
        (still in scaled units).

    Raises:
        ValueError: If the inputs are not 2-D, there are fewer sentiment rows
            than ``n_predictions``, or more sentiment columns than features.
    """
    window = np.array(last_sequence, dtype=float)  # private copy
    future_sentiment = np.asarray(future_sentiment_data, dtype=float)

    if window.ndim != 2 or future_sentiment.ndim != 2:
        raise ValueError("last_sequence and future_sentiment_data must be 2-D")
    if len(future_sentiment) < n_predictions:
        raise ValueError(
            f"need {n_predictions} rows of future sentiment, got {len(future_sentiment)}"
        )
    n_sentiment = future_sentiment.shape[1]
    if n_sentiment > window.shape[1]:
        raise ValueError("future_sentiment_data has more columns than last_sequence")

    model.eval()
    predictions = []

    with torch.no_grad():
        for step in range(n_predictions):
            batch = torch.tensor(window, dtype=torch.float32).unsqueeze(0)
            predicted_price = model(batch).squeeze().item()
            predictions.append(predicted_price)

            # The predicted close is NOT written into the window: the window
            # has no target column, so its last column is a sentiment score.
            next_row = window[-1].copy()
            if n_sentiment:
                next_row[-n_sentiment:] = future_sentiment[step]
            window = np.vstack([window[1:], next_row])

    return np.array(predictions, dtype=np.float32)

# Seed window for the forecast: the most recent `seq_length` scaled rows with
# the target (last) column dropped.
last_10_days_data = df_scaled[-seq_length:, :-1]

sentiment_columns = ['sentiment_positive', 'sentiment_negative', 'sentiment_neutral']

# NOTE(review): the last rows of the historical sentiment table are used as a
# stand-in for future sentiment -- replace with real forecasts if available.
raw_future_sentiment = sentiment_df[sentiment_columns].tail(10).to_numpy(dtype=float)

# The model was trained on min-max scaled inputs, so the sentiment fed to it
# must be scaled the same way. MinMaxScaler transforms each column as
# X * scale_ + min_; `scaler` was fitted on `features + target`, so a column's
# position in `features` is also its position in the scaler's parameters.
sentiment_positions = [features.index(column) for column in sentiment_columns]
future_sentiment_data = (
    raw_future_sentiment * scaler.scale_[sentiment_positions]
    + scaler.min_[sentiment_positions]
)

# Forecast one step per available sentiment row (10 unless the table is shorter).
n_future_days = len(future_sentiment_data)
predicted_prices = predict_next_10_days(
    model, last_10_days_data, future_sentiment_data, n_predictions=n_future_days
)

# The scaler expects every column it was fitted on, so place the predictions
# in the target (last) column of a zero-filled array and invert that.
dummy_array = np.zeros((predicted_prices.shape[0], len(features) + 1))
dummy_array[:, -1] = predicted_prices

predicted_prices_rescaled = scaler.inverse_transform(dummy_array)[:, -1]  # Get the closing prices only

predicted_df = pd.DataFrame({
    'day': np.arange(1, n_future_days + 1),
    'predicted_closing_price': predicted_prices_rescaled,
})
predicted_df.to_csv('predicted_prices.csv', index=False)

# Plot the forecast against day indices that continue after the training rows.
plt.figure(figsize=(10, 6))
days = np.arange(len(df_scaled) + 1, len(df_scaled) + n_future_days + 1)
plt.plot(days, predicted_prices_rescaled, label='Predicted')
plt.xlabel('Days')
plt.ylabel('Closing Price')
plt.title('Predicted Stock Prices for Next 10 Days')
plt.legend()
plt.show()

print(predicted_df)
