In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split

# 1. Load & sort your data
df = pd.read_csv('../data/master_dataset_cleaned.csv', parse_dates=['date'])
df.sort_values('date', inplace=True)
df.set_index('date', inplace=True)

# 2. Specify the sentiment columns and bootstrap-missing with column means
sent_cols = ['neg', 'neu', 'pos', 'compound']
col_means = df[sent_cols].mean()
df_init = df[sent_cols].fillna(col_means)

# 3. Scale into [0,1] for the LSTM
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(df_init)

# 4. Build sequences (look_back days → predict next day)
def create_sequences(arr, look_back=7):
    X, y = [], []
    for i in range(look_back, len(arr)):
        X.append(arr[i-look_back:i])
        y.append(arr[i])
    return np.array(X), np.array(y)

look_back = 7
X, y = create_sequences(data_scaled, look_back)

# 5. Train/test split
dataset = TensorDataset(torch.tensor(X, dtype=torch.float32),
                        torch.tensor(y, dtype=torch.float32))
train_size = int(len(dataset) * 0.8)
train_ds, test_ds = random_split(dataset, [train_size, len(dataset)-train_size])
train_loader = DataLoader(train_ds, batch_size=16, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=16)

# 6. Define your LSTM imputer
class ImputerLSTM(nn.Module):
    def __init__(self, n_features, hidden_size=32):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden_size, batch_first=True)
        self.out  = nn.Linear(hidden_size, n_features)
    def forward(self, x):
        # x: [batch, seq_len, features]
        _, (h_n, _) = self.lstm(x)
        return self.out(h_n[-1])  # use last layer's final hidden

model = ImputerLSTM(n_features=len(sent_cols), hidden_size=32)

# 7. Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
best_val = float('inf')
patience = 5
wait = 0
best_state = None

# 8. Train with early stopping
for epoch in range(1, 51):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    # validation
    model.eval()
    with torch.no_grad():
        val_losses = [criterion(model(xb), yb).item()
                      for xb, yb in test_loader]
    val_loss = np.mean(val_losses)

    if val_loss < best_val:
        best_val = val_loss
        best_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Stopping early at epoch {epoch}")
            break

# restore best
model.load_state_dict(best_state)
model.eval()

# 9. Impute missing rows
missing_dates = df[df[sent_cols].isnull().any(axis=1)].index
all_scaled = torch.tensor(data_scaled, dtype=torch.float32)

for dt in missing_dates:
    idx = df.index.get_loc(dt)
    if idx >= look_back:
        seq = all_scaled[idx-look_back:idx].unsqueeze(0)  # [1, look_back, features]
        with torch.no_grad():
            pred_scaled = model(seq).numpy()
        df.loc[dt, sent_cols] = scaler.inverse_transform(pred_scaled)[0]

# 10. Check results
print("Remaining NaNs per column:\n", df[sent_cols].isna().sum())
print("\nFirst few imputed rows:")
print(df.loc[missing_dates[:5], sent_cols])

df.to_csv('../data/master_dataset_cleaned.csv', index=True)



Stopping early at epoch 7
Remaining NaNs per column:
 neg         0
neu         0
pos         0
compound    0
dtype: int64

First few imputed rows:
                 neg       neu       pos  compound
date                                              
2021-03-23  0.039350  0.843292  0.109383  0.156574
2021-03-24  0.039540  0.841961  0.109816  0.152279
2021-03-26  0.043022  0.833416  0.104883  0.104021
2021-03-31  0.042791  0.818499  0.099918  0.067160
2021-04-01  0.039064  0.820712  0.104089  0.091913
