In [11]:
import numpy as np
import pandas as pd
import tabulate
file_path = "/Data/MI_PHYSICS/Double-Pendulum-Simulation/test1/pendulum_data3600000.npz"
print("loading data...")
loaded_data = np.load(file_path)
print("reading data...")

# df = pd.DataFrame({key: loaded_data[key] for key in loaded_data.keys()})
# # Printing the first few rows of the DataFrame
# print(df.head())

# # Displaying the DataFrame's info
# df.info()


loading data...
reading data...


In [12]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import multiprocessing as mp

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load NPZ dataset
print("Loading NPZ dataset...")
npz_data = np.load(file_path)

# Display available keys in NPZ file
print("Available keys in NPZ file:")
for key in npz_data.files:
    print(f" - {key}: {npz_data[key].shape}")

Using device: cuda
Loading NPZ dataset...
Available keys in NPZ file:
 - Time: (3600000001,)
 - Theta1: (3600000001,)
 - Theta2: (3600000001,)
 - X1: (3600000001,)
 - Y1: (3600000001,)
 - X2: (3600000001,)
 - Y2: (3600000001,)
 - Energy: (3600000001,)


In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

def process_chunk(key_array_tuple):
    key, array = key_array_tuple
    
    # Handle empty or 1D arrays
    if array.ndim == 1:
        array = array.reshape(-1, 1)  # Convert (N,) to (N,1)
    elif array.size == 0:
        return pd.DataFrame()  # Return an empty DataFrame

    return pd.DataFrame(array, columns=[f"{key}_{i}" for i in range(array.shape[1])])

def npz_to_dataframe_parallel(npz_file_path):
    data = np.load(npz_file_path)
    num_cores = cpu_count()
    
    print(f"Processing {len(data.files)} arrays with {num_cores} CPU cores...")
    
    with Pool(num_cores) as pool:
        results = list(tqdm(pool.imap(process_chunk, data.items()), total=len(data), desc="Converting NPZ to DataFrame"))
    
    # Filter out any empty DataFrames
    valid_dataframes = [df for df in results if not df.empty]
    
    if not valid_dataframes:
        raise ValueError("All arrays resulted in empty DataFrames. Check your NPZ file structure.")
    
    df = pd.concat(valid_dataframes, axis=1)
    print("Data successfully loaded into DataFrame.")
    return df

# Debugging: Check the contents of the NPZ file
data = np.load(file_path)

# for key in data.files:
#     print(f"{key}: shape {data[key].shape}")

# Load NPZ file into DataFrame
df = npz_to_dataframe_parallel(file_path)


Time: shape (3600000001,)
Theta1: shape (3600000001,)
Theta2: shape (3600000001,)
X1: shape (3600000001,)
Y1: shape (3600000001,)
X2: shape (3600000001,)
Y2: shape (3600000001,)
Energy: shape (3600000001,)
Processing 8 arrays with 48 CPU cores...


Converting NPZ to DataFrame: 100%|██████████| 8/8 [32:08<00:00, 241.00s/it]


Data successfully loaded into DataFrame.


In [None]:

# Normalize Data (excluding Time)
print("Normalizing data...")
scaler = MinMaxScaler()
df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])

# Define sequence length
SEQ_LEN = 100  # Number of timesteps to look back

# Function to create sequences
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

def process_sequence(args):
    """Extracts a sequence and target value."""
    data, target_col, seq_length, i = args
    return (data.iloc[i:i + seq_length, 1:].values, data.iloc[i + seq_length][target_col])

def create_sequences(data, target_col, seq_length):
    print("Creating sequences using multiprocessing...")

    # Prepare arguments for multiprocessing
    args_list = [(data, target_col, seq_length, i) for i in range(len(data) - seq_length)]

    # Use multiprocessing Pool to parallelize sequence creation
    with Pool(processes=cpu_count()) as pool:
        results = list(tqdm(pool.imap(process_sequence, args_list), total=len(args_list)))

    X, y = zip(*results)
    return np.array(X), np.array(y)


# Prepare sequences
X, y = create_sequences(df, "Energy", SEQ_LEN)

# Convert to PyTorch tensors and move to GPU
print("Splitting dataset into train/test...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("Converting data to PyTorch tensors...")
X_train, X_test = torch.tensor(X_train, dtype=torch.float32).to(device), torch.tensor(X_test, dtype=torch.float32).to(device)
y_train, y_test = torch.tensor(y_train, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.float32).to(device)

# Reshape target tensor
y_train, y_test = y_train.view(-1, 1), y_test.view(-1, 1)

# Define LSTM Model using PyTorch
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True).to(device)
        self.fc = nn.Linear(hidden_dim, output_dim).to(device)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Take last time step output
        return out

# Hyperparameters
input_dim = X_train.shape[2]  # Number of features
hidden_dim = 50
num_layers = 2
output_dim = 1
batch_size = 64
epochs = 10
learning_rate = 0.001

# Initialize Model
print("Initializing LSTM model...")
model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim).to(device)

# Define Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Data Loader for Batch Training
train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=False)

# Training Loop
print("Starting training...")
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    
    for batch_X, batch_y in progress_bar:
        optimizer.zero_grad()
        y_pred = model(batch_X)
        loss = criterion(y_pred, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {epoch_loss/len(train_loader):.6f}")

# Evaluation
print("Evaluating model...")
model.eval()
with torch.no_grad():
    y_pred = model(X_test)

# Compute Test Loss
test_loss = criterion(y_pred, y_test).item()
print(f"Test Loss: {test_loss:.6f}")

# Move predictions back to CPU and convert to NumPy
y_pred = y_pred.cpu().numpy()
y_test = y_test.cpu().numpy()
print("Prediction complete.")


Normalizing data...
Creating sequences using multiprocessing...


In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import multiprocessing as mp

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load NPZ dataset
print("Loading NPZ dataset...")
npz_data = np.load(file_path)

# Display available keys in NPZ file
print("Available keys in NPZ file:")
for key in npz_data.files:
    print(f" - {key}: {npz_data[key].shape}")

# Convert to Pandas DataFrame
print("Converting NPZ data to DataFrame...")
df = pd.DataFrame({key: npz_data[key] for key in npz_data.keys()})

# Normalize Data (excluding Time)
print("Normalizing data...")
scaler = MinMaxScaler()
df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])

# Define sequence length
SEQ_LEN = 100  # Number of timesteps to look back

# Function to create sequences
def create_sequences(data, target_col, seq_length):
    X, y = [], []
    print("Creating sequences...")
    for i in tqdm(range(len(data) - seq_length)):
        X.append(data.iloc[i:i + seq_length, 1:].values)  # Use all columns except 'Time'
        y.append(data.iloc[i + seq_length][target_col])  # Predicting target column
    return np.array(X), np.array(y)

# Prepare sequences
X, y = create_sequences(df, "Energy", SEQ_LEN)

# Convert to PyTorch tensors and move to GPU
print("Splitting dataset into train/test...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("Converting data to PyTorch tensors...")
X_train, X_test = torch.tensor(X_train, dtype=torch.float32).to(device), torch.tensor(X_test, dtype=torch.float32).to(device)
y_train, y_test = torch.tensor(y_train, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.float32).to(device)

# Reshape target tensor
y_train, y_test = y_train.view(-1, 1), y_test.view(-1, 1)

# Define LSTM Model using PyTorch
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True).to(device)
        self.fc = nn.Linear(hidden_dim, output_dim).to(device)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Take last time step output
        return out

# Hyperparameters
input_dim = X_train.shape[2]  # Number of features
hidden_dim = 50
num_layers = 2
output_dim = 1
batch_size = 64
epochs = 10
learning_rate = 0.001

# Initialize Model
print("Initializing LSTM model...")
model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim).to(device)

# Define Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Data Loader for Batch Training
train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=False)

# Training Loop
print("Starting training...")
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    
    for batch_X, batch_y in progress_bar:
        optimizer.zero_grad()
        y_pred = model(batch_X)
        loss = criterion(y_pred, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {epoch_loss/len(train_loader):.6f}")

# Evaluation
print("Evaluating model...")
model.eval()
with torch.no_grad():
    y_pred = model(X_test)

# Compute Test Loss
test_loss = criterion(y_pred, y_test).item()
print(f"Test Loss: {test_loss:.6f}")

# Move predictions back to CPU and convert to NumPy
y_pred = y_pred.cpu().numpy()
y_test = y_test.cpu().numpy()
print("Prediction complete.")


Using device: cuda
Loading NPZ dataset...
Available keys in NPZ file:
 - Time: (3600000001,)
 - Theta1: (3600000001,)
 - Theta2: (3600000001,)
 - X1: (3600000001,)
 - Y1: (3600000001,)
 - X2: (3600000001,)
 - Y2: (3600000001,)
 - Energy: (3600000001,)
Converting NPZ data to DataFrame...
Normalizing data...
Creating sequences...


  1%|          | 37652593/3599999901 [59:45<94:14:26, 10500.13it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

# Ensure y_test and y_pred are NumPy arrays
y_test_np = np.array(y_test)  # No need for .cpu()
y_pred_np = np.array(y_pred)  # No need for .cpu()

# Create a time axis for visualization
time_axis = np.arange(len(y_test_np))

# Plot actual vs. predicted values
plt.figure(figsize=(12, 6))
plt.plot(time_axis, y_test_np, label="Actual Energy", color="blue", linewidth=2)
plt.plot(time_axis, y_pred_np, label="Predicted Energy", color="red", linestyle="dashed", linewidth=2)
plt.xlabel("Time Step")
plt.ylabel("Energy")
plt.title("LSTM Model Predictions vs. Actual Energy Values")
plt.legend()
plt.show()


plt.figure(figsize=(10, 5))
plt.plot(time_axis, y_test_np - y_pred_np, label="Residuals (Actual - Predicted)", color="purple")
plt.axhline(0, color="black", linestyle="dashed")
plt.xlabel("Time Step")
plt.ylabel("Prediction Error")
plt.title("Prediction Residuals")
plt.legend()
plt.show()


window_size = 50
rolling_actual = pd.Series(y_test_np.flatten()).rolling(window=window_size).mean()
rolling_pred = pd.Series(y_pred_np.flatten()).rolling(window=window_size).mean()

plt.figure(figsize=(12, 6))
plt.plot(time_axis, rolling_actual, label="Rolling Mean Actual", color="blue")
plt.plot(time_axis, rolling_pred, label="Rolling Mean Predicted", color="red", linestyle="dashed")
plt.xlabel("Time Step")
plt.ylabel("Energy")
plt.title("Smoothed LSTM Predictions vs. Actual")
plt.legend()
plt.show()

