In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import glob

# Define the features and target columns
features = ['time']
# , 'x0', 'y0', 'z0', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2', 'x3', 'y3', 'z3', 
            # 'amp0', 'phase0', 'amp1', 'phase1', 'amp2', 'phase2', 'amp3', 'phase3'

# Define dropping_status columns
dropping_status_columns = ['drop_0', 'drop_1', 'drop_2', 'drop_3']

# Define the combined features list
all_features = features + dropping_status_columns

# File pattern to match different shots (feature files)
feature_file_pattern = 'C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813/4a-240702-gspat-crossing-0.50v-1*_pre_process_df2.csv'
feature_files = glob.glob(feature_file_pattern)

# Corresponding label file pattern
label_file_pattern = 'C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813/4a-240702-gspat-crossing-0.50v-1*_pre_process_df_labels.csv'
label_files = glob.glob(label_file_pattern)

# Initialize the scaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Function to preprocess and create sequences for each shot
def preprocess_and_create_sequences(feature_file, label_file, time_steps=10):
    print(f"Processing: {feature_file}")
    
    # Read the feature data
    data = pd.read_csv(feature_file)
    
    # Read the corresponding label data
    labels = pd.read_csv(label_file)
    
    # Combine feature data with dropping status columns
    data_combined = pd.concat([data, labels[dropping_status_columns]], axis=1)

    # Handle missing values after combining the data
    data_combined.interpolate(method='linear', axis=0, inplace=True)
    data_combined.fillna(method='bfill', inplace=True)
    data_combined.fillna(method='ffill', inplace=True)

    # Scale the features including the dropping status columns
    scaled_data = scaler.fit_transform(data_combined[all_features])

    # Create sequences for time series data
    X, y = create_dataset(scaled_data, time_steps)
    return X, y, data_combined

# Convert the data to a supervised learning problem
def create_dataset(data, time_steps=1):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps), :])
        y.append(data[i + time_steps, :])  # Predicting the next time step
    return np.array(X), np.array(y)

# Define the number of time steps
time_steps = 10

# Prepare lists to hold model inputs and outputs
X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []

# Split files into training and testing sets (80% training, 20% testing)
train_files = feature_files[:int(len(feature_files) * 0.8)]
test_files = feature_files[int(len(feature_files) * 0.8):]
train_label_files = label_files[:int(len(label_files) * 0.8)]
test_label_files = label_files[int(len(label_files) * 0.8):]

# Process each training file separately
for feature_file, label_file in zip(train_files, train_label_files):
    X, y, _ = preprocess_and_create_sequences(feature_file, label_file, time_steps)
    X_train_list.append(X)
    y_train_list.append(y)

# Process each testing file separately
for feature_file, label_file in zip(test_files, test_label_files):
    X, y, original_data_combined = preprocess_and_create_sequences(feature_file, label_file, time_steps)
    X_test_list.append(X)
    y_test_list.append(y)

# Train the model on each shot separately.
for i, (X_train, y_train) in enumerate(zip(X_train_list, y_train_list)):
    print(f"Training on shot {i + 1}/{len(X_train_list)}")
    
    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(time_steps, len(all_features))))
    model.add(LSTM(50))
    model.add(Dense(len(all_features)))  # Output layer predicting the same number of features

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model on the current shot's data
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Predict back the data with dropping_status_columns set to 0
for i, (X_test, y_test) in enumerate(zip(X_test_list, y_test_list)):
    print(f"Predicting back on shot {i + 1}/{len(X_test_list)}")
    
    # Set all dropping status columns to 0
    X_test[:, :, -len(dropping_status_columns):] = 0

    # Predict sequentially for each time step
    predicted_sequences = []
    current_input = X_test[0]  # Start with the first time step

    for _ in range(len(X_test)):
        predicted_output = model.predict(current_input[np.newaxis, :, :])
        predicted_sequences.append(predicted_output[0])

        # Shift the window by one time step and add the predicted values to the end
        current_input = np.roll(current_input, -1, axis=0)
        current_input[-1] = predicted_output[0]

    predicted_sequences = np.array(predicted_sequences)

    # Check the shape of the predicted_sequences
    print(f"Shape of predicted_sequences: {predicted_sequences.shape}")

    # Ensure that the shape matches the expected output before inverse transforming
    if predicted_sequences.shape[-1] != len(all_features):
        raise ValueError(f"Unexpected shape for predicted sequences: {predicted_sequences.shape}")

    # Inverse transform the predictions to the original scale
    scaled_predictions = np.concatenate((predicted_sequences, np.zeros((predicted_sequences.shape[0], len(all_features) - predicted_sequences.shape[-1]))), axis=1)
    predictions = scaler.inverse_transform(scaled_predictions)[:, :len(all_features)]

    # Save predictions to CSV
    output_df = pd.DataFrame(predictions, columns=all_features)
    
    # Separate out the original feature columns
    output_df_1 = output_df[features]

    # Apply the clipping operation on the dropping status columns
    output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)
    
    output_df_1.to_csv(f'back_predictions_shot_df_{i + 1}.csv', index=False)
    output_df_2.to_csv(f'back_predictions_shot_df2_{i + 1}.csv', index=False)

    print(f"Back prediction for shot {i + 1} completed.")



Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_001_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_002_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_003_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_004_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_005_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_006_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_007_c_pre_process_df2.csv
Processing: C:/Users/weicheng/Desktop/formal_dataset/p4_x_r1_240813\4a-240702-gspat-crossing-0.50v-1_008_c_pre

  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace=True)
  data_combined.fillna(method='ffill', inplace=True)
  data_combined.fillna(method='bfill', inplace



Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch

  output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)


Shape of predicted_sequences: (33, 5)
Back prediction for shot 2 completed.
Predicting back on shot 3/5

  output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)


Shape of predicted_sequences: (33, 5)
Back prediction for shot 3 completed.
Predicting back on shot 4/5

  output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)


Shape of predicted_sequences: (33, 5)
Back prediction for shot 4 completed.
Predicting back on shot 5/5

  output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)


Shape of predicted_sequences: (33, 5)
Back prediction for shot 5 completed.


  output_df_2 = output_df[dropping_status_columns].applymap(lambda x: 1 if abs(x) > 0.5 else 0)
