In [None]:
from tkinter import *
from tkinter import filedialog

# A function to allow the user to select the folder contianing the data.
# Function inputs args: None. 
# Function output 1: The path of that the folder selected by the user. 
def folder_selection_dialog():
    root = Tk()
    root.title('Please select the directory containing the .xlsx files')
    root.filename = filedialog.askdirectory(initialdir="/", title="Select A Folder")
    directory = root.filename
    root.destroy()

    return directory

In [174]:
import os 
import pandas as pd 
import numpy as np 
import re

# A function to extract and condense the relevant data. 
# Function input arg 1: directory (string) --> The directory to the folder containing the .xlsx data.
# Funciton output 1: df --> The pandas dataframe containing the training information.
def get_red_waves(directory):
    
    # Get a list of the .xlsx files. 
    files = [_ for _ in os.listdir(directory) if _.endswith('.xlsx')]

    # Create a list to hold the data. 
    df = []

    # Loop through the individual .xlsx files and extract the 'red' information. 
    for t in range(len(files)):

        # Construct the file path. 
        file_t = os.path.join(directory, files[t])
        
        # Load in the .xlsx data. 
        data = pd.read_excel(file_t, index_col=None)
        
        # Delete all rows which are not of an even timepoint (hours). This is to standardize the time interval.
        indices = 1 - data.iloc[:,2] % 1
        data = data[indices == 1]
        
        # Append the red data, the file name and the binary wave value. 
        data = data.iloc[:,6]
        data = data.values.tolist()
        data.insert(0, files[t])
        
        wave_value = re.search('_(\d)\.', files[t]).group(1)
        data.insert(0, wave_value)
        
        # Aoppend the list of information to our array as a new row. 
        df.append(data)
    
    # Convert our array to a pandas dataframe. 
    df = pd.DataFrame(df)
    df = df.fillna(0)
    
    return df 

SyntaxError: invalid syntax (<ipython-input-174-462de9d454af>, line 34)

In [262]:
import torch
import torch.nn as nn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

# A function to train an LSTM model to recognise repetitive sequence data as having waves or not. 
# Function inputs arg 1: df --> pandas dataframe as provided by the 'get_red_waevs()' function. 
# Function inputs arg 2: save_plot --> True or False. When True, saves plot to the img folder of the package. 
# Function inputs arg 3: display_plot --> True or False. When True, displays plot within conole. 
# Function output 1: 
def LSTM_model(df,
              save_plot = True,
              display_plot = True):

    ##### (1) Load and prepare the data. 
    y = df.iloc[:, 0]
    x = df.iloc[:, 2:len(df.columns)]
    
    # Get the number of timesteps (includign padded data)
    _, num_timesteps = df.shape
    
    # Split the data into testing and training data. 
    x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size = 0.33) # Use random_state = 1234 to create the same split for unit testing. 
    
    # Scale the data by 'removign the mean and scaling to unit variance'.
    x_training = StandardScaler().fit_transform(x_training)
    x_testing = StandardScaler().fit_transform(x_testing)
    
    # Convert the data to tensors. 
    x_training = torch.from_numpy(x_training.astype(np.float32))
    x_testing = torch.from_numpy(x_testing.astype(np.float32))
    
    y_training = np.array(y_training)
    y_training = torch.from_numpy(y_training.astype(np.float32))
    y_testing = np.array(y_testing)
    y_testing = torch.from_numpy(y_testing.astype(np.float32))
    
    ##### (2) Create the LSTM model. 
    
    class LSTM_model(nn.Module):
    
        # Constructor: 
        def __init__(self, input_size, hidden_size, num_layers=2):
            super(LSTM_model, self).__init__()
            self.input_size = input_size # Normally the number of expected features in the input 'x'. Here, it's the maximum length of the time sequences. 
            self.hidden_size = hidden_size # The number of features in the hidden state 'h'. 
            self.num_layers = num_layers # The number of LSTM cells which will be stacked together. 
            self.lstm = nn.LSTM(input_size, 
                                hidden_size, 
                                num_layers, 
                                dropout=0.5,
                                batch_first=True) # This is the LSTM layer. 
            self.linear = nn.Linear(in_features=hidden_size, out_features=1) # This layer will provide our output value. 
            self.sigmoid = nn.Sigmoid() # This is to binarize our outputs. 

        # This method will reset the state of the LSTM. 
        def reset_hidden_state(self): 
            self.hidden = (
            torch.zeros(self.num_layers, self.seq_length, self.hidden_dim),
            torch.zeros(self.num_layers, self.seq_length, self.hidden_dim))
              
        # Define the forward pass. 
        def forward(self, input): 
            h0 = torch.zeros(self.num_layers, self.seq_length, self.hidden_dim)
            c0 = torch.zeros(self.num_layers, self.seq_length, self.hidden_dim)
            lstm_out, = self.lstm(input, (h0, c0))
            y_pred = self.sigmoid(self.linear(lstm_out))
            return y_pred     
    
    #### (3) Create an instance of the model. 
    
    input_size = x_training.shape[1]
    hidden_size = input_size * 2
    model = LSTM_model(input_size, hidden_size)
    
    #### (4) Loss and optimiser. 
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam