In [23]:
import numpy as np
import pandas as pd
import os

# Load each .npy file and convert the list of dictionaries to a DataFrame
dataframes = []
for file in os.listdir('dataset'):
    if file.endswith('.npy'):
        data = np.load('dataset/' + file, allow_pickle=True)
        dataframes.append(pd.DataFrame.from_records(data))

# Concatenate all DataFrames into a single DataFrame
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df

Unnamed: 0,Word,Frame,X00,Y00,Z00,X01,Y01,Z01,X02,Y02,...,Z62,X63,Y63,Z63,X64,Y64,Z64,X65,Y65,Z65
0,Boy,1,-0.290488,-75.818914,-3.229877,-0.166410,-77.328566,-3.061451,-0.098528,-82.594196,...,-0.349284,0.610715,0.973082,-0.303378,0.405078,0.922159,-0.281479,0.559808,0.851646,-0.028510
1,Boy,2,-2.630732,163.577346,-2.202535,-2.630732,163.577346,-2.202535,-2.630732,163.577346,...,-0.346683,0.613981,0.946346,-0.395646,0.413621,0.845886,-0.267950,0.558445,0.851427,-0.036818
2,Boy,3,-2.649066,62.903229,-2.865200,-2.649066,62.903229,-2.865200,-2.649066,62.903229,...,-0.820756,0.612683,0.950004,-0.379825,0.448817,0.581546,-0.748774,0.557367,0.867107,-0.040340
3,Boy,4,-2.641445,73.387394,-6.816939,-2.641445,73.387394,-6.816939,-2.641445,73.387394,...,-0.892051,0.607402,0.961182,-0.300694,0.439404,0.449596,-0.823084,0.556838,0.878659,-0.022312
4,Boy,5,0.075973,-2.649841,-1.005892,0.190630,-5.362860,-1.022167,0.243538,-8.214614,...,-0.829807,0.604378,0.950633,-0.247957,0.445261,0.344964,-0.765876,0.552638,0.877294,-0.045415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,You,16,-0.255172,-14.686667,2.176972,-0.137746,-15.407824,3.206680,-0.088495,-16.454638,...,-0.189617,0.678464,0.430918,-0.998512,0.307135,0.832112,-0.110380,0.559508,0.746648,0.006331
14136,You,17,-0.247349,-18.125843,0.387121,-0.123823,-19.070908,-0.996675,-0.073921,-20.329048,...,-0.235708,0.673772,0.673639,-0.173289,0.301638,0.830727,-0.135007,0.558916,0.736754,0.016739
14137,You,18,-0.252560,-28.690787,5.913587,-0.133411,-30.150542,5.626362,-0.083474,-32.168021,...,-0.643596,0.653561,0.818583,-0.402061,0.293410,0.830436,-0.519518,0.550947,0.735413,0.027168
14138,You,19,-0.263622,-44.649560,6.507309,-0.145083,-46.797549,6.096631,-0.096428,-49.825290,...,-0.573453,0.627514,0.836671,-0.288298,0.281208,0.831862,-0.455801,0.543025,0.740520,0.026633


In [24]:
max_sequence_length = merged_df['Frame'].max()
max_sequence_length

20

In [25]:
def pad_sequences(group):
    # Calculate the number of padding rows needed
    padding_rows = max_sequence_length - len(group)
    
    # Create a DataFrame with padding rows filled with NaN (or any other padding value)
    padding_df = pd.DataFrame({
        'Word': [group['Word'].iloc[0]] * padding_rows,
        'Frame': np.arange(len(group) + 1, max_sequence_length + 1),
    })
    
    # Concatenate the original group with the padding DataFrame
    return pd.concat([group, padding_df], ignore_index=True)

# Group the DataFrame by 'Word' and apply the padding function
padded_df = merged_df.groupby('Word').apply(pad_sequences).reset_index(drop=True)
padded_df

Unnamed: 0,Word,Frame,X00,Y00,Z00,X01,Y01,Z01,X02,Y02,...,Z62,X63,Y63,Z63,X64,Y64,Z64,X65,Y65,Z65
0,Boy,1,-0.290488,-75.818914,-3.229877,-0.166410,-77.328566,-3.061451,-0.098528,-82.594196,...,-0.349284,0.610715,0.973082,-0.303378,0.405078,0.922159,-0.281479,0.559808,0.851646,-0.028510
1,Boy,2,-2.630732,163.577346,-2.202535,-2.630732,163.577346,-2.202535,-2.630732,163.577346,...,-0.346683,0.613981,0.946346,-0.395646,0.413621,0.845886,-0.267950,0.558445,0.851427,-0.036818
2,Boy,3,-2.649066,62.903229,-2.865200,-2.649066,62.903229,-2.865200,-2.649066,62.903229,...,-0.820756,0.612683,0.950004,-0.379825,0.448817,0.581546,-0.748774,0.557367,0.867107,-0.040340
3,Boy,4,-2.641445,73.387394,-6.816939,-2.641445,73.387394,-6.816939,-2.641445,73.387394,...,-0.892051,0.607402,0.961182,-0.300694,0.439404,0.449596,-0.823084,0.556838,0.878659,-0.022312
4,Boy,5,0.075973,-2.649841,-1.005892,0.190630,-5.362860,-1.022167,0.243538,-8.214614,...,-0.829807,0.604378,0.950633,-0.247957,0.445261,0.344964,-0.765876,0.552638,0.877294,-0.045415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,You,16,-0.255172,-14.686667,2.176972,-0.137746,-15.407824,3.206680,-0.088495,-16.454638,...,-0.189617,0.678464,0.430918,-0.998512,0.307135,0.832112,-0.110380,0.559508,0.746648,0.006331
14136,You,17,-0.247349,-18.125843,0.387121,-0.123823,-19.070908,-0.996675,-0.073921,-20.329048,...,-0.235708,0.673772,0.673639,-0.173289,0.301638,0.830727,-0.135007,0.558916,0.736754,0.016739
14137,You,18,-0.252560,-28.690787,5.913587,-0.133411,-30.150542,5.626362,-0.083474,-32.168021,...,-0.643596,0.653561,0.818583,-0.402061,0.293410,0.830436,-0.519518,0.550947,0.735413,0.027168
14138,You,19,-0.263622,-44.649560,6.507309,-0.145083,-46.797549,6.096631,-0.096428,-49.825290,...,-0.573453,0.627514,0.836671,-0.288298,0.281208,0.831862,-0.455801,0.543025,0.740520,0.026633


In [26]:
from sklearn.preprocessing import MinMaxScaler

data = padded_df.drop('Word', axis=1)
target = padded_df['Word']

# Normalize data
scaler = MinMaxScaler(feature_range=(0, 1))

features_scaled = scaler.fit_transform(data)

import pickle
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

###Random Forest

In [27]:
# # Define the interval
# interval = 20

# # Initialize lists to hold the training and testing sets
# X_train_list = []
# y_train_list = []
# X_test_list = []
# y_test_list = []

# # Iterate over features_scaled with a step of 20
# for i in range(0, len(padded_df) - interval, 3*interval):
#     # Slice the dataset for the current interval
#     X_train = pd.DataFrame(features_scaled[i:i+2*interval])
#     y_train = pd.DataFrame(target[i:i+2*interval])

#     X_test = pd.DataFrame(features_scaled[i+2*interval:i+3*interval])
#     y_test = pd.DataFrame(target[i+2*interval:i+3*interval])

#     # Append the sliced DataFrames to the lists
#     X_train_list.append(X_train)
#     y_train_list.append(y_train)
#     X_test_list.append(X_test)
#     y_test_list.append(y_test)

# # Convert the lists to DataFrames
# X_train_df = pd.concat(X_train_list)
# y_train_df = pd.concat(y_train_list)
# X_test_df = pd.concat(X_test_list)
# y_test_df = pd.concat(y_test_list)

# X_train_df

In [28]:
# from sklearn.ensemble import RandomForestClassifier

# # Initialize the model
# clf = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the model
# clf.fit(X_train_df, y_train_df)

In [29]:
# from sklearn.metrics import classification_report

# # Make predictions on the test set
# y_pred = clf.predict(X_test_df)

# # Evaluate the model
# print(classification_report(y_test_df, y_pred))

In [30]:
# import pickle

# with open('rfmodel.pkl', 'wb') as file:
#     pickle.dump(clf, file)


LSTM

In [31]:
# Define the number of time steps
time_steps = 20

# Prepare the data
X = []
y = []

for i in range(0, len(features_scaled) - time_steps, time_steps):
    X.append(features_scaled[i:i + time_steps])
    y.append(target[i])

# Convert to NumPy arrays
X = np.array(X)
y = np.array(y)

# Reshape X to fit the LSTM input shape: [samples, time steps, features]
X = X.reshape((X.shape[0], X.shape[1], X.shape[2]))
len(X)

706

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'y_train' is your target variable with categorical values
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
num_classes = len(encoder.classes_)

model = Sequential()
model.add(LSTM(50, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(num_classes, activation='softmax')) # num_classes is the number of unique words you're predicting

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [42]:
history = model.fit(X_train, y_train_encoded, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [43]:
test_loss, test_accuracy = model.evaluate(X_test, y_test_encoded)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


Test Loss: 0.5596828460693359, Test Accuracy: 0.8239436745643616


In [44]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test) # Replace `model` with your trained LSTM model and `X_test` with your test dataset
y_pred_labels = np.argmax(y_pred, axis=1)
# Print the classification report
print(classification_report(y_test_encoded, y_pred_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.76      0.87        17
           2       0.86      0.60      0.71        10
           3       0.83      0.83      0.83         6
           4       0.00      0.00      0.00        10
           5       1.00      1.00      1.00         9
           6       0.67      0.89      0.76         9
           7       1.00      1.00      1.00        14
           8       1.00      1.00      1.00         6
           9       0.43      0.60      0.50         5
          10       0.75      0.75      0.75         4
          11       0.20      0.67      0.31         3
          12       1.00      0.89      0.94         9
          13       1.00      1.00      1.00         3
          14       0.86      1.00      0.92         6
          15       1.00      1.00      1.00         9
          16       0.75      1.00      0.86        12

    accuracy              

In [38]:
# Save the model as pickle file
with open('lstmmodel.pkl', 'wb') as file:
    pickle.dump(model, file)