In [3]:
#imports!
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from scipy.stats import zscore
from google.colab import files

# Descaling functions for Barrett's custom normalization
def descale_lat(scaled_lat):
    """Undo the custom latitude normalization.

    Multiplies the normalized value by 10 and then adds 14. The input can be
    a plain number or any object supporting ``*`` and ``+`` with integers
    (the script passes a DataFrame column).
    """
    lat_factor = 10
    lat_offset = 14
    return scaled_lat * lat_factor + lat_offset

def descale_lon(scaled_lon):
    """Undo the custom longitude normalization.

    Multiplies the normalized value by 10 and then adds 120. The input can be
    a plain number or any object supporting ``*`` and ``+`` with integers
    (the script passes a DataFrame column).
    """
    lon_factor = 10
    lon_offset = 120
    return scaled_lon * lon_factor + lon_offset

# Functions for anomaly detection and distance calculations
def calculate_distance(lat, lon):
    """Return the Euclidean norm of the (lat, lon) pair.

    This is the straight-line distance of the point from (0, 0) in whatever
    units the inputs use; it is not a distance between two points. Works
    element-wise on array-like inputs.
    """
    sum_of_squares = lat ** 2 + lon ** 2
    return np.sqrt(sum_of_squares)

def classify_anomaly(z):
    """Translate a z-score into an anomaly label.

    Returns 'High' when |z| >= 3, 'Medium' when |z| >= 2, and 'Low' for
    everything else (including NaN, which fails both comparisons).
    """
    magnitude = abs(z)
    # Checked from the strictest threshold down; the first match wins.
    for threshold, label in ((3, 'High'), (2, 'Medium')):
        if magnitude >= threshold:
            return label
    return 'Low'

# Upload and process user's data file
uploaded = files.upload()
# The result is used as a mapping keyed by file name. If nothing was uploaded
# it has no keys, so stop with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError('No file was uploaded; please select a CSV file.')
file_name = next(iter(uploaded))
data = pd.read_csv(file_name)

# Fail fast: every column the rest of the script reads must be present,
# otherwise the error would only surface after the model has been trained.
required_columns = ['AvgLat', 'AvgLon', 'AvgHead', 'Size', 'IntraClusterDistance']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f'Uploaded file is missing required columns: {missing_columns}')

# Replace the custom-normalized AvgLat / AvgLon values with the actual
# coordinate values.
data['AvgLat'] = descale_lat(data['AvgLat'])
data['AvgLon'] = descale_lon(data['AvgLon'])

# Normalizing using MinMaxScaler
# The model reads and predicts the same three columns, so name them once and
# rescale them in place with a scaler that is kept for the inverse transform.
features = ['AvgLat', 'AvgLon', 'AvgHead']
target = ['AvgLat', 'AvgLon', 'AvgHead']
scaler = MinMaxScaler()
data[features] = scaler.fit_transform(data[features])

# X carries a singleton middle axis: (rows, 1, number of features).
# y stays two-dimensional: (rows, number of targets).
X = np.array(data[features])[:, np.newaxis, :]
y = np.array(data[target])

# Creating and compiling the LSTM model
# The input shape is declared with an explicit Input layer instead of the
# `input_shape` keyword on the first LSTM; Keras warns against passing
# `input_shape` to a layer inside a Sequential model.
model = Sequential([
    tf.keras.Input(shape=(X.shape[1], X.shape[2])),
    LSTM(128, return_sequences=True),  # full sequence is fed to the next LSTM
    LSTM(64),
    Dense(len(target)),  # one output per target column
])
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)
# 20% of the samples are reserved for validation during training.
history = model.fit(X, y, epochs=50, validation_split=0.2, verbose=1)

# Predicting values
# Predictions are made for every row of X, i.e. the same samples that were
# passed to model.fit above.
y_pred = model.predict(X)

# Error metrics are computed on the scaled values, before any inverse transform.
mse, mae = mean_squared_error(y, y_pred), mean_absolute_error(y, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')

# Undo the MinMax scaling so predictions and actuals are coordinate values again
y_pred_inverse = scaler.inverse_transform(y_pred)
y_actual_inverse = scaler.inverse_transform(y)

# Mapping predictions back: predicted columns first, then the actual ones,
# which fixes the column order of the exported CSV.
for column, values in (
    ('Pred_AvgLat', y_pred_inverse[:, 0]),
    ('Pred_AvgLon', y_pred_inverse[:, 1]),
    ('Pred_AvgHead', y_pred_inverse[:, 2]),
    ('Actual_AvgLat', y_actual_inverse[:, 0]),
    ('Actual_AvgLon', y_actual_inverse[:, 1]),
    ('Actual_AvgHead', y_actual_inverse[:, 2]),
):
    data[column] = values

# NOTE(review): calculate_distance measures each point from (0, 0), so the
# value being z-scored is the difference of two norms, not the distance
# between the actual and predicted points - confirm this is the intended
# error measure.
data['Actual_Distance'] = calculate_distance(data['Actual_AvgLat'], data['Actual_AvgLon'])
data['Pred_Distance'] = calculate_distance(data['Pred_AvgLat'], data['Pred_AvgLon'])
distance_error = data['Actual_Distance'] - data['Pred_Distance']
data['Zscore_Distance'] = zscore(distance_error)
data['Distance_Anomaly_Level'] = data['Zscore_Distance'].map(classify_anomaly)

# Z-score the cluster size and intra-cluster distance, then label each score.
# Both z-score columns are written before either label column so the exported
# column order stays the same.
for z_column, source_column in (
    ('Zscore_Actual_Size', 'Size'),
    ('Zscore_Actual_IntraClusterDistance', 'IntraClusterDistance'),
):
    data[z_column] = zscore(data[source_column])
for level_column, z_column in (
    ('Size_Anomaly_Level', 'Zscore_Actual_Size'),
    ('IntraClusterDistance_Anomaly_Level', 'Zscore_Actual_IntraClusterDistance'),
):
    data[level_column] = data[z_column].map(classify_anomaly)

# The MinMax-scaled working columns are removed; the Actual_* columns hold
# the inverse-transformed values.
data.drop(['AvgLat', 'AvgLon', 'AvgHead'], axis=1, inplace=True)

# Write the results to CSV (without the index) and hand the file to the browser
output_file_name = 'predictions_w_zscore.csv'
data.to_csv(output_file_name, index=False)
files.download(output_file_name)

# Last expression of the cell: shows a preview of the first rows
data.head()


Saving clusterdata_centroids (2).csv to clusterdata_centroids (2) (2).csv
Epoch 1/50


  super().__init__(**kwargs)


[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - loss: 0.1455 - mean_absolute_error: 0.2688 - val_loss: 0.0286 - val_mean_absolute_error: 0.1310
Epoch 2/50
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0094 - mean_absolute_error: 0.0598 - val_loss: 0.0016 - val_mean_absolute_error: 0.0272
Epoch 3/50
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 4.9190e-04 - mean_absolute_error: 0.0113 - val_loss: 4.3723e-04 - val_mean_absolute_error: 0.0133
Epoch 4/50
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 2.6368e-04 - mean_absolute_error: 0.0085 - val_loss: 3.2166e-04 - val_mean_absolute_error: 0.0110
Epoch 5/50
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 1.9827e-04 - mean_absolute_error: 0.0075 - val_loss: 2.5657e-04 - val_mean_absolute_error: 0.0103
Epoch 6/50
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0.1,Unnamed: 0,Cluster,Size,IntraClusterDistance,Hour,Pred_AvgLat,Pred_AvgLon,Pred_AvgHead,Actual_AvgLat,Actual_AvgLon,Actual_AvgHead,Actual_Distance,Pred_Distance,Zscore_Distance,Distance_Anomaly_Level,Zscore_Actual_Size,Zscore_Actual_IntraClusterDistance,Size_Anomaly_Level,IntraClusterDistance_Anomaly_Level
0,2747,3,2.0,0.032238,19200.0,23.307589,116.953766,26.031189,23.313805,116.892696,25.907647,119.194949,119.253624,-2.017478,Medium,-0.279253,0.033221,Low,Low
1,4579,2,2.0,0.090952,33708.0,23.251678,117.20874,29.288467,23.255915,117.146957,29.150424,119.433024,119.492798,-2.081378,Medium,-0.279253,1.938637,Low,Low
2,4435,2,2.0,0.032699,32808.0,20.937059,112.453316,44.609005,20.979667,112.454752,44.401664,114.395007,114.385788,1.930474,Low,-0.279253,0.048192,Low,Low
3,2657,2,2.0,0.000344,18792.0,21.562311,111.877983,-1.183814,21.586326,111.821593,-1.5,113.886075,113.936897,-1.56087,Low,-0.279253,-1.001803,Low,Low
4,2035,2,2.0,0.048766,15612.0,22.678986,115.904991,-0.875366,22.704982,115.871663,-1.0,118.075224,118.102936,-0.216984,Low,-0.279253,0.569579,Low,Low
