In [1]:
import numpy as np
import csv
from utils import absolute_path
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from position_fix_utils import filter_by_date, smooth_trajectory, distance_between_points
from gb_spm import characteristic_indices, significant_place_mining
import webbrowser
from LabelPlot import LabelPlot
from ipyleaflet import (
    Map,
    CircleMarker,
    Polyline,
    Popup,
    Marker,
)
import ipywidgets as widgets
from MapPlot import MapPlot
from scipy import stats
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Input
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


%reload_ext autoreload
%autoreload 2

In [2]:
# load data from csv

# Structured dtype describing one position fix; every field is stored as float64.
position_fix_dtype = np.dtype([
    (field_name, np.float64)
    for field_name in (
        'lat',
        'lon',
        'time',
        'altitude',
        'bearing',
        'speed',
        'accuracy',
        'vertical_accuracy',
        'bearing_accuracy',
        'speed_accuracy',
    )
])


def position_fix_from_csv(file_path, remove_duplicates=True):
    """Read position fixes from a CSV file into a structured NumPy array.

    The file must start with a header row that contains the columns ``latitude``,
    ``longitude``, ``create_time_epoch``, ``altitude``, ``bearing``, ``speed``,
    ``accuracy``, ``vertical_accuracy``, ``bearing_accuracy`` and ``speed_accuracy``.
    Every value is parsed with ``float``.

    Args:
        file_path: Path of the CSV file to read.
        remove_duplicates: When true, a row whose ten values all equal those of the
            previously kept row is skipped. Only consecutive duplicates are removed.

    Returns:
        A 1-D array with dtype ``position_fix_dtype``, one element per kept row, in
        file order. An input with no data rows yields an empty array.

    Raises:
        KeyError: If a required column is missing from the header.
        ValueError: If a value cannot be parsed as a float.
    """
    # CSV column names, listed in the same order as the fields of position_fix_dtype.
    columns = (
        'latitude',
        'longitude',
        'create_time_epoch',
        'altitude',
        'bearing',
        'speed',
        'accuracy',
        'vertical_accuracy',
        'bearing_accuracy',
        'speed_accuracy',
    )
    data = []
    # newline='' leaves newline handling to the csv module, as its documentation requires.
    with open(file_path, 'r', newline='') as file:
        prior = None
        for row in csv.DictReader(file):
            point = tuple(float(row[column]) for column in columns)
            if remove_duplicates:
                # prior starts as None, so the first row is never treated as a duplicate.
                if point == prior:
                    continue
                prior = point
            data.append(point)
    return np.array(data, dtype=position_fix_dtype)


# Load every recorded position fix once; get_data() later filters this array by date.
data_path = absolute_path("andrew-device-locations-all.csv")
location_data = position_fix_from_csv(data_path)
print('done')

In [3]:
# Load the position fixes and their labels for each labeled date
def get_data(date):
    """Return the position fixes recorded on ``date`` together with their labels.

    Labels are read from ``<YYYY-MM-DD>-labels.csv`` (path relative to the current
    working directory) and converted to integers.

    Args:
        date: ``datetime`` selecting the day; passed to ``filter_by_date`` and used to
            build the label file name.

    Returns:
        ``(day_data, labels)``. ``labels`` is an integer array with one entry per
        element of ``day_data``, or ``None`` (after printing a message) when the label
        file is missing or unreadable, or when its length does not match ``day_data``.
    """
    day_data = filter_by_date(location_data, date)

    day_str = date.strftime("%Y-%m-%d")
    file_path = day_str + "-labels.csv"
    try:
        # atleast_1d: loadtxt returns a 0-d array for a single-value file, which has no len().
        labels = np.atleast_1d(np.loadtxt(file_path, delimiter=',')).astype(int)
    except (OSError, ValueError):
        # Missing/unreadable file or non-numeric content: treat the day as unlabeled.
        print(f"Error: {day_str} not labeled.")
        return day_data, None

    # Explicit check instead of assert, so it still runs under `python -O`.
    if len(labels) != len(day_data):
        print(
            f"Error: {day_str} has {len(labels)} labels "
            f"for {len(day_data)} position fixes."
        )
        return day_data, None

    return day_data, labels

# Days that have a hand-made label file.
dates = [
    datetime(2024, 4, 19),
    datetime(2024, 4, 20),
    datetime(2024, 5, 11),
]

# Collect per-day arrays first, then concatenate once.
day_fixes = []
day_labels = []
for date in dates:
    Xi, yi = get_data(date)
    if yi is None:
        # get_data already reported the problem; an unlabeled day cannot be used for
        # training and would make np.concatenate fail on the None entry.
        continue
    day_fixes.append(Xi)
    day_labels.append(yi)

if not day_labels:
    raise ValueError("No labeled days could be loaded; cannot build the training set.")

# NOTE(review): the days are joined into one array, so neighbour-based features built
# from X later will span the joins between days - confirm this is acceptable.
X = np.concatenate(day_fixes)
y = np.concatenate(day_labels)
print("done")


In [4]:
# Preprocess data

# Add the distances and time gaps to each fix's preceding and following neighbours to X
def extract_features(X, y, mode='valid', r_index=1):
    """Build the model's feature matrix from a structured array of position fixes.

    The base features are the fields ``bearing``, ``speed``, ``accuracy``,
    ``vertical_accuracy``, ``bearing_accuracy`` and ``speed_accuracy``. When
    ``r_index > 0`` each retained fix additionally gets ``2 * r_index`` consecutive
    distances and ``2 * r_index`` consecutive time differences taken from the steps
    surrounding it.

    Args:
        X: Structured array of position fixes with the fields above plus ``time``.
        y: Labels aligned with ``X``, or ``None``.
        mode: Only ``'valid'`` is implemented: the first and last ``r_index`` fixes
            are dropped because they lack a full neighbourhood. Ignored when
            ``r_index <= 0``.
        r_index: Number of neighbouring steps to use on each side of a fix.

    Returns:
        ``(X_features, y_trimmed)``. For ``r_index <= 0`` this is the base feature
        matrix and ``y`` unchanged. Otherwise ``X_features`` has
        ``max(len(X) - 2 * r_index, 0)`` rows and ``6 + 4 * r_index`` columns, and
        ``y_trimmed`` is ``y`` cut to the same rows (``None`` stays ``None``).

    Raises:
        NotImplementedError: If ``mode == 'edge'``.
        ValueError: If ``mode`` is any other unsupported value.
    """
    fields = ['bearing', 'speed', 'accuracy', 'vertical_accuracy', 'bearing_accuracy', 'speed_accuracy'] # 'altitude'
    X_features = np.column_stack([X[field] for field in fields])
    if r_index <= 0:
        return X_features, y

    # Reject unsupported modes before doing any distance computation.
    if mode == 'edge':
        raise NotImplementedError("Edge mode is not implemented.")
    if mode != 'valid':
        raise ValueError(f"{mode} mode not supported.")

    # The slicing below assumes one distance per consecutive pair of fixes, i.e.
    # len(X) - 1 values, like np.diff - TODO confirm against distance_between_points.
    distance = distance_between_points(X, unit='m')
    distance = distance.reshape((len(distance), 1))
    time_diff = np.diff(X['time'])

    # Clamp to zero: with fewer than 2 * r_index fixes a negative length would turn
    # the slice ends into from-the-end indices and yield columns of unequal length.
    n = max(len(X) - 2 * r_index, 0)
    columns = [X_features[r_index:r_index + n]]
    columns += [distance[i:i + n] for i in range(2 * r_index)]
    columns += [time_diff[i:i + n] for i in range(2 * r_index)]
    X_features = np.column_stack(columns)
    y_correct_length = y[r_index:r_index + n] if y is not None else y

    return X_features, y_correct_length

# Number of neighbouring steps used on each side of a fix.
r_index = 4
X_features, y_length = extract_features(X, y, mode='valid', r_index=r_index)
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_length,
    test_size=0.2,
    random_state=42,
)

# Standardize using statistics of the training split only, so nothing about the
# test split leaks into the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("done")


In [5]:
# Binary classifier: two 128-unit ReLU layers (batch norm after the first, dropout
# after the second) followed by a single sigmoid output.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Balanced class weights to counter class imbalance:
# weight(c) = n_samples / (n_classes * count(c)).
unique_classes, class_counts = np.unique(y_train, return_counts=True)
class_weights = dict(
    zip(unique_classes, len(y_train) / (len(unique_classes) * class_counts))
)

# Trailing semicolon keeps the notebook from echoing the returned History object.
model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=4000,
    class_weight=class_weights,
);


In [6]:
def show_confusion_matrix(model, X, y, title='Confusion Matrix', threshold=0.5):
    """Print the F1 score and plot the confusion matrix of ``model`` on ``(X, y)``.

    Assumes ``model.predict`` returns one probability per sample, as the
    single-unit sigmoid output of this notebook's model does.

    Args:
        model: Fitted model exposing ``predict(X)``.
        X: Feature matrix to evaluate on.
        y: True binary labels aligned with ``X``.
        title: Title of the heatmap, e.g. to tell train and test plots apart.
        threshold: A sample is predicted positive when its probability is strictly
            greater than this value.

    Returns:
        None. The score is printed and the figure is shown.
    """
    # Flatten so an (n, 1) prediction array lines up with 1-D labels.
    y_prob = np.asarray(model.predict(X)).reshape(-1)

    # Threshold once and reuse, so the F1 score and the matrix describe the same predictions.
    y_pred_classes = (y_prob > threshold).astype(int)

    f1 = f1_score(y, y_pred_classes)
    print("F1 score:", f1)

    cm = confusion_matrix(y, y_pred_classes)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()

# Report held-out performance first, then training performance for comparison.
# The printed headings identify which split each score and plot belongs to.
print("Test set:")
show_confusion_matrix(model, X_test, y_test)
print("Training set:")
show_confusion_matrix(model, X_train, y_train)

In [7]:
# Display results for a given day.
# This date is not in the `dates` list used to build the training set. get_data
# returns None for the labels when no matching label file can be loaded.
date = datetime(2024, 4, 18)
plot_data, real_labels = get_data(date)

# gb-spm to find significant places
def gb_spm(data):
    """Run the gb-spm significant-place pipeline on one trajectory.

    The trajectory is smoothed, characteristic point indices are computed on the
    smoothed track, and both are passed to ``significant_place_mining``.

    Args:
        data: Position fixes to analyse; must support ``len()``.

    Returns:
        The result of ``significant_place_mining`` for the smoothed trajectory.
    """
    # The smoothing factor grows linearly with the number of fixes.
    smoothed = smooth_trajectory(data, s=5e-11 * len(data), weight='inverse')
    # NOTE(review): the positional numbers below are tuning constants whose meaning
    # is defined in the gb_spm module - confirm there before changing them.
    cp_indices = characteristic_indices(smoothed, 4, 1)
    significant_places = significant_place_mining(smoothed, cp_indices, 3, 0.25, 120, 60)

    return significant_places

# Build the same windowed features as for training (same r_index) and scale them
# with the scaler that was fitted on the training split.
features, labels = extract_features(plot_data, real_labels, 'valid', r_index)
features_scaled = scaler.transform(features)
# Raw model outputs for each retained fix (sigmoid activations, not thresholded).
# NOTE(review): these are passed as `labels` below without thresholding - confirm
# that LabelPlot expects probabilities rather than 0/1 classes.
pred_labels = model.predict(features_scaled)

label_plot = LabelPlot()
label_plot.add_curve(plot_data)
label_plot.add_stop_regions(gb_spm(plot_data), markers=True, color='magenta', draggable=True)
# 'valid' mode drops r_index fixes at each end, so plot_data is trimmed the same way
# to stay aligned with pred_labels.
label_plot.add_points_clickable(plot_data[r_index:-r_index], labels=pred_labels)
# Last expression of the cell, so the notebook displays it as the cell output.
label_plot