In [24]:
import tensorflow as tf
from tensorflow import keras
import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [25]:
# Set up matplotlib
# Default figure size as (width, height) in inches: 12 wide by 10 tall.
mpl.rcParams.update({'figure.figsize': (12, 10)})
# Colors of the active property cycle; the plotting helpers below index into this list.
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

In [26]:
# load the data
# NOTE(review): `file` is not referenced by any cell visible here; kept in case a later cell uses it.
file = tf.keras.utils

# Feature columns fed to the model, followed by the "zone" label column.
SELECTED_COLUMNS = [
    "gps_altitude_m", "speed_km/s", "climb_m", "climb_m(delta)",
    "climb_rate_m/s", "bearing", "delta_bearing", "glide_ratio",
    "elapsed_time", "temp", "pressure", "humidity", "dew_point",
    "wind_speed", "wind_deg", "zone",
]

# Take an explicit copy of the column subset so the label assignment below
# operates on an independent frame rather than on a selection of the CSV frame.
raw_df = (
    pd.read_csv(os.path.join("data", "flight_data_processed.csv"))
    .loc[:, SELECTED_COLUMNS]
    .copy()
)

# Binary target: 1 for rows labelled "thermal", 0 for everything else.
# Vectorized comparison instead of a per-row Python lambda.
raw_df["zone"] = (raw_df["zone"] == "thermal").astype("int64")
raw_df.head()

  raw_df = pd.read_csv(os.path.join("data", "flight_data_processed.csv"))


Unnamed: 0,gps_altitude_m,speed_km/s,climb_m,climb_m(delta),climb_rate_m/s,bearing,delta_bearing,glide_ratio,elapsed_time,temp,pressure,humidity,dew_point,wind_speed,wind_deg,zone
0,1115,12.218615,0.0,-3.0,0.0,303,67.0,0.0,0.0,26.69,1010.0,37.0,10.81,3.38,306.0,0
1,1115,25.606488,0.0,-3.0,0.0,270,33.0,0.0,1.0,26.69,1010.0,37.0,10.81,3.38,306.0,0
2,1115,20.48519,0.0,-3.0,0.0,270,0.0,0.0,2.0,26.69,1010.0,37.0,10.81,3.38,306.0,0
3,1116,25.606488,1.0,-2.0,1.0,270,0.0,7.112913,3.0,26.69,1010.0,37.0,10.81,3.38,306.0,1
4,1117,21.541278,1.0,-1.0,1.0,288,18.0,5.983688,4.0,26.69,1010.0,37.0,10.81,3.38,306.0,1


In [27]:
# Summary statistics for every selected column, including the binary "zone" label.
raw_df.describe()

Unnamed: 0,gps_altitude_m,speed_km/s,climb_m,climb_m(delta),climb_rate_m/s,bearing,delta_bearing,glide_ratio,elapsed_time,temp,pressure,humidity,dew_point,wind_speed,wind_deg,zone
count,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0,173943.0
mean,2549.013677,52.049776,-0.031964,-0.632339,-0.026192,128.208505,11.336541,8.336186,16822.956221,31.048488,1006.83801,30.089995,10.966365,3.138136,229.898335,0.524154
std,621.275795,25.169579,2.587599,44.476908,1.869562,93.148811,17.172823,8.480289,9743.969272,2.945894,2.086364,7.442854,2.804663,1.262021,100.911881,0.499418
min,376.0,0.0,-30.0,-383.0,-11.0,0.0,0.0,0.0,0.0,24.16,1003.0,14.0,3.19,0.5,0.0,0.0
25%,2160.0,41.296352,-1.0,-26.0,-1.0,47.0,2.0,3.082997,8427.0,28.92,1005.0,23.0,9.7,2.02,202.0,0.0
50%,2551.0,53.539497,0.0,-1.0,0.0,122.0,6.0,7.368595,16791.0,30.61,1007.0,33.0,11.25,3.37,234.0,1.0
75%,2937.0,63.605984,1.0,23.0,1.0,184.0,14.0,12.795062,25154.0,33.81,1008.0,36.0,13.01,4.22,316.0,1.0
max,4403.0,4788.98745,44.0,1222.0,44.0,358.0,180.0,1306.48164,36656.0,37.13,1015.0,42.0,17.5,5.51,358.0,1.0


In [28]:
# np.bincount returns one count per label value (index 0, then index 1);
# unpacking into two names requires the label column to hold exactly those two classes.
class_counts = np.bincount(raw_df["zone"])
neg, pos = class_counts
total = neg + pos

balance_report = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(balance_report.format(total, pos, 100 * pos / total))

Examples:
    Total: 173943
    Positive: 91173 (52.42% of total)



In [29]:
cleaned_df = raw_df.copy()

# Fixed seed so the shuffled split (and every number derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Use a utility from sklearn to split and shuffle your dataset:
# 20% held out for test, then 20% of the remainder held out for validation.
train_df, test_df = train_test_split(
    cleaned_df, test_size=0.2, random_state=SPLIT_SEED)
train_df, val_df = train_test_split(
    train_df, test_size=0.2, random_state=SPLIT_SEED)

# Form np arrays of labels and features.
# pop() removes "zone" from each frame, so the frames hold features only afterwards.
train_labels = np.array(train_df.pop("zone"))
bool_train_labels = train_labels != 0
val_labels = np.array(val_df.pop("zone"))
test_labels = np.array(test_df.pop("zone"))

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

# Fit the scaler on the training split only; validation and test are
# transformed with the training statistics.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

# Limit standardized values to [-5, 5] so extreme outliers cannot dominate.
train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)
test_features = np.clip(test_features, -5, 5)


print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (111323,)
Validation labels shape: (27831,)
Test labels shape: (34789,)
Training features shape: (111323, 15)
Validation features shape: (27831, 15)
Test features shape: (34789, 15)


In [None]:
# Rebuild labelled frames from the scaled/clipped training matrix, one per class.
feature_names = train_df.columns
pos_df = pd.DataFrame(train_features[bool_train_labels], columns=feature_names)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=feature_names)

# One hexbin joint plot per class; axis limits match the [-5, 5] clipping range.
for class_df, figure_title in ((pos_df, "Positive distribution"),
                               (neg_df, "Negative distribution")):
  sns.jointplot(x=class_df["speed_km/s"], y=class_df["climb_m"],
                kind='hex', xlim=(-5,5), ylim=(-5,5))
  plt.suptitle(figure_title)

In [None]:
# Metrics tracked during training and evaluation. Each `name` becomes the key
# in `history.history` (with a 'val_' prefix for validation data), e.g. the
# early-stopping callback below monitors 'val_prc'.
METRICS = [
      # Raw confusion-matrix counts.
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      # Threshold-based summary metrics.
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      # Area-under-curve metrics: default curve for 'auc', precision-recall for 'prc'.
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

def make_model(metrics=METRICS, output_bias=None):
  """Build and compile the binary classifier used throughout the notebook.

  Architecture: Dense(16, relu) -> Dropout(0.5) -> Dense(1, sigmoid), compiled
  with Adam (learning rate 1e-3) and binary cross-entropy. The input width is
  taken from the module-level `train_features` array.

  Args:
    metrics: Keras metrics passed to `compile`. Defaults to the module-level
      METRICS list; the same metric objects are therefore shared by every
      model built with the default.
    output_bias: Optional initial value for the output layer's bias. When
      None, the bias starts at zero.

  Returns:
    The compiled `keras.Sequential` model.
  """
  if output_bias is None:
    # Forwarding bias_initializer=None makes Keras fall back to its generic
    # weight initializer, which gives a random bias. Ask for zeros explicitly
    # so the "no careful bias" model really starts from a zero bias.
    bias_initializer = 'zeros'
  else:
    bias_initializer = tf.keras.initializers.Constant(output_bias)

  model = keras.Sequential([
      # Explicit Input object instead of the deprecated `input_shape=` layer argument.
      keras.Input(shape=(train_features.shape[-1],)),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(1, activation='sigmoid',
                         bias_initializer=bias_initializer),
  ])

  model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=1e-3),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=metrics)

  return model


# Upper bound on training epochs (early stopping may end a run sooner) and the
# batch size shared by fit/evaluate/predict calls below.
EPOCHS = 100
BATCH_SIZE = 2048

# Stop when the validation precision-recall AUC ('val_prc') has not improved
# for 10 consecutive epochs, and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)


# Untrained model with the default output bias; used for the sanity checks that follow.
model = make_model()
model.summary()

In [None]:
# Sanity check: outputs of the untrained model on the first 10 training rows.
model.predict(train_features[:10])

In [None]:
# Loss of the untrained model on the training set; only the first entry of the
# evaluate() result is reported here.
results = model.evaluate(train_features, train_labels, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

In [None]:
# Log-odds of the positive class. A sigmoid applied to this value gives
# pos / (pos + neg), where pos and neg are the class counts computed from raw_df above.
initial_bias = np.log([pos/neg])
initial_bias

In [None]:
# Rebuild the model with the output bias set to the class log-odds and look at
# its untrained outputs on the first 10 training rows.
model = make_model(output_bias=initial_bias)
model.predict(train_features[:10])

In [None]:
# Training-set loss of the untrained model that uses the log-odds output bias.
results = model.evaluate(train_features, train_labels, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

In [None]:
# Checkpoint the current model (built with the log-odds output bias) so every
# later run can start from the same initial weights.
initial_weights = os.path.join(tempfile.mkdtemp(), "initial_weights.weights.h5")
model.save_weights(initial_weights)


def _short_run_from_checkpoint(zero_output_bias):
  """Train a fresh model for 20 epochs starting from the saved initial weights.

  Args:
    zero_output_bias: When True, overwrite the loaded output-layer bias with 0.0
      before training.

  Returns:
    A `(model, history)` tuple for the finished run.
  """
  net = make_model()
  net.load_weights(initial_weights)
  if zero_output_bias:
    net.layers[-1].bias.assign([0.0])
  run_history = net.fit(
      train_features,
      train_labels,
      batch_size=BATCH_SIZE,
      epochs=20,
      validation_data=(val_features, val_labels),
      verbose=0)
  return net, run_history


# Same starting weights for both runs; only the output bias differs.
model, zero_bias_history = _short_run_from_checkpoint(zero_output_bias=True)
model, careful_bias_history = _short_run_from_checkpoint(zero_output_bias=False)

def plot_loss(history, label, n):
  """Draw training and validation loss curves for one run on the current axes.

  Args:
    history: Object returned by `model.fit`; its `epoch` list and its
      `history['loss']` / `history['val_loss']` entries are plotted.
    label: Run name appended to the 'Train ' / 'Val ' legend entries.
    n: Index into the module-level `colors` list; wraps around if it exceeds
      the number of available colors.
  """
  run_color = colors[n % len(colors)]
  # Use a log scale on y-axis to show the wide range of values.
  plt.semilogy(history.epoch, history.history['loss'],
               color=run_color, label='Train ' + label)
  plt.semilogy(history.epoch, history.history['val_loss'],
               color=run_color, label='Val ' + label,
               linestyle="--")
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  # The curves carry labels, so draw the legend; without it the runs cannot be told apart.
  plt.legend()

# Overlay both 20-epoch runs on the same axes, one color per run.
plot_loss(zero_bias_history, "Zero Bias", 0)
plot_loss(careful_bias_history, "Careful Bias", 1)

In [None]:
# Baseline run: fresh model, starting from the saved initial weights, trained
# for up to EPOCHS epochs with early stopping on the validation 'prc' metric.
model = make_model()
model.load_weights(initial_weights)
baseline_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels))

In [None]:
def plot_metrics(history, metrics=('loss', 'prc', 'precision', 'recall')):
  """Plot training vs. validation curves, one subplot per metric.

  Args:
    history: Object returned by `model.fit`; for each metric name both
      `history.history[name]` and `history.history['val_' + name]` are plotted.
    metrics: Names of the metrics to plot. Defaults to the four metrics that
      were previously hard-coded, which yields the same 2x2 grid.
  """
  n_cols = 2
  # Ceiling division: enough rows to hold every requested metric.
  n_rows = max(1, -(-len(metrics) // n_cols))
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(n_rows, n_cols, n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      # Anchor the loss axis at zero but keep the automatic upper bound.
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      # Only reached when the caller asks for 'auc' explicitly.
      plt.ylim([0.8,1])
    else:
      plt.ylim([0,1])

    plt.legend()

# Training curves of the baseline run.
plot_metrics(baseline_history)

In [None]:
# Model outputs of the baseline model on the training and test features.
train_predictions_baseline = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions_baseline = model.predict(test_features, batch_size=BATCH_SIZE)

In [None]:
def plot_cm(labels, predictions, p=0.5):
  """Plot the confusion matrix at threshold `p` and print its four cells.

  Args:
    labels: True binary labels, where 1 marks a thermal sample and 0 a
      non-thermal sample.
    predictions: Model outputs; values strictly greater than `p` are counted
      as thermal predictions.
    p: Decision threshold applied to `predictions`.
  """
  # Rows are actual classes, columns are predicted classes.
  cm = confusion_matrix(labels, predictions > p)
  plt.figure(figsize=(5,5))
  sns.heatmap(cm, annot=True, fmt="d")
  plt.title('Confusion matrix @{:.2f}'.format(p))
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')

  # Messages describe the thermal / non-thermal task of this notebook.
  print('Non-Thermal Samples Correctly Identified (True Negatives): ', cm[0][0])
  print('Non-Thermal Samples Incorrectly Flagged as Thermal (False Positives): ', cm[0][1])
  print('Thermal Samples Missed (False Negatives): ', cm[1][0])
  print('Thermal Samples Detected (True Positives): ', cm[1][1])
  print('Total Thermal Samples: ', np.sum(cm[1]))

# Evaluate once on the held-out test set. return_dict=True pairs every value
# with its metric name; zipping the result list against model.metrics_names
# can silently drop entries on Keras versions where that list does not name
# each compiled metric individually.
baseline_metrics = model.evaluate(test_features, test_labels,
                                  batch_size=BATCH_SIZE, verbose=0,
                                  return_dict=True)
# Keep the positional list under its original name for any cell that indexes it.
baseline_results = list(baseline_metrics.values())
for name, value in baseline_metrics.items():
  print(name, ': ', value)
print()

plot_cm(test_labels, test_predictions_baseline)