In [1]:
import dask as da
import dask.dataframe as daskdf
import dask.array as daa
import dask.distributed as dd
import dask.datasets as ds
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import seaborn as sns
import sklearn as sk
import numpy as np
import dask_ml.preprocessing as dm_pre
import dask_ml.cluster as dm_cluster
import dask_geopandas as dg
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import geopandas as gpd
from scipy.stats import pearsonr
import time

# Phase Four: Modeling. 
This phase involves selecting, applying, and tuning various statistical or machine learning models to your prepared data. Here's what typically happens in this phase:

In [2]:
# load preprocessed data
"""data head look like this: 
   mapped_veh_id       timestamps_UTC  RS_E_InAirTemp_PC1  RS_E_InAirTemp_PC2  \
0          112.0  2023-08-01 11:42:55              309.15              313.15   
1          179.0  2023-08-24 17:48:07              314.15              303.15   
2          190.0  2023-08-01 12:27:13              307.15              313.15   
3          179.0  2023-08-24 18:18:05              311.15              304.15   
4          122.0  2023-08-01 12:53:12              303.15              304.15   

   RS_E_OilPress_PC1  RS_E_OilPress_PC2  RS_E_RPM_PC1  RS_E_RPM_PC2  \
0              276.0              248.0         798.0         797.0   
1              220.0              227.0         796.0         798.0   
2              376.0              317.0        1236.0        1214.0   
3              220.0              220.0         802.0         799.0   
4              193.0              269.0         801.0         798.0   

   RS_E_WatTemp_PC1  RS_E_WatTemp_PC2  RS_T_OilTemp_PC1  RS_T_OilTemp_PC2  \
0            346.15            355.15            351.15            355.15   
1            355.15            356.15            352.15            355.15   
2            356.15            350.15            357.15            354.15   
3            355.15            355.15            353.15            355.15   
4            352.15            352.15            350.15            350.15   

   tempmax  tempmin  temp  
0     19.7     15.4  17.4  
...
2     19.7     15.1  17.2  
3     24.1     15.9  19.8  
4     19.7     15.4  17.4  
"""
# Load data
def load_data():
    print("Loading data...")
    start = time.time()
    df = daskdf.read_csv("preprocessed.csv")
    end = time.time()
    print("Data loaded in {} seconds".format(end-start))
    return df

In [3]:
ddf = load_data()
ddf = ddf.drop(columns=["Unnamed: 0"])
print(ddf.head())
print(len(ddf))

Loading data...
Data loaded in 0.09375190734863281 seconds
   mapped_veh_id       timestamps_UTC  RS_E_InAirTemp_PC1  RS_E_InAirTemp_PC2  \
0          112.0  2023-08-01 11:42:55              309.15              313.15   
1          179.0  2023-08-24 17:48:07              314.15              303.15   
2          190.0  2023-08-01 12:27:13              307.15              313.15   
3          179.0  2023-08-24 18:18:05              311.15              304.15   
4          122.0  2023-08-01 12:53:12              303.15              304.15   

   RS_E_OilPress_PC1  RS_E_OilPress_PC2  RS_E_RPM_PC1  RS_E_RPM_PC2  \
0              276.0              248.0         798.0         797.0   
1              220.0              227.0         796.0         798.0   
2              376.0              317.0        1236.0        1214.0   
3              220.0              220.0         802.0         799.0   
4              193.0              269.0         801.0         798.0   

   RS_E_WatTemp_PC1  RS_E_W

In [4]:
# Unsuppervised Modeling techniques selection
# 1. Isolation Forest: Effective for high-dimensional datasets. It isolates anomalies instead of modeling normal points, which is efficient when anomalies are rare.

# 2. Local Outlier Factor (LOF): It is an unsupervised outlier detection method which computes the local density deviation of a given data point with respect to its neighbors. It considers as outliers the samples that have a substantially lower density than their neighbors. can less effectively handle datasets with varying densities and lof is not well suited for high dimensional data. and it cannot handle the noise data. 

# 3. One-Class SVM: It is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set. It is based on the idea that a normal data point is surrounded by similar data points, while abnormal data points are far from their neighbors. It is effective in high dimensional spaces and it is memory efficient. It is not suited for large datasets and it does not perform well when the proportion of outliers is high.

# 4. DBSCAN: It is a density-based clustering algorithm that groups together points that are close to each other based on a distance measurement (usually Euclidean distance). It is effective for data which contains clusters of similar density. It is not suited for datasets with varying densities and it cannot handle noisy data.


# 2. Local Outlier Factor 

In [5]:
#  LOF Model Implementation

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# feature selection
# Since LOF is a distance-based model, it is not suitable for high-dimensional datasets. So we will drop highly correlated features.
# Drop highly correlated features based on Pearso matrix previously calculated in data preprocessing
ddf = ddf.drop(columns=['RS_E_InAirTemp_PC2', 'RS_E_OilPress_PC2', 'RS_E_RPM_PC2', 'RS_E_WatTemp_PC2', 'RS_T_OilTemp_PC2'])

# Drop non-feature columns
ddf = ddf.drop(columns=['tempmax', 'tempmin'])

ddf['timestamps_UTC'] = (ddf['timestamps_UTC'].astype('datetime64[ns]') - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Normalize the feature columns
scaler = StandardScaler()
features_to_normalize = ddf.columns
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

# Initialize LOF
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.00001)

# Fit the model (excluding non-feature columns)
df['lof_anomalies'] = lof.fit_predict(df[features_to_normalize])
df['lof_anomalies'] = df['lof_anomalies'].map({1: 0, -1: 1})  # Convert to 0 for normal, 1 for anomaly

# Review the detected anomalies
anomalies_lof = df[df['lof_anomalies'] == 1]
print(anomalies_lof)

# Output the anomalies to a new CSV for further analysis
anomalies_lof.to_csv('anomalies_lof.csv')




NameError: name 'df' is not defined

In [None]:
# Plot the anomalies with legend of each feature
for feature in df.drop(columns=['lof_anomalies']).columns:
    plt.figure(figsize=(10, 5))
    plt.title(feature)
    plt.plot(df[feature],'b.', label='Normal')
    plt.plot(anomalies_lof[feature], 'ro', label='Anomaly')
    plt.legend()
    plt.show()


    

In [None]:
# Evaluate the model on sample 1
sample1_predictions = lof.fit_predict(sample1_features)
sample1_predictions = pd.Series(sample1_predictions, index=sample1_features.index).map({1: 0, -1: 1})
sample1_metrics = {
    'Accuracy': sk.accuracy_score(sample1_labels, sample1_predictions),
    'Precision': sk.precision_score(sample1_labels, sample1_predictions),
    'Recall': sk.recall_score(sample1_labels, sample1_predictions),
    'F1 Score': sk.f1_score(sample1_labels, sample1_predictions),
    'Confusion Matrix': sk.confusion_matrix(sample1_labels, sample1_predictions)
}

# Evaluate the model on sample 2
sample2_predictions = lof.fit_predict(sample2_features)
sample2_predictions = pd.Series(sample2_predictions, index=sample2_features.index).map({1: 0, -1: 1})
sample2_metrics = {
    'Accuracy': sk.accuracy_score(sample2_labels, sample2_predictions),
    'Precision': sk.precision_score(sample2_labels, sample2_predictions),
    'Recall': sk.recall_score(sample2_labels, sample2_predictions),
    'F1 Score': sk.f1_score(sample2_labels, sample2_predictions),
    'Confusion Matrix': sk.confusion_matrix(sample2_labels, sample2_predictions)
}

# Print the results
print('Sample 1 Metrics:', sample1_metrics)
print('\nSample 2 Metrics:', sample2_metrics)
