In [1]:
import warnings
# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, which would also hide pandas'
# SettingWithCopyWarning. Consider narrowing the filter — TODO confirm intent.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import datetime
import time

In [3]:
from sklearn.svm import OneClassSVM

In [4]:
# Path of the CSV with the traffic records analysed in this notebook.
# Alternative input kept for reference: 'Filtered_Data.xls'
file_path = 'top10_hourly.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Grid cells kept for this analysis; rows of any other gridID are discarded.
six_grids = (5059, 5159, 5262, 5061, 5259, 6064)
mask = data['gridID'].isin(six_grids)
data = data.loc[mask]

In [6]:
# Parse startTime into pandas datetimes so it can be compared, sorted and
# broken down into calendar parts later on.
data = data.assign(startTime=pd.to_datetime(data['startTime']))

In [7]:
# Keep records with startTime <= 2013-12-22 00:00:00. The bare date string is
# compared as midnight, so only the first hour of 22 December is retained.
# NOTE(review): use `< '2013-12-23'` if the whole day was meant — TODO confirm.
# .copy() yields an independent frame, so columns can be added to it later
# without operating on a slice of `data`.
filtered_data = data[data['startTime'] <= '2013-12-22'].copy()
filtered_data.tail()

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet
14612,6064,2013-12-21 20:00:00,402.716159,169.088563,290.953989,382.715295,4420.744152
14613,6064,2013-12-21 21:00:00,279.963939,121.01459,181.774419,204.39288,3919.924584
14614,6064,2013-12-21 22:00:00,314.778213,116.812276,132.910724,167.114616,3706.461627
14615,6064,2013-12-21 23:00:00,201.084586,143.489276,91.638965,105.45376,3030.986434
14616,6064,2013-12-22 00:00:00,97.498804,60.973281,56.722152,71.812944,2670.088054


In [8]:
# Add a single label column for the synthetic anomalies; every row starts at 0
# (no anomaly) and selected test rows are relabelled later.
filtered_data = filtered_data.assign(anomaly=0)
filtered_data.head()

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
2976,5059,2013-11-01 00:00:00,177.102719,149.554856,92.687108,113.072191,3549.351436,0
2977,5059,2013-11-01 01:00:00,101.345768,159.323853,38.493504,56.650364,2511.210825,0
2978,5059,2013-11-01 02:00:00,56.312823,106.53697,31.904731,60.508954,2180.79819,0
2979,5059,2013-11-01 03:00:00,96.496178,102.286239,28.394988,104.482368,1857.653625,0
2980,5059,2013-11-01 04:00:00,94.96823,130.207161,14.962744,36.634123,2211.357051,0


In [9]:
# Chronological 80/20 split: after sorting by startTime, the earliest 80% of
# the rows form the training set and the latest 20% form the test set.
sorted_data = filtered_data.sort_values(by='startTime')

# Row position at which the test portion starts
split_index = int(len(sorted_data) * 0.8)

# Both parts are modified in place further down (anomaly injection, added
# columns). .copy() detaches them from sorted_data so those writes can neither
# leak into the parent frame nor act on a temporary slice.
train_data = sorted_data.iloc[:split_index].copy()
test_data = sorted_data.iloc[split_index:].copy()

In [10]:
# Every injected anomaly is active during hours 10..20 (inclusive) of its day.
event_hours = test_data['startTime'].dt.hour.between(10, 20)
event_dates = test_data['startTime'].dt.date

# Drop in SMS In on 2013-12-18, applied to every grid in the test split.
sms_in_drop = event_hours & (event_dates == datetime.date(2013, 12, 18))
test_data.loc[sms_in_drop, 'smsIn'] *= 0.1
test_data.loc[sms_in_drop, 'anomaly'] = 2  # Mark as smsIn anomaly

# Drop in Call Out on 2013-12-16, likewise for every grid.
call_out_drop = event_hours & (event_dates == datetime.date(2013, 12, 16))
test_data.loc[call_out_drop, 'callOut'] *= 0.1
test_data.loc[call_out_drop, 'anomaly'] = 3  # Mark as callOut anomaly

# Spike in internet usage on 2013-12-14, restricted to grid 5059.
anomaly_mask = (
    (test_data['gridID'] == 5059)
    & event_hours
    & (event_dates == datetime.date(2013, 12, 14))
)
test_data.loc[anomaly_mask, 'internet'] *= 2.5
test_data.loc[anomaly_mask, 'anomaly'] = 1  # Mark as internet anomaly

In [11]:
# Preview the first rows of the training split (rows are ordered by startTime)
train_data.head()

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
2976,5059,2013-11-01,177.102719,149.554856,92.687108,113.072191,3549.351436,0
5952,5159,2013-11-01,105.545194,52.6278,69.849807,75.098755,2553.960931,0
10416,5259,2013-11-01,188.267076,95.958775,76.498283,46.92852,5782.382873,0
11904,5262,2013-11-01,85.924412,88.363414,27.310888,21.790846,3076.323187,0
4464,5061,2013-11-01,96.26501,40.464191,49.309863,49.093111,3246.442825,0


In [12]:
# Preview the first rows of the test split after the anomaly injection
test_data.head()

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
3956,5059,2013-12-11 20:00:00,610.836958,314.077734,431.347894,570.465212,8655.849332,0
6932,5159,2013-12-11 20:00:00,304.832,160.43433,236.131107,298.689924,6043.488189,0
12884,5262,2013-12-11 20:00:00,372.332965,237.217398,214.492042,296.185859,6399.696074,0
5444,5061,2013-12-11 20:00:00,540.911612,256.452738,276.312785,286.472783,6563.453188,0
11396,5259,2013-12-11 20:00:00,445.939206,242.397584,270.895641,359.250442,8722.194369,0


In [13]:
# Write the test split to disk with the injected values and the anomaly
# labels; the row index is left out of the file.
test_file_path = 'test_dataset_abnormal.csv'
test_data.to_csv(path_or_buf=test_file_path, index=False)

In [14]:
# Use startTime as the row index of both splits
train_data = train_data.set_index('startTime')
test_data = test_data.set_index('startTime')

In [15]:
# Derive the same calendar features from the datetime index of each split
for split in (train_data, test_data):
    split['hour'] = split.index.hour
    split['day_of_week'] = split.index.dayofweek
    split['day_of_month'] = split.index.day
    split['month'] = split.index.month

In [16]:
# Keep the raw gridID values and the timestamp index of each split aside so
# that flagged rows can still be identified by grid and time later on.
train_grid_ids, test_grid_ids = (
    frame['gridID'].values for frame in (train_data, test_data)
)
train_timestamps, test_timestamps = train_data.index, test_data.index

In [17]:
# Inspect the training split, now indexed by startTime and carrying the added calendar columns
train_data.head()

Unnamed: 0_level_0,gridID,smsIn,smsOut,callIn,callOut,internet,anomaly,hour,day_of_week,day_of_month,month
startTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-11-01,5059,177.102719,149.554856,92.687108,113.072191,3549.351436,0,0,4,1,11
2013-11-01,5159,105.545194,52.6278,69.849807,75.098755,2553.960931,0,0,4,1,11
2013-11-01,5259,188.267076,95.958775,76.498283,46.92852,5782.382873,0,0,4,1,11
2013-11-01,5262,85.924412,88.363414,27.310888,21.790846,3076.323187,0,0,4,1,11
2013-11-01,5061,96.26501,40.464191,49.309863,49.093111,3246.442825,0,0,4,1,11


In [18]:
# Standardise all columns with statistics learned from the training split
# only, then apply that same transformation to the test split.
# Note: gridID and anomaly are still present here and get scaled as well.
scaler = StandardScaler().fit(train_data)
train_scaled = scaler.transform(train_data)
test_scaled = scaler.transform(test_data)

# Wrap the scaled arrays back into frames with the original labels
train_data = pd.DataFrame(train_scaled, index=train_data.index, columns=train_data.columns)
test_data = pd.DataFrame(test_scaled, index=test_data.index, columns=test_data.columns)

In [19]:
# gridID and anomaly are not model inputs: remove both from each split
non_feature_columns = ['gridID', 'anomaly']
train_data = train_data.drop(columns=non_feature_columns)
test_data = test_data.drop(columns=non_feature_columns)

In [20]:
def identify_anomalies(test_timestamps, test_grid_ids, test_data, fit_data=None, percentile=20):
    """Flag the lowest-scoring rows of ``test_data`` with a One-Class SVM.

    A ``OneClassSVM(gamma='auto')`` is fitted, every row of ``test_data`` is
    scored with ``score_samples`` and the rows whose score lies strictly below
    the ``percentile``-th percentile of those scores are reported.

    Parameters
    ----------
    test_timestamps : indexable
        Timestamp of each row of ``test_data``, addressed by row position.
    test_grid_ids : array-like
        Grid identifier of each row of ``test_data``, in the same order.
    test_data : pandas.DataFrame
        Feature rows to score.
    fit_data : pandas.DataFrame, optional
        Data the model is fitted on. When ``None`` (default) the model is
        fitted on ``test_data`` itself, as before; pass a separate frame with
        the same columns (e.g. the training split) to fit on that instead.
    percentile : float, default 20
        Percentile of the scores used as the anomaly threshold.

    Returns
    -------
    tuple
        ``(anomaly_timestamps, anomaly_grid_ids, anomaly_threshold,
        anomaly_indices, anomaly_features)``: list of timestamps, array of
        grid ids, the threshold, list of row positions and the matching
        feature values as an array.
    """
    # perf_counter is a monotonic clock intended for measuring intervals
    started = time.perf_counter()

    # Fit on the supplied training data, or on the scored data when none is given
    fit_frame = test_data if fit_data is None else fit_data
    svm = OneClassSVM(gamma='auto').fit(fit_frame)

    # Score the rows under inspection
    scores = svm.score_samples(test_data)

    # Rows scoring strictly below the chosen percentile are reported
    anomaly_threshold = np.percentile(scores, percentile)
    anomaly_indices = np.where(scores < anomaly_threshold)[0].astype(int).tolist()

    # Collect timestamp, grid id and feature values of every flagged row;
    # np.asarray lets plain lists of grid ids be indexed with a list of positions
    anomaly_timestamps = [test_timestamps[i] for i in anomaly_indices]
    anomaly_grid_ids = np.asarray(test_grid_ids)[anomaly_indices]
    anomaly_features = test_data.iloc[anomaly_indices].values

    execution_time = time.perf_counter() - started
    print(f"Execution Time: {execution_time:.4f} seconds")  # Print execution time

    return anomaly_timestamps, anomaly_grid_ids, anomaly_threshold, anomaly_indices, anomaly_features

In [21]:
# Run the detector on the test split
(anomaly_timestamps,
 anomaly_grid_ids,
 anomaly_threshold,
 anomaly_indices,
 anomaly_features) = identify_anomalies(test_timestamps, test_grid_ids, test_data)

# Tabulate the flagged (timestamp, grid) pairs and write them to disk
found_anomalies = pd.DataFrame({
    'anomaly_timestamp': anomaly_timestamps,
    'anomaly_grid': anomaly_grid_ids,
})
found_anomalies['anomaly_timestamp'] = pd.to_datetime(found_anomalies['anomaly_timestamp'], format="%Y-%m-%d %H:%M:%S")
found_anomalies.to_csv('anomalies_model.csv', index=False)


Execution Time: 0.5852 seconds


In [22]:
# Reload the saved test split and keep only the rows whose anomaly label is
# non-zero, i.e. the rows that were altered during injection.
injected_anomalies = pd.read_csv("test_dataset_abnormal.csv").loc[
    lambda rows: rows["anomaly"] != 0
]

In [23]:

# Make both time columns datetimes before joining on them; values that cannot
# be parsed become NaT instead of raising.
found_anomalies = found_anomalies.assign(
    anomaly_timestamp=pd.to_datetime(found_anomalies['anomaly_timestamp'], errors='coerce')
)
injected_anomalies = injected_anomalies.assign(
    startTime=pd.to_datetime(injected_anomalies['startTime'], errors='coerce')
)

In [24]:
# (rows, columns) of the flagged set: one row per flagged (timestamp, grid) pair
found_anomalies.shape

(294, 2)

In [25]:
# Inner join: keep only flagged (timestamp, grid) pairs that also appear among
# the injected anomalies.
merged_df = found_anomalies.merge(
    injected_anomalies,
    how='inner',
    left_on=['anomaly_timestamp', 'anomaly_grid'],
    right_on=['startTime', 'gridID'],
)

In [26]:
# Flagged rows that coincide with an injected anomaly, shown with the injected row's values and label
merged_df

Unnamed: 0,anomaly_timestamp,anomaly_grid,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
0,2013-12-14 10:00:00,5059,5059,2013-12-14 10:00:00,876.768421,385.107740,548.545813,671.959604,18605.611282,1
1,2013-12-14 11:00:00,5059,5059,2013-12-14 11:00:00,1479.136379,556.760997,941.248122,1068.046517,28787.036181,1
2,2013-12-14 12:00:00,5059,5059,2013-12-14 12:00:00,1829.513298,581.343243,1050.937450,1413.252409,38738.167569,1
3,2013-12-14 13:00:00,5059,5059,2013-12-14 13:00:00,1922.505204,896.034323,1102.463478,1290.052171,44261.699773,1
4,2013-12-14 14:00:00,5059,5059,2013-12-14 14:00:00,1789.207042,742.470167,928.660964,1149.592928,48423.608645,1
...,...,...,...,...,...,...,...,...,...,...
118,2013-12-18 17:00:00,5061,5061,2013-12-18 17:00:00,143.974377,589.547840,990.625482,985.346521,12171.806494,2
119,2013-12-18 18:00:00,5259,5259,2013-12-18 18:00:00,116.971775,687.425294,921.902875,871.946602,14083.656641,2
120,2013-12-18 18:00:00,5262,5262,2013-12-18 18:00:00,103.541068,397.723588,851.915770,990.206204,11507.953912,2
121,2013-12-18 18:00:00,6064,6064,2013-12-18 18:00:00,139.284669,573.242302,874.314790,1102.693663,10755.673651,2


In [27]:
# Injected anomalies recovered by the detector, out of all injected anomalies
n_matched, n_injected = len(merged_df), len(injected_anomalies)
print(f"{n_matched} / {n_injected}")

123 / 143
