In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time

In [None]:
# Read the hourly CSV into a DataFrame; the path is relative to the notebook's working directory.
file_path = "top10_hourly.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Restrict the dataset to the six grid IDs listed below.
six_grids = (5059, 5159, 5262, 5061, 5259, 6064)
mask = data.gridID.isin(six_grids)
data = data.loc[mask]

In [None]:
# Parse the 'startTime' column into pandas datetimes so the .dt accessors used later work.
data['startTime'] = data['startTime'].pipe(pd.to_datetime)

In [None]:
# Keep records with startTime up to and including 2013-12-22 00:00:00.
# NOTE: the bare date string is compared as midnight, so only the 00:00 record of
# 2013-12-22 is kept; later hours of that day are excluded.
# .copy() makes filtered_data an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
filtered_data = data[data['startTime'] <= '2013-12-22'].copy()
filtered_data.tail()

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet
14612,6064,2013-12-21 20:00:00,402.716159,169.088563,290.953989,382.715295,4420.744152
14613,6064,2013-12-21 21:00:00,279.963939,121.01459,181.774419,204.39288,3919.924584
14614,6064,2013-12-21 22:00:00,314.778213,116.812276,132.910724,167.114616,3706.461627
14615,6064,2013-12-21 23:00:00,201.084586,143.489276,91.638965,105.45376,3030.986434
14616,6064,2013-12-22 00:00:00,97.498804,60.973281,56.722152,71.812944,2670.088054


In [None]:
# Add an `anomaly` label column, initialised to 0 for every row.
# Later cells set it to 1 / 2 / 3 for the injected internet / smsIn / callOut anomalies.
# .assign() builds a new frame instead of writing a column into a frame that may be
# a slice of `data`, which is what triggers pandas' SettingWithCopyWarning. It also
# keeps this cell idempotent when re-run.
filtered_data = filtered_data.assign(anomaly=0)
filtered_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['anomaly'] = 0


Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
2976,5059,2013-11-01 00:00:00,177.102719,149.554856,92.687108,113.072191,3549.351436,0
2977,5059,2013-11-01 01:00:00,101.345768,159.323853,38.493504,56.650364,2511.210825,0
2978,5059,2013-11-01 02:00:00,56.312823,106.53697,31.904731,60.508954,2180.79819,0
2979,5059,2013-11-01 03:00:00,96.496178,102.286239,28.394988,104.482368,1857.653625,0
2980,5059,2013-11-01 04:00:00,94.96823,130.207161,14.962744,36.634123,2211.357051,0


In [None]:
# Chronological 80/20 train/test split.
# Order every record by 'startTime' first, so the training set is the earliest 80%
# of rows and the test set is the latest 20%.
sorted_data = filtered_data.sort_values('startTime')

# Row position at which the test portion begins.
split_index = int(0.8 * sorted_data.shape[0])

# Slice by position: rows before the split go to train, the rest to test.
train_data, test_data = sorted_data.iloc[:split_index], sorted_data.iloc[split_index:]

In [None]:
# Inject synthetic anomalies into the test split and label them in `anomaly`:
#   1 = internet spike (x2.5), grid 5059 only, on 2013-12-14
#   2 = smsIn drop   (x0.1), every grid,      on 2013-12-18
#   3 = callOut drop (x0.1), every grid,      on 2013-12-16
# Each anomaly covers the hours 10 through 20 inclusive.
date_smsIn = datetime.datetime(2013, 12, 18).date()
date_callOut = datetime.datetime(2013, 12, 16).date()
date_internet = datetime.datetime(2013, 12, 14).date()

# Rebuild the test split as an independent copy before mutating it. Without this,
# re-running the cell would apply the multipliers a second time, and writing into
# an .iloc slice of sorted_data may warn or leak the changes into sorted_data.
test_data = sorted_data.iloc[split_index:].copy()

record_date = test_data['startTime'].dt.date
record_hour = test_data['startTime'].dt.hour
in_anomaly_hours = (record_hour >= 10) & (record_hour <= 20)

# The two drops apply to every grid in the test split, so a single mask per date
# selects the same rows as looping over each gridID.
anomaly_mask_1 = (record_date == date_smsIn) & in_anomaly_hours
test_data.loc[anomaly_mask_1, 'smsIn'] *= 0.1  # Drop in SMS In
test_data.loc[anomaly_mask_1, 'anomaly'] = 2  # Mark as smsIn anomaly

anomaly_mask_2 = (record_date == date_callOut) & in_anomaly_hours
test_data.loc[anomaly_mask_2, 'callOut'] *= 0.1  # Drop in Call Out
test_data.loc[anomaly_mask_2, 'anomaly'] = 3  # Mark as callOut anomaly

# The internet spike is injected into grid 5059 only.
anomaly_mask = (test_data['gridID'] == 5059) & (record_date == date_internet) & in_anomaly_hours
test_data.loc[anomaly_mask, 'internet'] *= 2.5  # Spike in internet usage
test_data.loc[anomaly_mask, 'anomaly'] = 1  # Mark as internet anomaly

In [None]:
# Snapshot the test split (with injected anomalies) as an independent deep copy.
test_data_abnormal = test_data.copy(deep=True)

In [None]:
# Preview the first five rows of the anomaly-injected test set.
test_data_abnormal.head(n=5)

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
3956,5059,2013-12-11 20:00:00,610.836958,314.077734,431.347894,570.465212,8655.849332,0
6932,5159,2013-12-11 20:00:00,304.832,160.43433,236.131107,298.689924,6043.488189,0
12884,5262,2013-12-11 20:00:00,372.332965,237.217398,214.492042,296.185859,6399.696074,0
5444,5061,2013-12-11 20:00:00,540.911612,256.452738,276.312785,286.472783,6563.453188,0
11396,5259,2013-12-11 20:00:00,445.939206,242.397584,270.895641,359.250442,8722.194369,0


In [None]:
# Show every test row that carries a non-zero anomaly label.
test_data_abnormal.loc[test_data_abnormal['anomaly'].ne(0)]

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly
4018,5059,2013-12-14 10:00:00,876.768421,385.107740,548.545813,671.959604,18605.611282,1
4019,5059,2013-12-14 11:00:00,1479.136379,556.760997,941.248122,1068.046517,28787.036181,1
4020,5059,2013-12-14 12:00:00,1829.513298,581.343243,1050.937450,1413.252409,38738.167569,1
4021,5059,2013-12-14 13:00:00,1922.505204,896.034323,1102.463478,1290.052171,44261.699773,1
4022,5059,2013-12-14 14:00:00,1789.207042,742.470167,928.660964,1149.592928,48423.608645,1
...,...,...,...,...,...,...,...,...
7100,5159,2013-12-18 20:00:00,41.315886,203.592291,237.206312,318.660442,5310.634604,2
4124,5059,2013-12-18 20:00:00,64.205901,381.568421,366.872152,550.024098,7004.802736,2
13052,5262,2013-12-18 20:00:00,41.573927,266.296170,279.372861,359.767609,6056.058536,2
5612,5061,2013-12-18 20:00:00,56.204301,265.853730,387.007308,391.540026,6905.431302,2


In [None]:
# Number of test rows that carry a non-zero anomaly label.
len(test_data_abnormal[test_data_abnormal['anomaly'] != 0])

143

In [None]:
# Start the wall-clock timer; it is read again in the later timing cell.
start_time = time.time()

# Z-score of 'internet' computed over the whole test set (all grids pooled),
# stored as a new column on test_data_abnormal.
test_data_abnormal['z_score_internet'] = (
    test_data_abnormal['internet']
    .sub(test_data_abnormal['internet'].mean())
    .div(test_data_abnormal['internet'].std())
)

# Rows more than 3 standard deviations from the mean that are also labelled
# as internet anomalies (anomaly == 1).
test_data_outliers_internet = test_data_abnormal[
    (test_data_abnormal['z_score_internet'].abs() > 3)
    & test_data_abnormal['anomaly'].eq(1)
]
test_data_outliers_internet

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly,z_score_internet
4019,5059,2013-12-14 11:00:00,1479.136379,556.760997,941.248122,1068.046517,28787.036181,1,3.565885
4020,5059,2013-12-14 12:00:00,1829.513298,581.343243,1050.93745,1413.252409,38738.167569,1,5.214294
4021,5059,2013-12-14 13:00:00,1922.505204,896.034323,1102.463478,1290.052171,44261.699773,1,6.12927
4022,5059,2013-12-14 14:00:00,1789.207042,742.470167,928.660964,1149.592928,48423.608645,1,6.818692
4023,5059,2013-12-14 15:00:00,1995.964117,827.534748,1151.93363,1428.129267,46455.539809,1,6.49268
4024,5059,2013-12-14 16:00:00,2574.084658,1003.645021,1292.320932,1728.981653,48361.531999,1,6.808409
4025,5059,2013-12-14 17:00:00,2457.668355,758.452259,1245.438764,1740.500048,43081.053201,1,5.933695
4026,5059,2013-12-14 18:00:00,2116.96833,733.389263,1194.090082,1417.360181,42876.388437,1,5.899792
4027,5059,2013-12-14 19:00:00,1425.306129,550.077523,849.887673,1045.667494,36597.148693,1,4.859633
4028,5059,2013-12-14 20:00:00,795.811655,317.051882,402.925775,517.761024,28145.822051,1,3.459667


In [None]:
# Detect smsIn outliers per grid: rows whose per-grid z-score of 'smsIn' is more
# than 3 standard deviations from that grid's mean AND that are labelled as
# smsIn anomalies (anomaly == 2).
#
# The magnitude test is fully parenthesised (via .abs()) before being combined with
# the label test. Python's `&` binds tighter than `|`, so `a | b & c` means
# `a | (b & c)`, which would let any row with z > 3 through regardless of its label.
smsIn_outlier_frames = []  # one frame per grid; concatenated once after the loop
for grid in test_data_abnormal['gridID'].unique():
    # Use a loop-local name so the notebook-level `test_data` split is not overwritten.
    grid_rows = test_data_abnormal[test_data_abnormal['gridID'] == grid]
    z_score_smsIn = (grid_rows['smsIn'] - grid_rows['smsIn'].mean()) / grid_rows['smsIn'].std()
    outliers_smsIn = grid_rows[(z_score_smsIn.abs() > 3) & (grid_rows['anomaly'] == 2)]
    smsIn_outlier_frames.append(outliers_smsIn)

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
test_data_outliers_smsIn = (
    pd.concat(smsIn_outlier_frames) if smsIn_outlier_frames else test_data_abnormal.iloc[0:0]
)

test_data_outliers_smsIn

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly,z_score_internet
14483,6064,2013-12-16 11:00:00,2731.595686,1013.498572,2014.727268,241.99719,11831.893878,3,0.757257


In [None]:
# Detect callOut outliers per grid: rows whose per-grid z-score of 'callOut' is more
# than 3 standard deviations from that grid's mean AND that are labelled as
# callOut anomalies (anomaly == 3).
#
# The magnitude test is fully parenthesised (via .abs()) before being combined with
# the label test. Python's `&` binds tighter than `|`, so `a | b & c` means
# `a | (b & c)`, which would let any row with z > 3 through regardless of its label.
callOut_outlier_frames = []  # one frame per grid; concatenated once after the loop
for grid in test_data_abnormal['gridID'].unique():
    # Use a loop-local name so the notebook-level `test_data` split is not overwritten.
    grid_rows = test_data_abnormal[test_data_abnormal['gridID'] == grid]
    z_score_callOut = (grid_rows['callOut'] - grid_rows['callOut'].mean()) / grid_rows['callOut'].std()
    outliers_callOut = grid_rows[(z_score_callOut.abs() > 3) & (grid_rows['anomaly'] == 3)]
    callOut_outlier_frames.append(outliers_callOut)

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
test_data_outliers_callOut = (
    pd.concat(callOut_outlier_frames) if callOut_outlier_frames else test_data_abnormal.iloc[0:0]
)

test_data_outliers_callOut

Unnamed: 0,gridID,startTime,smsIn,smsOut,callIn,callOut,internet,anomaly,z_score_internet


In [None]:
# Stop the wall-clock timer and report the elapsed time.
# NOTE: start_time is set in the internet z-score cell, so this interval spans every
# cell executed in between, including any idle time between manual cell runs.
end_time = time.time()  # Second time.time() reading, taken when this cell runs
execution_time = end_time - start_time  # Elapsed seconds between the two readings

print(f"Execution Time: {execution_time:.4f} seconds")  # Shown with 4 decimal places

Execution Time: 0.2153 seconds


In [None]:
# Total number of flagged rows across the three detectors (internet, smsIn, callOut).
all_outliers = sum(
    outlier_frame.shape[0]
    for outlier_frame in (
        test_data_outliers_internet,
        test_data_outliers_smsIn,
        test_data_outliers_callOut,
    )
)
all_outliers

11

In [None]:
# Report detections against the number of test rows carrying a non-zero anomaly label.
labelled_anomaly_count = len(test_data_abnormal[test_data_abnormal["anomaly"] != 0])
print(f'Found outliers {all_outliers} out of {labelled_anomaly_count}')

Found outliers 11 out of 143
