In [3]:
# For working with the data
import pandas as pd
import numpy as np

# For encoding the Browser Name and Version column
import hashlib

# For scaling the data
from sklearn.preprocessing import StandardScaler

# For reducing the number of dimensions 
from sklearn.decomposition import PCA

# For creating the model for anomaly detection
from sklearn.cluster import KMeans 
from sklearn.cluster import DBSCAN 

# For visualizing the data and results
import matplotlib.pyplot as plt
import seaborn as sns

# For exporting the model
import joblib

In [4]:
# Load the refined login dataset
df = pd.read_csv("../data/refined_data.csv")

# The CSV was written together with its row index, which pandas reads back
# as an "Unnamed: 0" column. It is only a row counter, so drop it (and any
# other unnamed index artifact) before it gets scaled and clustered as if it
# were a real feature.
df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

In [5]:
# One-hot encode "Device Type": the raw column is replaced by Device_* indicators
device_indicators = pd.get_dummies(df["Device Type"], prefix="Device")
df = pd.concat([df.drop(columns=["Device Type"]), device_indicators], axis=1)

# Convert the "Login Successful" column to integers
df = df.astype({"Login Successful": int})

def custom_hash_function(browser_name, version):
    """Hash a browser name/version pair into a 32-bit unsigned integer.

    The two values are concatenated without a separator, hashed with MD5,
    and the low 32 bits of the digest are returned, so equal pairs always
    map to the same integer.

    Args:
        browser_name: Browser name; coerced with ``str()``.
        version: Browser version; coerced with ``str()``.

    Returns:
        int: A value in ``[0, 2**32 - 1]``.
    """
    # Coerce to text first: a column read back from CSV can arrive as a float
    # or NaN, and ``str + float`` would raise TypeError. String inputs hash
    # exactly as before.
    string_to_hash = str(browser_name) + str(version)
    digest = hashlib.md5(string_to_hash.encode()).hexdigest()
    # Keep only the low 32 bits of the 128-bit digest.
    return int(digest, 16) & 0xffffffff

# Build the 'Browser Info' column by hashing each (name, version) pair.
# Zipping the two columns avoids DataFrame.apply(axis=1), which constructs a
# Series object for every row and is very slow on a frame of this size; the
# resulting values are the same.
df['Browser Info'] = [
    custom_hash_function(name, version)
    for name, version in zip(df['Browser Name'], df['Version'])
]

# Drop the 'Browser Name' and 'Version' columns
df = df.drop(columns=['Browser Name', 'Version'])

In [6]:
# Frequency-encode "User ID": each ID is replaced by how often it occurs
df['User ID'] = df['User ID'].map(df['User ID'].value_counts().to_dict())

# Break the dotted IP address into four integer octet columns
octet_columns = ['IP_Octet1', 'IP_Octet2', 'IP_Octet3', 'IP_Octet4']
df[octet_columns] = df['IP Address'].str.split('.', expand=True)
df = df.drop(columns=['IP Address'])

# The split yields strings, so convert every octet column to int
df = df.astype({column: int for column in octet_columns})

In [13]:
# Number of hash buckets used for the Country, Region and City columns
country_buckets = 200
region_buckets = 4_000
city_buckets = 10_000

# Helper that deterministically maps a string to one of `buckets` slots
def consistent_hash(value, buckets):
    """Return the SHA-256 hash of ``value`` reduced modulo ``buckets``.

    Args:
        value: String to hash; it is encoded as UTF-8 first.
        buckets: Modulus applied to the digest (the number of buckets).

    Returns:
        int: The digest, read as one big integer, modulo ``buckets``.
    """
    digest = hashlib.sha256(value.encode('utf-8')).digest()
    # The digest bytes read big-endian equal the hex digest parsed in base 16.
    return int.from_bytes(digest, 'big') % buckets

# Replace each location column with the hash bucket of its value
location_buckets = {
    'Country': country_buckets,
    'Region': region_buckets,
    'City': city_buckets,
}
for column, bucket_count in location_buckets.items():
    df[column] = df[column].apply(consistent_hash, args=(bucket_count,))

In [15]:
# Preview the first rows of df after the encoding steps
df.head()

Unnamed: 0,User ID,Country,Region,City,Login Successful,Year,Month,Day,Hour,Minute,...,Device_bot,Device_desktop,Device_mobile,Device_tablet,Device_unknown,Browser Info,IP_Octet1,IP_Octet2,IP_Octet3,IP_Octet4
0,20,2,666,7320,1,2020,2,3,12,43,...,False,False,True,False,False,2151221952,81,167,144,58
1,3,21,1167,6371,0,2020,2,3,12,43,...,False,False,True,False,False,1976883002,10,0,0,47
2,8,2,988,6988,1,2020,2,3,12,44,...,False,True,False,False,False,4206724864,80,202,228,214
3,5,21,988,6988,0,2020,2,3,12,44,...,False,False,True,False,False,99184204,170,39,78,177
4,535,2,988,6988,0,2020,2,3,12,44,...,False,False,True,False,False,430040412,10,0,61,212


In [14]:
# Report how many rows the final dataset holds, with thousands separators
print("Number of entries:", f"{len(df):,}")

Number of entries: 17,243,365


In [16]:
# Fit a StandardScaler on the frame, then transform that same frame with it
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

In [17]:
# Project the scaled data onto its first two principal components.
# random_state pins the randomized SVD solver that PCA may select for a
# matrix of this size, so the projection is the same on every run.
pca = PCA(n_components=2, random_state=42)
pca_data = pca.fit_transform(scaled_data)

In [18]:
max_clusters = 30  # Upper bound for the cluster counts to try

# Elbow scan: print the within-cluster sum of squares (WCSS) for every second
# cluster count from 15 up to (at most) max_clusters.
# NOTE(review): this scan runs on the 2-D pca_data, while the final K-means
# model is fitted on the full scaled_data - confirm that the k chosen here is
# meant to transfer between the two feature spaces.
for n_clusters in range(15, max_clusters + 1, 2):
    # Fix the seed so the printed WCSS values are reproducible across runs
    kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=42)
    kmeans.fit(pca_data)
    wcss_value = kmeans.inertia_

    print(n_clusters, ":", wcss_value)

15 : 2124029.191495523
17 : 1896835.882119525
19 : 1750453.527383565
21 : 1541831.4659493084
23 : 1442162.122555242
25 : 1322136.9328123694
27 : 1191892.3984913942
29 : 1110811.9510324877


In [19]:
# Training the K-means model
# random_state makes the cluster assignment reproducible across runs.
# NOTE(review): the WCSS values used to pick n_clusters were computed on
# pca_data, but the model is fitted on the full scaled_data - confirm this is
# intended.
kmeans = KMeans(n_clusters=23, n_init="auto", random_state=42)  # n_clusters were chosen based on the hybrid approach
kmeans.fit(scaled_data)
kmeans_clusters = kmeans.predict(scaled_data)

# Row-aligned flag: True where DBSCAN labels the row as noise
is_anomaly = np.zeros(len(scaled_data), dtype=bool)

# Perform DBSCAN on each K-means cluster
for cluster_label in np.unique(kmeans_clusters):
    # Positions (within scaled_data) of the rows assigned to this cluster
    member_rows = np.flatnonzero(kmeans_clusters == cluster_label)
    cluster_data = scaled_data[member_rows]

    # Perform DBSCAN
    dbscan = DBSCAN(eps=0.075, min_samples=2)
    dbscan_clusters = dbscan.fit_predict(cluster_data)

    # DBSCAN gives noise points the label -1; flag those rows as anomalies
    is_anomaly[member_rows[dbscan_clusters == -1]] = True

# Split the rows by position. Membership must be decided per row: np.isin
# compares individual elements, so testing scaled_data against the anomaly
# values would discard every row that merely shares one feature value with
# any anomaly. Both arrays keep the original row order of scaled_data.
anomalies = scaled_data[is_anomaly]
non_anomalies = scaled_data[~is_anomaly]

# Plot anomaly vs. non-anomaly counts.
# The two totals are plotted directly instead of expanding them into one
# label per row, which would allocate a list as long as the whole dataset
# just to count two categories.
count_labels = ["Non-Anomaly", "Anomaly"]
count_values = [len(non_anomalies), len(anomalies)]

plt.figure(figsize=(8, 6))
ax = sns.barplot(x=count_labels, y=count_values)
plt.xlabel("Anomaly")
plt.ylabel("Count")
plt.title("Anomaly Countplot")

# Write each bar's count just above the bar
for p in ax.patches:
    ax.annotate(
        format(p.get_height(), '.0f'),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

plt.show()

# Print the number of anomalies and non-anomalies
print("Anomaly Count:", len(anomalies))
print("Non-Anomaly Count:", len(non_anomalies))
