In [None]:
# importing libs and loading data
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Read the synthetic login records into the notebook-wide DataFrame.
df = pd.read_csv("../data/synthetic_logins.csv")

# Preview the first five rows to confirm the structure is correct.
df.head(n=5)

In [None]:
# Column dtypes and non-null counts; info() prints straight to stdout.
df.info()

# A notebook cell only renders its final expression, so this intermediate
# summary has to be displayed explicitly or its result is thrown away.
# include="all" summarises every column, not just the numeric ones.
display(df.describe(include="all"))

# Number of rows per label value (shows how many anomalies are present).
df["label"].value_counts()

In [None]:
# Parse the raw timestamp column into pandas datetimes and store the result
# back under the same column name.
parsed_timestamps = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_timestamps

In [None]:
# plotting the number of logins over time to see if activity is random or periodic
plt.figure(figsize=(12,5))
# one point per calendar day: group rows by the date part of the timestamp and count them
df.groupby(df["timestamp"].dt.date).size().plot(kind="line")
plt.title("logins per day")
plt.xlabel("Date")
plt.ylabel("count")
# show must be *called*; a bare `plt.show` only evaluates to the function object
plt.show()


In [None]:
# Class balance: bar chart with the number of rows for each label value.
label_ax = sns.countplot(x="label", data=df)
label_ax.set_title("class Distribution")
plt.show()

In [None]:
# Session duration spread, drawn as one box per label value.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x="label", y="session_duration", data=df, ax=ax)
ax.set_title("Session Duration Comparison")
plt.show()

In [None]:
# Distribution of failed login attempts per label value
# (looking for brute-force style anomalies).
attempts_fig = plt.figure(figsize=(12, 5))
attempts_ax = sns.boxplot(x="label", y="failed_attempts", data=df)
attempts_ax.set_title("Failed attempts (normal vs anomalous)")
plt.show()

In [None]:
# Login counts per location, split into one bar per label value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y="location", hue="label", data=df, ax=ax)
ax.set_title("Location vs Label")
plt.show()

In [None]:
# Ten most frequent IP addresses among the rows labelled "anomalous".
is_anomalous = df["label"] == "anomalous"
df_ip = df[is_anomalous]
ip_counts = df_ip["ip"].value_counts()
ip_counts.head(10)

In [None]:
# correlation heatmap to show which numeric values relate to anomalies
num_cols = ["login_success", "failed_attempts", "session_duration"]

# The label has to take part in the correlation, otherwise the heatmap only
# relates the features to each other and says nothing about anomalies.
# Encode it as a 0/1 indicator on a derived frame so df itself is not mutated.
corr_input = df[num_cols].assign(
    is_anomalous=(df["label"] == "anomalous").astype(int)
)
corr_matrix = corr_input.corr()

plt.figure(figsize=(8,5))
sns.heatmap(corr_matrix, annot=True, cmap="Reds")
plt.title("correlation heatmap")
plt.show()

In [None]:
# Single box plot of session_duration over all rows to expose outlying sessions.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(y="session_duration", data=df, ax=ax)
ax.set_title("Session Duration Outlier Overview")
plt.show()