In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Flag unusual logins: standardise the login features, cluster them with
# k-means, and mark the rows that lie farthest from their nearest centroid.

# Columns fed to the model; each must exist and contain no missing values.
FEATURE_COLUMNS = ['login_hour', 'ip_risk_score', 'location_change', 'device_change']
N_CLUSTERS = 2
# Rows whose distance is strictly above this quantile of all distances are
# flagged, so roughly the farthest 20% of logins are reported on every run.
ANOMALY_QUANTILE = 0.80

data = pd.read_csv('login_data.csv')

# Fall back to a 1-based row number when the file carries no identifier.
if 'user_id' not in data.columns:
    data['user_id'] = range(1, len(data) + 1)

# Fail fast with a message that names the problem instead of letting pandas
# or scikit-learn raise a generic error further down.
missing_columns = [col for col in FEATURE_COLUMNS if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"login_data.csv is missing required feature column(s): {missing_columns}"
    )

features = data[FEATURE_COLUMNS]

# StandardScaler lets NaN through, but KMeans rejects it without saying which
# column is at fault, so report the offending columns here.
columns_with_gaps = features.columns[features.isna().any()].tolist()
if columns_with_gaps:
    raise ValueError(
        f"Feature column(s) contain missing values: {columns_with_gaps}"
    )

# KMeans cannot form more clusters than there are rows.
if len(features) < N_CLUSTERS:
    raise ValueError(
        f"Need at least {N_CLUSTERS} rows to cluster, got {len(features)}"
    )

# Put every feature on a comparable scale so that no single column dominates
# the Euclidean distances used by k-means.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
data['cluster'] = kmeans.fit_predict(features_scaled)

# transform() yields one distance per cluster centre for every row; the
# smallest of them is the distance to the centre of the row's own cluster.
distances = kmeans.transform(features_scaled)
data['distance_to_center'] = np.min(distances, axis=1)

threshold = data['distance_to_center'].quantile(ANOMALY_QUANTILE)
data['anomaly'] = data['distance_to_center'] > threshold
data['anomaly_label'] = data['anomaly'].map({True: 'Anomalous', False: 'Normal'})

print(data[['user_id', *FEATURE_COLUMNS, 'anomaly_label']])

   user_id  login_hour  ip_risk_score  location_change  device_change  \
0        1           9             10                0              0   
1        2          18             40                0              0   
2        3           2             80                1              1   
3        4          23             90                1              1   
4        5          10             15                0              0   
5        6          11             12                0              0   
6        7           1             75                1              0   
7        8          22             85                1              1   
8        9           8             10                0              0   
9       10          13             18                0              0   

  anomaly_label  
0        Normal  
1        Normal  
2        Normal  
3     Anomalous  
4        Normal  
5        Normal  
6     Anomalous  
7        Normal  
8        Normal  
9        Normal 