In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp312-cp312-win_amd64.whl (10.9 MB)
   ---------------------------------------- 0.0/10.9 MB ? eta -:--:--
    --------------------------------------- 0.1/10.9 MB 4.3 MB/s eta 0:00:03
   -- ------------------------------------- 0.7/10.9 MB 11.1 MB/s eta 0:00:01
   ------ --------------------------------- 1.8/10.9 MB 14.2 MB/s eta 0:00:01
   --------- ------------------------------ 2.5/10.9 MB 15.8 MB/s eta 0:00:01
   ------------ --------------------------- 3.5/10.9 MB 18.8 MB/s eta 0:00:01
   -------------- ------------------------- 4.1/10.9 MB 18.7 MB/s eta 0:00:01
   ------------------ --------------------- 5.1/10.9 MB 19.1 MB/s eta

In [27]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [28]:
# Function to generate synthetic data for vehicular networks
def generate_synthetic_data(n_normal=1000, n_abnormal=50, seed=42):
    """Generate a shuffled matrix of synthetic vehicular-network traffic samples.

    Every row is one sample with four columns, in this order:
    signal strength, message frequency, message type, time interval.

    * Normal rows:   signal ~ N(0.5, 0.1), frequency ~ N(60, 10),
      type drawn uniformly from {1, 2, 3}, interval ~ N(0.5, 0.1).
    * Abnormal rows: signal ~ N(0.2, 0.1), frequency ~ N(100, 20),
      type drawn uniformly from {5, 6}, interval ~ N(0.2, 0.1).

    Parameters
    ----------
    n_normal : int, default 1000
        Number of normal-traffic rows.
    n_abnormal : int, default 50
        Number of abnormal-traffic rows.
    seed : int or None, default 42
        Seed for a private ``numpy.random.RandomState``.  The global NumPy
        random state is neither read nor modified, so calling this function
        does not disturb other random draws in the notebook.  With the
        defaults the draws follow the same legacy stream that
        ``np.random.seed(42)`` followed by the module-level functions produces.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_normal + n_abnormal, 4)`` with the rows in
        shuffled order.  No normal/abnormal label is returned.
    """
    # A private generator keeps the function free of global side effects.
    rng = np.random.RandomState(seed)

    # The draw order below matters: it determines which values each column gets.
    normal_traffic = np.column_stack((
        rng.normal(0.5, 0.1, n_normal),   # signal strength
        rng.normal(60, 10, n_normal),     # message frequency
        rng.randint(1, 4, n_normal),      # message types (upper bound exclusive)
        rng.normal(0.5, 0.1, n_normal)    # time intervals
    ))
    abnormal_traffic = np.column_stack((
        rng.normal(0.2, 0.1, n_abnormal),   # signal strength
        rng.normal(100, 20, n_abnormal),    # message frequency
        rng.randint(5, 7, n_abnormal),      # message types (upper bound exclusive)
        rng.normal(0.2, 0.1, n_abnormal)    # time intervals
    ))

    data = np.vstack([normal_traffic, abnormal_traffic])
    rng.shuffle(data)  # in-place row shuffle so abnormal rows are not all at the end
    return data

In [29]:
# Generate synthetic data
# Build the shuffled traffic matrix (normal + abnormal rows; columns are signal
# strength, message frequency, message type, time interval) and print it.
# NumPy abbreviates the printed array, showing only the first and last rows.
data = generate_synthetic_data()
print(data)

[[ 0.53686733 66.96954405  1.          0.47795138]
 [ 0.51818663 64.288165    2.          0.52708426]
 [ 0.50184184 67.9426468   1.          0.34322922]
 ...
 [ 0.70754008 52.52788321  2.          0.44001711]
 [ 0.64412733 70.91310121  3.          0.45512834]
 [ 0.51503018 61.9049968   3.          0.47628123]]


In [30]:
# Data preprocessing and feature engineering
# Standardize each feature column to zero mean and unit variance so that the
# large-valued message-frequency column does not dominate PCA and clustering.
# The fitted scaler is kept so the same scaling can be applied to new samples.
scaler = StandardScaler().fit(data)
data_normalized = scaler.transform(data)
print(data_normalized)

[[ 0.42217624  0.32734753 -1.03476853 -0.06534699]
 [ 0.26474294  0.1275707  -0.12707684  0.34437174]
 [ 0.1269958   0.3998488  -1.03476853 -1.18879429]
 ...
 [ 1.86053598 -0.74863213 -0.12707684 -0.38168065]
 [ 1.32611939  0.62116328  0.78061485 -0.25566819]
 [ 0.23814169 -0.04998787  0.78061485 -0.07927443]]


In [31]:
# Applying PCA for dimensionality reduction
# A float n_components in (0, 1) asks PCA to keep the smallest number of
# components whose cumulative explained variance reaches that fraction.
# NOTE(review): the df.head() output further down lists PC1..PC4, so all four
# components are retained here; PCA rotates the data but drops no dimension.
pca = PCA(n_components=0.95)  # Keeping 95% of variance
data_reduced = pca.fit_transform(data_normalized)

In [34]:
# Bayesian Gaussian Mixture Model
# Variational mixture with a Dirichlet-process prior on the weights:
# n_components is only an upper bound on the number of clusters.
# NOTE(review): per the scikit-learn docs a small weight_concentration_prior
# favours fewer active components; confirm 0.01 / 0.8 are the intended priors.
bgmm = BayesianGaussianMixture(
    n_components=10,
    covariance_type='full',
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=0.01,
    mean_precision_prior=0.8,
    random_state=42,
)
bgmm.fit(data_reduced)

In [35]:
# Predicting the cluster for each data point
# Hard assignment: one mixture-component index per row of data_reduced.
# `labels` is what the silhouette-score cell consumes.
# NOTE(review): the same predict call is repeated below as `cluster_predictions`.
labels = bgmm.predict(data_reduced)

In [37]:
# Predicting the clusters and probabilities
# cluster_predictions: hard component index per row (same call that produced `labels`).
# probabilities: per-row posterior over the mixture components, one column per
# component; printed below in NumPy's abbreviated form.
cluster_predictions = bgmm.predict(data_reduced)
probabilities = bgmm.predict_proba(data_reduced)
print(probabilities)

[[1.23153254e-058 8.62759292e-015 2.63905283e-007 ... 9.99998950e-001
  9.65170192e-051 3.59050644e-094]
 [9.99978530e-001 4.89449543e-012 5.39649284e-006 ... 5.21696740e-035
  1.97370548e-049 7.34233433e-093]
 [4.54550016e-059 1.65258460e-013 7.26005208e-009 ... 9.99999971e-001
  2.65500459e-052 9.87681879e-096]
 ...
 [9.99999964e-001 2.46596790e-014 9.03793396e-009 ... 2.12069112e-035
  3.30584755e-052 1.22980040e-095]
 [2.51938446e-058 2.00862804e-010 9.31368707e-009 ... 7.19474946e-138
  3.40718344e-052 1.26749813e-095]
 [2.15853559e-058 1.49178759e-009 1.23926028e-006 ... 7.53897772e-138
  4.53239479e-050 1.68608530e-093]]


In [38]:
# Calculating an anomaly score based on probabilities and distances
#   score = (1 - highest component posterior) * Euclidean distance to nearest component mean
# NOTE(review): a row that is confidently assigned to one component scores ~0
# however far it lies from that component's mean (the df.head() scores below are
# ~1e-7), so this mostly ranks assignment ambiguity; confirm that is intended.
# NOTE(review): means_ includes every fitted component; confirm whether
# components with negligible weight should be excluded from the distance.
cluster_centers = bgmm.means_

# Broadcast (n_samples, 1, n_features) against (1, n_components, n_features)
# to get every sample-to-mean offset, then reduce over the feature axis.
offsets = data_reduced[:, np.newaxis, :] - cluster_centers[np.newaxis, :, :]
distances = np.linalg.norm(offsets, axis=-1)
min_distances = distances.min(axis=1)

assignment_uncertainty = 1 - np.max(probabilities, axis=1)
anomaly_scores = assignment_uncertainty * min_distances  # Combine distance and probability

In [39]:
# Defining outliers based on anomaly scores
# The cut-off is relative: it is the 95th percentile of the scores themselves,
# so roughly 5% of rows are flagged whatever the scores' absolute values are.
threshold = np.percentile(anomaly_scores, 95)  # Top 5% are considered outliers
# Strict comparison: rows exactly at the threshold are not flagged.
outliers = anomaly_scores > threshold

In [40]:
# Creating a DataFrame for visualization or further analysis
# One row per sample: the principal-component coordinates (PC1, PC2, ...)
# followed by the hard cluster label, the anomaly score and the outlier flag.
pc_columns = [f'PC{i+1}' for i in range(data_reduced.shape[1])]
df = pd.DataFrame(data_reduced, columns=pc_columns).assign(
    Cluster=cluster_predictions,
    Anomaly_Score=anomaly_scores,
    Outlier=outliers,
)

In [41]:
# Evaluate the clustering using silhouette score
# Computed on the PCA-transformed data with the hard labels from bgmm.predict.
# Per the scikit-learn docs the score lies in [-1, 1]; values near 0 indicate
# overlapping clusters.
score = silhouette_score(data_reduced, labels)
print(f'Silhouette Score: {score}')

Silhouette Score: 0.1471518748296942


In [42]:
# Preview the first five rows of the results table as plain text.
print(df.head())

        PC1       PC2       PC3       PC4  Cluster  Anomaly_Score  Outlier
0 -0.552328 -0.297241  0.369951 -0.911111        7   5.953280e-07    False
1 -0.293739  0.108211  0.350861 -0.002901        0   7.546198e-06    False
2  0.160929 -0.966346 -0.128786 -1.297565        7   4.053016e-08    False
3  0.327718  0.622097  0.270614 -1.463164        7   5.300385e-08    False
4  0.274199 -0.191484 -0.964964  0.848928        3   3.763752e-07    False
