In [None]:
pip install pycaret[full]

In [None]:
# Import PyCaret and display the installed version as the cell output,
# so the environment used for this run is recorded in the notebook.
import pycaret
pycaret.__version__

In [None]:
# Load the dataset named 'anomaly' through PyCaret's dataset helper.
# `data` is the raw input used by every setup() call below.
from pycaret.datasets import get_data
data = get_data('anomaly')

In [None]:
# Functional API: import exactly the PyCaret anomaly functions this notebook calls,
# instead of a star import, so it is clear where each name comes from.
from pycaret.anomaly import (
    assign_model,
    create_model,
    load_model,
    models,
    plot_model,
    predict_model,
    save_model,
    setup,
)

# Initialise the anomaly-detection experiment on the raw data; session_id fixes the seed.
s = setup(data, session_id = 123)

In [None]:
# Object-oriented API: create an AnomalyExperiment instance as an alternative
# to the module-level functions used in the rest of the notebook.
from pycaret.anomaly import AnomalyExperiment
exp = AnomalyExperiment()

In [None]:
# Show the class of the experiment object created above.
type(exp)

In [None]:
# Initialise the experiment object with the same data and session_id as the functional setup() call.
# NOTE(review): `exp` is not referenced again in the visible cells; the later steps use the functional API.
exp.setup(data, session_id = 123)

# **Create Model**

In [None]:
# Train the anomaly detector with model ID 'sod' (Subspace Outlier Detection in PyCaret's
# model list) and display the fitted estimator as the cell output.
model_test = create_model('sod')
model_test


In [None]:
# List the anomaly-detection models available in PyCaret's model library.
models()

In [None]:
# Attach the trained model's anomaly output to the data the experiment was set up with,
# and display the resulting frame.
# NOTE(review): presumably this adds the same 'Anomaly' / 'Anomaly_Score' columns that
# later cells read from predict_model's output — confirm against the PyCaret docs.
model_anomalies = assign_model(model_test)
model_anomalies

In [None]:
# Visualise the trained detector with PyCaret's 'tsne' plot.
plot_model(model_test, plot = 'tsne')

In [None]:
# Score the raw data with the trained detector and display the result.
# Later cells read the 'Anomaly' and 'Anomaly_Score' columns from `model_pred`.
model_pred = predict_model(model_test, data=data)
model_pred

In [None]:
import numpy as np
import pandas as pd

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Cluster only the rows labelled normal (Anomaly == 0). The two detector output
# columns are dropped so they are not used as clustering features.
new = model_pred
non_anomalous_data = new[new['Anomaly'] == 0].drop(['Anomaly', 'Anomaly_Score'], axis=1)

# Standardise the features so each one contributes on the same scale to the
# distances used by k-means and by the silhouette score.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(non_anomalous_data)

best_score = -1  # silhouette scores lie in [-1, 1], so any real score beats this
best_k = None
best_kmeans = None

# silhouette_score requires between 2 and n_samples - 1 distinct labels, so the
# candidate range is capped by the number of available rows.
max_k = min(10, len(scaled_data) - 1)

for k in range(2, max_k + 1):
    # n_init is passed explicitly because its default differs between scikit-learn
    # releases; fixing it keeps the result stable and avoids the FutureWarning.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    cluster_labels = kmeans.fit_predict(scaled_data)
    score = silhouette_score(scaled_data, cluster_labels)
    print(f"Silhouette Score for {k} clusters: {score}")

    # Keep the fitted model with the highest silhouette score seen so far.
    if score > best_score:
        best_score = score
        best_k = k
        best_kmeans = kmeans

# Fail early with a clear message instead of passing best_k=None to later cells.
if best_k is None:
    raise ValueError("Need at least 3 non-anomalous rows to evaluate k-means clusterings.")

print(f"Best number of clusters: {best_k}")

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the standardised normal rows onto two principal components.
# The projection is used for plotting and for inspecting feature loadings.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_data)

# Colour the points with the labels of the k-means model selected by silhouette score
# (fitted on scaled_data). Refitting k-means on the 2-D projection with another seed
# would overwrite `best_kmeans` and plot clusters that were never evaluated.
best_labels = best_kmeans.labels_

fig, ax = plt.subplots()
points = ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=best_labels, cmap='viridis')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('Clusters Visualized with PCA')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

print("Explained variance ratio:", pca.explained_variance_ratio_)

# One row per principal component, one column per input feature.
pca_components = pd.DataFrame(pca.components_, columns=non_anomalous_data.columns, index=['PC1', 'PC2'])
print("Feature contribution to each principal component:\n", pca_components)


In [None]:
import pandas as pd

# PC1 loading of every feature, keyed by feature name.
contributions = pca_components.loc['PC1'].to_dict()

# Rank features by the magnitude of their PC1 loading (the sign is irrelevant for ranking).
contributions_df = (
    pd.DataFrame(list(contributions.items()), columns=["Feature", "Contribution"])
    .assign(Absolute_Contribution=lambda frame: frame["Contribution"].abs())
    .sort_values(by="Absolute_Contribution", ascending=False)
)

best_features = contributions_df.head(4)
# The message is model-agnostic: the detector in this notebook is created with
# create_model('sod'), and the ranked items are input features, not model parameters.
print("Best 4 features for the anomaly model based on PCA contributions:")
print(best_features)



In [None]:
import copy

# Pick the four features with the largest absolute PC1 loading. The ranking is computed
# on a temporary Series so no helper row is added to `pca_components`.
sorted_features = pca_components.loc['PC1'].abs().sort_values(ascending=False)
selected_features = sorted_features.index[:4].tolist()

# Read the selected columns from the prediction frame directly; `data` is left bound to
# the raw dataset so earlier cells still behave the same when re-run.
X_selected = model_pred[selected_features]

# Refit a copy of the detector on the reduced feature set. Fitting `model_test` in place
# would leave the model that is saved later in the notebook trained on four columns only.
reduced_model = copy.deepcopy(model_test)
reduced_model.fit(X_selected)
anomaly_labels = reduced_model.predict(X_selected)

# Count rows per predicted label (1 is counted as anomaly, 0 as normal).
anomaly_count = np.sum(anomaly_labels == 1)
normal_count = np.sum(anomaly_labels == 0)

print(f"Anomaly count: {anomaly_count}")
print(f"Normal count: {normal_count}")


In [None]:
# Persist the trained detector under the name 'model_pipeline' using PyCaret's save_model.
save_model(model_test, 'model_pipeline')

In [None]:
# Reload the object saved under 'model_pipeline' and display it to confirm the round trip.
loaded_model_pipeline = load_model('model_pipeline')
loaded_model_pipeline