In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's directory.
cleaned_csv_path = '../data/cleaned/South_East_Asia_Social_Media_MentalHealth_cleaned.csv'
data_df = pd.read_csv(cleaned_csv_path)

In [None]:
# Engagement counts used as the features for anomaly detection.
selected_columns = [
    'likes_received',
    'comments_received',
    'shares_received',
]
data_selected = data_df.loc[:, selected_columns]

# Standardise the features so that no single column dominates because of its
# scale. The fitted scaler is kept so the same transform can be reapplied.
scaler = StandardScaler().fit(data_selected)
data_scaled = scaler.transform(data_selected)

In [None]:
# Isolation Forest; `contamination` is the expected fraction of anomalies and
# `random_state` pins the random tree construction for reproducible results.
iso_forest = IsolationForest(random_state=42, contamination=0.05).fit(data_scaled)

# Score and label every row with the fitted model.
anomaly_scores = iso_forest.decision_function(data_scaled)  # negative values are more anomalous
anomaly_labels = iso_forest.predict(data_scaled)  # -1 = anomaly, 1 = normal

data_df['anomaly_score'] = anomaly_scores
data_df['is_anomaly'] = anomaly_labels

In [None]:
# Split the rows by their Isolation Forest label (-1 = anomaly, 1 = normal).
flagged_mask = data_df['is_anomaly'].eq(-1)
normal_mask = data_df['is_anomaly'].eq(1)
anomalies = data_df.loc[flagged_mask]
normal = data_df.loc[normal_mask]

# Report how many rows were flagged, in absolute and relative terms.
print(f"Total anomalies detected: {len(anomalies)}")
print(f"Percentage of anomalies: {len(anomalies) / len(data_df) * 100:.2f}%")

# Preview the first few flagged rows.
anomalies.head()

## Exploration

In [None]:
# Descriptive statistics of the engagement features for each group, so the
# flagged rows can be contrasted with the rows labelled normal.
sample_anomaly = anomalies.loc[:, selected_columns].describe()
sample_normal = normal.loc[:, selected_columns].describe()

print("Summary statistics for anomalies:")
print(sample_anomaly)

print("\nSummary statistics for normal data:")
print(sample_normal)

## Extra Credit
### Local Outlier Factor (LOF) algorithm

In [None]:
# Local Outlier Factor on the same scaled features, using the same expected
# anomaly fraction (0.05) as the Isolation Forest so the flagged sets are of
# comparable size. (LocalOutlierFactor is imported in the top import cell.)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
data_df['lof_anomaly'] = lof.fit_predict(data_scaled)  # -1 indicates anomaly

lof_anomalies = data_df[data_df['lof_anomaly'] == -1]
print(f"Total anomalies detected by LOF: {len(lof_anomalies)}")

# Compare LOF anomalies with Isolation Forest: count the rows that both
# detectors flag, then cross-tabulate the two label columns to show where
# they agree and disagree.
flagged_by_both = (data_df['lof_anomaly'] == -1) & (data_df['is_anomaly'] == -1)
print(f"Anomalies flagged by both LOF and Isolation Forest: {flagged_by_both.sum()}")

pd.crosstab(data_df['is_anomaly'], data_df['lof_anomaly'])