In [1]:
!pip install imbalanced-learn



In [2]:
pip install shap imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack
import shap
import os

# Load dataset
df = pd.read_csv("cleaned_data/final/BERT/all_merged_labeled_contextual.csv")

# Drop NA values
target = 'emotional_impact_ML'
text_col = 'assault_desc'
struct_features = ['victim_profession', 'department', 'perpetrator_type', 'violence_type', 'emotional_impact', 'response_action']
df = df.dropna(subset=struct_features + [target, text_col])

# Encode target
y = LabelEncoder().fit_transform(df[target])

# Feature engineering
X_struct = pd.get_dummies(df[struct_features])
tfidf = TfidfVectorizer(max_features=300, stop_words='english')
X_text = tfidf.fit_transform(df[text_col])

# Combine features
X_combined = hstack([X_struct, X_text])
X_combined = StandardScaler(with_mean=False).fit_transform(X_combined)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Model training
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Evaluation and save confusion matrices
labels = sorted(df[target].unique())
os.makedirs("cleaned_data/final/imgs", exist_ok=True)

model_scores = {}

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred, target_names=labels, output_dict=True)
    model_scores[name] = report["weighted avg"]

    print(f"\n{name} - Classification Report")
    print(classification_report(y_test, y_pred, target_names=labels))

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues")
    plt.title(f"{name} - emotional_impact_ML Classification")
    plt.savefig(f"cleaned_data/final/imgs/{name.replace(' ', '_').lower()}_cm.png", bbox_inches='tight')
    plt.close()

# Feature Importance (Random Forest only)
if "Random Forest" in models:
    rf_model = models["Random Forest"]
    feature_names = list(X_struct.columns) + list(tfidf.get_feature_names_out())
    importances = rf_model.feature_importances_
    indices = np.argsort(importances)[-10:][::-1]  # top 10

    plt.figure(figsize=(10, 6))
    sns.barplot(x=importances[indices], y=np.array(feature_names)[indices])
    plt.title("Top 10 Feature Importances (Random Forest)")
    plt.tight_layout()
    plt.savefig("cleaned_data/final/imgs/rf_feature_importance.png")
    plt.close()

# SHAP Explanation (Random Forest)
X_train_dense = X_train_res.toarray()
X_test_dense = X_test[:100].toarray()
explainer = shap.Explainer(rf_model, X_train_dense)
shap_values = explainer(X_test_dense)

shap.summary_plot(shap_values, X_test_dense, feature_names=feature_names, max_display=10, show=False)
plt.title("SHAP Summary - Random Forest")
plt.savefig("cleaned_data/final/imgs/shap_summary.png", bbox_inches='tight')
plt.close()

# Compare models summary
print("\n\n📊 Model Comparison Summary (Weighted F1-score, Precision, Recall):")
for model, metrics in model_scores.items():
    print(f"{model}: F1={metrics['f1-score']:.2f}, Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}")

# KMeans Clustering + PCA
cluster_cols = ['victim_profession', 'department', 'violence_type']
cluster_df = df.dropna(subset=cluster_cols)
X_cluster = pd.get_dummies(cluster_df[cluster_cols])

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_cluster)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_cluster)

pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["Cluster"] = clusters

plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="Set2")
plt.title("KMeans Clustering of Roles & Violence Types (PCA 2D)")
plt.tight_layout()
plt.savefig("cleaned_data/final/imgs/kmeans_pca_roles.png")
plt.close()

# Cluster summaries
cluster_df['Cluster'] = clusters
for col in cluster_cols:
    print(f"\nTop 3 '{col}' per Cluster:")
    print(cluster_df.groupby('Cluster')[col].value_counts().groupby(level=0).nlargest(3))







Logistic Regression - Classification Report
              precision    recall  f1-score   support

        Mild       0.53      0.39      0.45        23
    Moderate       0.81      0.77      0.79       155
      Severe       0.43      0.53      0.47        51

    accuracy                           0.68       229
   macro avg       0.59      0.56      0.57       229
weighted avg       0.69      0.68      0.69       229


Random Forest - Classification Report
              precision    recall  f1-score   support

        Mild       0.71      0.52      0.60        23
    Moderate       0.88      0.81      0.84       155
      Severe       0.47      0.65      0.55        51

    accuracy                           0.74       229
   macro avg       0.69      0.66      0.66       229
weighted avg       0.77      0.74      0.75       229


SVM - Classification Report
              precision    recall  f1-score   support

        Mild       0.78      0.30      0.44        23
    Moderate    