In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.inspection import permutation_importance

# Paths
base_dir = "C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis"
data_path = f"{base_dir}/data/raw/risk_factors_cervical_cancer.csv"
figures_dir = f"{base_dir}/results/figures"
tables_dir = f"{base_dir}/results/tables"

os.makedirs(figures_dir, exist_ok=True)
os.makedirs(tables_dir, exist_ok=True)

# Load dataset
df = pd.read_csv(data_path)

# Preprocess
df.replace('?', np.nan, inplace=True)

for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Features and target
X = df.drop(columns=['Biopsy'])
y = df['Biopsy']

if y.dtype == 'O':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Train best model (Random Forest)
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# --- Permutation Importance ---
perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

# Create DataFrame
importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance_Mean": perm_importance.importances_mean,
    "Importance_Std": perm_importance.importances_std
}).sort_values(by="Importance_Mean", ascending=False)

# Save importance table
importance_df.to_csv(f"{tables_dir}/permutation_feature_importance.csv", index=False)

# Plot
plt.figure(figsize=(10,6))
sns.barplot(x="Importance_Mean", y="Feature", data=importance_df, palette="viridis")
plt.title("Feature Importance (Permutation Importance)")
plt.xlabel("Mean Importance Score")
plt.ylabel("Feature")
plt.savefig(f"{figures_dir}/permutation_feature_importance.png", dpi=300, bbox_inches="tight")
plt.close()

print("Step 6 completed: Permutation importance results saved.")
print(f"- Table: {tables_dir}/permutation_feature_importance.csv")
print(f"- Plot:  {figures_dir}/permutation_feature_importance.png")


  df[col] = pd.to_numeric(df[col], errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

Step 6 completed: Permutation importance results saved.
- Table: C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/results/tables/permutation_feature_importance.csv
- Plot:  C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/results/figures/permutation_feature_importance.png
