In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Paths
base_dir = "C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis"
data_path = f"{base_dir}/data/raw/risk_factors_cervical_cancer.csv"
figures_dir = f"{base_dir}/results/figures"
tables_dir = f"{base_dir}/results/tables"

# Load dataset
df = pd.read_csv(data_path)

# 1. Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

# 2. Convert columns to numeric where possible
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

# 3. Fill missing values
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# 4. Separate features and target
X = df.drop(columns=['Biopsy'])
y = df['Biopsy']

# Encode target if needed
if y.dtype == 'O':
    le = LabelEncoder()
    y = le.fit_transform(y)

# --- Method 1: Mutual Information ---
mi_scores = mutual_info_classif(X, y, discrete_features='auto', random_state=42)
mi_df = pd.DataFrame({"Feature": X.columns, "Mutual_Information": mi_scores})
mi_df = mi_df.sort_values(by="Mutual_Information", ascending=False)

# Save MI results
mi_df.to_csv(f"{tables_dir}/mutual_information_scores.csv", index=False)

# Plot MI
plt.figure(figsize=(10,6))
sns.barplot(x="Mutual_Information", y="Feature", data=mi_df, palette="viridis")
plt.title("Feature Importance (Mutual Information)")
plt.savefig(f"{figures_dir}/feature_importance_mutual_info.png", dpi=300, bbox_inches="tight")
plt.close()

# --- Method 2: Random Forest Importance ---
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)
rf_importances = rf.feature_importances_
rf_df = pd.DataFrame({"Feature": X.columns, "Random_Forest_Importance": rf_importances})
rf_df = rf_df.sort_values(by="Random_Forest_Importance", ascending=False)

# Save RF results
rf_df.to_csv(f"{tables_dir}/random_forest_importance.csv", index=False)

# Plot RF
plt.figure(figsize=(10,6))
sns.barplot(x="Random_Forest_Importance", y="Feature", data=rf_df, palette="mako")
plt.title("Feature Importance (Random Forest)")
plt.savefig(f"{figures_dir}/feature_importance_random_forest.png", dpi=300, bbox_inches="tight")
plt.close()

print("Feature selection results saved in:")
print(f"- {tables_dir}")
print(f"- {figures_dir}")


  df[col] = pd.to_numeric(df[col], errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

Feature selection results saved in:
- C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/results/tables
- C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/results/figures
