In [1]:
import pandas as pd

# === Load your dataset ===
df = pd.read_csv("../Final/final_clean_dataset.csv")

# === List of numeric columns to check for outliers ===
numeric_cols = [
    "HDI",
    "Suicide_rate",
    "GDP_per_capita",
    "HDI_growth",
    "Suicide_change",
    "Suicide_per_HDI"
]

# === Function to remove outliers using IQR method ===
def remove_outliers_iqr(data, cols):
    df_clean = data.copy()
    for col in cols:
        if col in df_clean.columns:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Filter out rows outside IQR range
            before = len(df_clean)
            df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
            after = len(df_clean)
            
            print(f"Removed {before - after} outliers from '{col}'")
    return df_clean

# === Apply the function ===
df_cleaned = remove_outliers_iqr(df, numeric_cols)

# === Save cleaned dataset ===
df_cleaned.to_csv("final_dataset_cleaned_no_outlier.csv", index=False)

print("\n✅ Cleaned dataset saved as 'final_dataset_cleaned_no_outlier.csv'")
print(f"Original rows: {len(df)} | After cleaning: {len(df_cleaned)}")


Removed 0 outliers from 'HDI'
Removed 26 outliers from 'Suicide_rate'
Removed 125 outliers from 'GDP_per_capita'
Removed 56 outliers from 'HDI_growth'
Removed 167 outliers from 'Suicide_change'
Removed 12 outliers from 'Suicide_per_HDI'

✅ Cleaned dataset saved as 'final_dataset_cleaned_no_outlier.csv'
Original rows: 985 | After cleaning: 599


In [2]:
import pandas as pd
import numpy as np

# === Load your dataset ===
df = pd.read_csv("../Final/final_clean_dataset.csv")

# === Columns to check for extreme outliers ===
cols_to_check = ["Suicide_rate", "HDI", "GDP_per_capita"]

def modified_z_score(series):
    """Compute modified Z-score for extreme outlier detection."""
    median = series.median()
    mad = np.median(np.abs(series - median))
    if mad == 0:
        return np.zeros_like(series)
    return 0.6745 * (series - median) / mad

def drop_extreme_outliers(df, cols, threshold=5.0):
    """Remove only rows that are extreme outliers in any selected column."""
    df_clean = df.copy()
    mask = pd.Series([False] * len(df_clean), index=df_clean.index)
    
    for col in cols:
        if col in df_clean.columns:
            z_scores = modified_z_score(df_clean[col])
            extreme_mask = np.abs(z_scores) > threshold
            print(f"Extreme outliers in '{col}': {extreme_mask.sum()}")
            mask = mask | extreme_mask  # combine all extreme flags
    
    df_filtered = df_clean[~mask]
    print(f"\n✅ Removed {mask.sum()} extreme outlier rows total.")
    return df_filtered

# === Apply filter ===
df_cleaned = drop_extreme_outliers(df, cols_to_check, threshold=5.0)

# === Save cleaned dataset ===
df_cleaned.to_csv("final_dataset_cleaned_extreme_only.csv", index=False)

print(f"\nOriginal rows: {len(df)} | After cleaning: {len(df_cleaned)}")
print("Saved as 'final_dataset_cleaned_extreme_only.csv'")


Extreme outliers in 'Suicide_rate': 5
Extreme outliers in 'HDI': 0
Extreme outliers in 'GDP_per_capita': 115

✅ Removed 120 extreme outlier rows total.

Original rows: 985 | After cleaning: 865
Saved as 'final_dataset_cleaned_extreme_only.csv'
