In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

# Load the dataset that the rest of this script cleans.
df = pd.read_csv('processed_encoding.csv')

# Quick look at the data. Each piece is printed explicitly: a bare expression
# that is not the last statement of a cell (or that sits in a plain script) is
# evaluated and then silently discarded, so nothing would be shown otherwise.
print(df.head())
print(df.shape)
print(df.describe())

# Numeric columns only; this is the column set handed to the outlier treatment.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Columns that get a before/after boxplot unless the caller overrides `plot_cols`.
_DEFAULT_PLOT_COLS = (
    "daily_screen_time_hours",
    "caffeine_intake_mg_per_day",
    "weekly_anxiety_score",
)


def _plot_before_after(before, after, col):
    """Show side-by-side boxplots of one column before and after capping."""
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    fig.suptitle(f"Outlier Treatment: {col}", fontsize=14)

    # NaNs are dropped first: boxplot statistics computed over data that
    # contains NaN come out as NaN and the box is drawn empty.
    # Vertical orientation is the boxplot default, so it is not passed.
    axes[0].boxplot(before.dropna())
    axes[0].set_title("Before Treatment")
    axes[0].set_ylabel(col)

    axes[1].boxplot(after.dropna())
    axes[1].set_title("After Treatment")

    plt.show()


def treat_outliers_iqr(data, cols, *, whisker=1.5, plot_cols=_DEFAULT_PLOT_COLS):
    """Cap outliers in the given columns using the IQR rule.

    For every column in ``cols`` the bounds ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR`` are computed; values outside them are counted and
    then replaced by the nearest bound. Missing values stay missing.

    Args:
        data: Input DataFrame. It is not modified; a copy is treated.
        cols: Iterable of column labels to treat. Each must be numeric.
        whisker: Multiplier applied to the IQR when computing the bounds.
        plot_cols: Column labels for which a before/after boxplot is shown
            (only if the column is also in ``cols``). Pass ``()`` or ``None``
            to disable plotting.

    Returns:
        A ``(df_clean, outlier_summary)`` tuple: the treated copy of ``data``
        and a dict mapping each treated column to its outlier count.
    """
    df_clean = data.copy()
    outlier_summary = {}

    # Normalise once so the membership test in the loop is cheap and a lone
    # string is treated as one column name rather than a set of characters.
    if plot_cols is None:
        plot_cols = ()
    elif isinstance(plot_cols, str):
        plot_cols = (plot_cols,)
    plot_cols = frozenset(plot_cols)

    for col in cols:
        series = df_clean[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr

        # Count before capping; afterwards nothing lies outside the bounds.
        is_outlier = (series < lower_bound) | (series > upper_bound)
        outlier_summary[col] = int(is_outlier.sum())

        # clip() caps both tails in one call and leaves NaN entries untouched.
        df_clean[col] = series.clip(lower=lower_bound, upper=upper_bound)

        if col in plot_cols:
            _plot_before_after(data[col], df_clean[col], col)

    return df_clean, outlier_summary

# Run the IQR treatment over every numeric column and keep both outputs:
# the treated copy and the per-column outlier counts.
df_clean, outlier_summary = treat_outliers_iqr(df, numeric_cols)

print("Outliers detected per column:")
print(outlier_summary)

# Summary statistics for the raw data first, then for the treated copy.
for heading, frame in (
    ("\nBefore Treatment:\n", df),
    ("\nAfter Treatment:\n", df_clean),
):
    print(heading, frame[numeric_cols].describe())

# Write the treated data to disk, then pass the same file to the Colab
# `files.download` helper.
output_path = "processed_clean.csv"
df_clean.to_csv(output_path, index=False)
files.download(output_path)