# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load dataset
df = pd.read_csv("Sleep_Efficiency.csv")

# explore basic info
# Each result is printed explicitly: a bare expression is only echoed when it
# is the last statement of a notebook cell (and never in a plain script), so
# without print() these checks would produce no output at all.
print(df.head())
df.info()  # prints its own report and returns None, so no print() needed
print(df.shape)
print(df.describe(exclude=object))
print(df.dtypes)
print(df.isnull().sum())      # missing values per column
print(df.duplicated().sum())  # number of fully duplicated rows

# handle missing values
# Print the per-column missing counts before imputing (largest first); as a
# bare expression in the middle of a cell this result would be discarded.
print(df.isnull().sum().sort_values(ascending=False))

# Fill the consumption columns with their most frequent value (first mode).
mode_caff = df["caffeine_consumption"].mode(dropna=True)[0]
df["caffeine_consumption"] = df["caffeine_consumption"].fillna(mode_caff)
mode_alc = df["alcohol_consumption"].mode(dropna=True)[0]
df["alcohol_consumption"] = df["alcohol_consumption"].fillna(mode_alc)

# Fill the count-like columns with their median.
median_awake = df["awakenings"].median()
df["awakenings"] = df["awakenings"].fillna(median_awake)
median_exfreq = df["exercise_frequency"].median()
df["exercise_frequency"] = df["exercise_frequency"].fillna(median_exfreq)

# Print the counts again so the effect of the fills can be verified.
print(df.isnull().sum())

# define numeric and continuous columns
# A numeric column is treated as continuous when it has more than two
# distinct non-missing values.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
distinct_counts = df[numeric_cols].nunique()
continuous_cols = distinct_counts[distinct_counts > 2].index.tolist()

# detect outliers
# A value is flagged when it lies more than 1.5 * IQR below Q1 or above Q3.
# The flagged rows of every column are kept in `outlier_rows` (keyed by column
# name); previously each iteration overwrote `outliers`, so only the last
# column's result survived the loop and nothing was reported.
outlier_rows = {}
for col in continuous_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # NaN compares False on both sides, so missing values are never flagged.
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    outlier_rows[col] = outliers
    print(f"{col}: {len(outliers)} outlier rows outside [{lower:.2f}, {upper:.2f}]")

# create summary and visualize distributions
# For each continuous column: record the quartiles, IQR, the 1.5 * IQR bounds
# and the number of rows outside them, then draw a boxplot and a histogram.
outlier_summary = []
for col in continuous_cols:
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_bound = first_quartile - 1.5 * spread
    high_bound = third_quartile + 1.5 * spread

    # Boolean mask of rows outside the bounds; NaN is never flagged.
    is_outlier = (values < low_bound) | (values > high_bound)

    row = {
        "Column": col,
        "Q1": round(first_quartile, 2),
        "Q3": round(third_quartile, 2),
        "IQR": round(spread, 2),
        "Lower Bound": round(low_bound, 2),
        "Upper Bound": round(high_bound, 2),
        "Outlier Count": int(is_outlier.sum()),
    }
    outlier_summary.append(row)

    # Boxplot of the column on its own figure.
    _, box_ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=values, color='skyblue', ax=box_ax)
    box_ax.set_title(f"Boxplot for {col}")
    plt.show()

    # Histogram with a KDE overlay on a second figure.
    _, hist_ax = plt.subplots(figsize=(6, 3))
    sns.histplot(values, bins=30, kde=True, color='lightcoral', ax=hist_ax)
    hist_ax.set_title(f"Histogram for {col}")
    plt.show()

# display outlier summary
# Build one table from the per-column records and show it with the columns
# that have the most outliers first.
outlier_summary_df = pd.DataFrame(data=outlier_summary)
ranked_summary = outlier_summary_df.sort_values(by="Outlier Count", ascending=False)
display(ranked_summary)
