# Task 1 – Exploratory Data Analysis (EDA)
This notebook performs the **minimum-essential** EDA requested for Week 3 Task 1:

* Descriptive statistics & missing-value check
* Univariate histograms for numeric features
* Correlation heat-map
* Top-10 categories bar chart (for the first low-cardinality categorical feature)


In [None]:
# --- Imports & settings ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling and display options applied for the rest of the notebook.
sns.set_style("whitegrid")
pd.set_option("display.max_columns", 50)  # show up to 50 columns when rendering a frame

# Relative path to the raw dataset (resolved against the current working directory).
DATA_PATH = Path("../data/MachineLearningRating_v3.txt")  # adjust as needed
# Number of rows to keep when down-sampling after load.
SAMPLE = 50_000  # set to None for full dataset
# Random seed used for the row sampling so results are reproducible.
SEED = 42


: 

In [None]:
# --- Load dataset (auto-detect delimiter) ---
def read_dataset(path: Path) -> pd.DataFrame:
    """Load a delimited text file, trying pipe, comma and tab separators in turn.

    The first separator that parses the file into more than one column wins.

    Parameters
    ----------
    path : Path
        Location of the delimited text file.

    Returns
    -------
    pd.DataFrame
        The parsed file, read with ``low_memory=False``.

    Raises
    ------
    ValueError
        If no candidate separator yields more than one column. The message
        lists the outcome for every separator tried, and the most recent
        underlying exception (if any) is attached as ``__cause__`` so that
        problems unrelated to the delimiter (missing file, bad encoding, ...)
        are not hidden.
    """
    attempts = []      # one human-readable outcome per separator tried
    last_error = None  # most recent exception, chained onto the final error
    for sep in ("|", ",", "\t"):
        try:
            candidate = pd.read_csv(path, sep=sep, low_memory=False)
        except Exception as exc:
            # Another separator may still parse the file, so keep going,
            # but remember why this attempt failed instead of discarding it.
            attempts.append(f"{sep!r}: {type(exc).__name__}: {exc}")
            last_error = exc
            continue
        if candidate.shape[1] > 1:  # more than one column => the separator matched
            return candidate
        attempts.append(f"{sep!r}: parsed into a single column")
    raise ValueError(
        f"Unable to read {path}. Check delimiter. Tried " + "; ".join(attempts)
    ) from last_error

# Read the whole file first, then optionally shrink it to a reproducible sample.
df = read_dataset(DATA_PATH)
print("Loaded shape:", df.shape)

# Down-sample only when a sample size is configured and the data is larger than it.
if SAMPLE is not None and len(df) > SAMPLE:
    df = df.sample(random_state=SEED, n=SAMPLE)
    print("Sampled rows:", df.shape)


In [None]:
# --- Summary statistics and missing-value counts ---
# Numeric columns only; the summary is transposed so each feature is one row.
numeric_df = df.select_dtypes(include="number")
display(numeric_df.describe().transpose())

# Null count per column, largest first; only columns with at least one null are shown.
missing = df.isna().sum().sort_values(ascending=False)
display(missing.loc[missing > 0].to_frame(name="missing"))


In [None]:
# --- Histograms for (at most) the first 12 numeric columns ---
cols = numeric_df.columns[:12]
if len(cols) > 0:
    # One subplot per selected column, 30 bins each.
    numeric_df.loc[:, cols].hist(figsize=(15, 10), bins=30)
    plt.suptitle("Distribution of Numeric Features", fontsize=16)
    # The layout rectangle stops at 0.95 of the figure height, keeping space for the suptitle.
    plt.tight_layout(rect=(0, 0.03, 1, 0.95))
    plt.show()


In [None]:
# --- Pearson correlation heat-map of the numeric features ---
# A correlation matrix needs at least two numeric columns to be informative.
if len(numeric_df.columns) > 1:
    corr = numeric_df.corr(method="pearson", numeric_only=True)
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr,
        cmap="coolwarm",
        center=0,  # anchor the diverging colour map at zero correlation
        square=True,
        linewidths=0.5,
    )
    plt.title("Pearson Correlation (numeric features)")
    plt.show()


In [None]:
# --- Top-10 categories bar chart ---
# Candidate features: object-dtype columns with at most 30 distinct values.
# Only the first candidate is plotted.
cat_cols = [c for c in df.select_dtypes(include="object").columns if df[c].nunique() <= 30]
if cat_cols:
    top_col = cat_cols[0]
    top_counts = df[top_col].value_counts().head(10)
    # seaborn >= 0.13 deprecates `palette` without `hue` (removal planned for 0.14),
    # so colour the bars by mapping `hue` to the same category vector as `y`.
    # `dodge=False` keeps one full-width bar per category on older seaborn versions.
    sns.barplot(
        x=top_counts.values,
        y=top_counts.index,
        hue=top_counts.index,
        dodge=False,
        palette="viridis",
    )
    # The hue mapping can add a legend that merely repeats the y-axis labels; drop it.
    # Done via the axes rather than `legend=False`, which older seaborn does not accept.
    if plt.gca().get_legend() is not None:
        plt.gca().get_legend().remove()
    plt.xlabel("Frequency")
    plt.ylabel(top_col)
    plt.title(f"Top-10 categories for {top_col}")
    plt.show()
