In [1]:
import pandas as pd 
import numpy as np

# Statistial Analysis 

Topics:

 - Normalization (Min-Max)
 - Mean, Median, Mode
 - Standard Deviation + ±3σ rule (outlier idea)
 - Standardization (Z-score)
 - IQR (Interquartile Range) outliers
 - Skewness, Kurtosis

In [2]:
# We'll create a feature vector with a few outliers so we can see:
# - mean vs median difference
# - ±3σ and IQR outlier detection
# - skewness/kurtosis behavior

x = np.concatenate([
    np.random.normal(loc=50, scale=10, size=95),  # "typical" values
    np.array([120, 130, 140, 5, 8])               # outliers (high + low)
])

print("Count:", x.size)
print("First 10:", x[:10])

Count: 100
First 10: [61.13686093 38.0447419  38.52285928 30.46957862 46.78273561 63.84693092
 41.66508746 55.56235582 52.80131591 49.50995079]


In [3]:
from IPython.display import HTML

HTML("""
<iframe width="560" height="315" src="https://www.youtube.com/embed/i373-Vc2d4o?si=g33RiXfjTcZHKD5d" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
""")


In [4]:
# Mean   = average; sensitive to outliers, so it doesn't give accurate analysis
# Median = middle value after sorting; robust to outliers
# Mode   = most frequent value; useful for categorical/discrete data

mean_x = np.mean(x)
median_x = np.median(x)

# Mode in pure NumPy (works best for discrete values)
vals, counts = np.unique(x, return_counts=True)
mode_x = vals[np.argmax(counts)]  # for continuous random data, mode often isn't meaningful

print("Mean  :", mean_x)
print("Median:", median_x)
print("Mode  :", mode_x, "(Note: mode is not very meaningful for continuous random data)")

Mean  : 51.578381392150874
Median: 50.71083036266927
Mode  : 5.0 (Note: mode is not very meaningful for continuous random data)


In [5]:
# Std dev (σ) measures typical spread around the mean.
# ±3σ rule (empirical rule): for roughly normal data, ~99.7% points lie within mean ± 3σ.
# So values outside mean ± 3σ are often treated as potential outliers.

std_x = np.std(x, ddof=0)  # population std (ddof=0). For sample std, use ddof=1.

lower_3s = mean_x - 3 * std_x
upper_3s = mean_x + 3 * std_x

outliers_3s = x[(x < lower_3s) | (x > upper_3s)]

print("Std dev:", std_x)
print("3σ range:", (lower_3s, upper_3s))
print("Outliers by ±3σ rule:", outliers_3s)
print("Outlier count:", outliers_3s.size)

Std dev: 17.659886684062034
3σ range: (-1.4012786600352314, 104.55804144433698)
Outliers by ±3σ rule: [120. 130. 140.]
Outlier count: 3


## Min–Max Normalization

$x_{\text{norm}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$

## Standardization (Z-score)

$z = \frac{x - \mu}{\sigma}$

where $\mu$ is the mean and $\sigma$ is the standard deviation.

## Cons of Min–Max and how Standardization helps

**Cons of Min–Max**

Very sensitive to outliers: if one value is extremely large/small, $x_{\max}$ or $x_{\min}$ becomes extreme, and most normal values get squeezed into a tiny range near 0 or 1.

Range depends on the dataset: when new data arrives with a new max/min, the scaling changes, so the same value can map differently across datasets.

**How Standardization helps**

Less dominated by extreme min/max: it scales using $\mu$ and $\sigma$, so a single outlier usually affects scaling less than directly setting the entire range.

Produces comparable feature scales: features become centered at 0 with unit variance, which often helps gradient-based models (e.g., logistic regression, SVM, neural nets) learn faster and more stably.

In [6]:
# Normalization (Min-Max): rescales data into [0, 1]
# Formula: x_norm = (x - min(x)) / (max(x) - min(x))
# Useful when features have different units/ranges and you want them comparable.
# Note: very sensitive to outliers because min/max can be extreme.

x_min, x_max = x.min(), x.max()
x_norm = (x - x_min) / (x_max - x_min)

print("Min, Max:", x_min, x_max)
print("Normalized range:", (x_norm.min(), x_norm.max()))
print("First 5 normalized:", x_norm[:5])

Min, Max: 5.0 140.0
Normalized range: (0.0, 1.0)
First 5 normalized: [0.4158286  0.24477587 0.24831748 0.18866355 0.30950175]


In [7]:
# Standardization: makes data have mean ~0 and std ~1
# Formula: z = (x - mean) / std
# Helpful for models sensitive to scale (e.g., gradient descent, SVM, KNN, PCA).

x_z = (x - mean_x) / (std_x + 1e-8)  # epsilon avoids divide-by-zero

print("Standardized mean ~", x_z.mean())
print("Standardized std  ~", x_z.std())
print("First 5 standardized:", x_z[:5])

Standardized mean ~ 4.973799150320702e-16
Standardized std  ~ 0.9999999994337448
First 5 standardized: [ 0.54125373 -0.76634917 -0.73927553 -1.19529661 -0.27155586]


In [8]:
# Quartiles:
# Q1 = 25th percentile
# Q2 = 50th percentile (median)
# Q3 = 75th percentile
#
# IQR = Q3 - Q1 (spread of middle 50% of data)
#-----------q1--------/--------q2-------/---------q3-------/----------q4--------/
# IQR outlier rule (common robust rule):
# lower = Q1 - 1.5*IQR
# upper = Q3 + 1.5*IQR
# Values outside are potential outliers (more robust than ±3σ for skewed data).

Q1 = np.percentile(x, 25)
Q3 = np.percentile(x, 75)
IQR = Q3 - Q1

lower_iqr = Q1 - 1.5 * IQR
upper_iqr = Q3 + 1.5 * IQR

outliers_iqr = x[(x < lower_iqr) | (x > upper_iqr)]

print("Q1, Median, Q3:", Q1, median_x, Q3)
print("IQR:", IQR)
print("IQR range:", (lower_iqr, upper_iqr))
print("Outliers by IQR:", outliers_iqr)
print("Outlier count:", outliers_iqr.size)

Q1, Median, Q3: 43.647444338938 50.71083036266927 56.82280165879751
IQR: 13.175357319859508
IQR range: (23.88440835914874, 76.58583763858677)
Outliers by IQR: [ 17.08180503 120.         130.         140.           5.
   8.        ]
Outlier count: 6


In [9]:
# When to use what (quick rules):
# - Mean/Std: good for roughly normal numeric features (but sensitive to outliers)
# - Median/IQR: robust summaries when outliers/skew exist
# - Min-Max normalization: when you want [0,1] scaling (but outlier-sensitive)
# - Z-score standardization: common default for many ML algorithms
# - ±3σ rule: simple outlier rule for near-normal data
# - IQR rule: robust outlier rule for skewed/non-normal data

summary = {
    "mean": mean_x,
    "median": median_x,
    "std": std_x,
    "min": x_min,
    "max": x_max,
    "Q1": Q1,
    "Q3": Q3,
    "IQR": IQR,
    "outliers_3sigma_count": int(outliers_3s.size),
    "outliers_iqr_count": int(outliers_iqr.size),
}
pd.Series(summary)

mean                      51.578381
median                    50.710830
std                       17.659887
min                        5.000000
max                      140.000000
Q1                        43.647444
Q3                        56.822802
IQR                       13.175357
outliers_3sigma_count      3.000000
outliers_iqr_count         6.000000
dtype: float64