### 1. Basic Statistics

In [1]:
# Setup
from typing import List, Tuple
from collections import Counter
import numpy as np

In [2]:
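# Sample data: two equal-length lists, paired element-by-element (used together in the covariance and correlation cells)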
num_friends = [100.0,49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
daily_minutes = [1,68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]

In [3]:
# Summary of num_friends: size, extremes, and the runner-up to the maximum
num_points = len(num_friends)  # how many observations there are
smallest_value, largest_value = min(num_friends), max(num_friends)
# Second-largest value: one position before the end of the ascending copy
sorted_values = sorted(num_friends)
second_largest_value = sorted_values[-2]
# Report count, largest, smallest, and second-largest (in that order)
print(num_points, largest_value, smallest_value, second_largest_value)

204 100.0 1 49


### 2. Central Tendencies
- Mean, Median, Mode
- Quantile

In [4]:
# Mean
def mean(xs: List[float]) -> float:
    """Return the arithmetic mean of xs: the sum of the values divided by their count."""
    total = sum(xs)
    count = len(xs)
    return total / count
mean(num_friends)

7.333333333333333

In [5]:
# Median
def _median_odd(xs: List[float]) -> float: # Private function 1
    "If len(xs) is odd, the median is the middle value"
    return sorted(xs)[len(xs) // 2]

def _median_even(xs: List[float]) -> float: # Private function 2
    "Find the middle most value"
    sorted_xs = sorted(xs)
    middle_point = len(xs) // 2
    return (sorted_xs[middle_point -1] + sorted_xs[middle_point]) / 2

def median(v: List[float]) -> float:
    """Return the median of v, delegating to the odd- or even-length helper."""
    if len(v) % 2 == 1:
        return _median_odd(v)
    return _median_even(v)

median(num_friends)

6.0

In [6]:
# Mode
def mode(x: List[float]) -> List[float]:
    """Return every value tied for the highest number of occurrences in x.

    More than one value is returned when several share the top count.
    """
    frequency = Counter(x)
    top_count = max(frequency.values())
    most_common_values = []
    for value, occurrences in frequency.items():
        if occurrences == top_count:
            most_common_values.append(value)
    return most_common_values

In [7]:
mode(num_friends)

[6, 1]

In [8]:
# Quantile
def quantile(xs: List[float], p: float) -> float:
    """Return the value at the p-th quantile of xs.

    The result is the element at index int(p * len(xs)) of the sorted data,
    so roughly a fraction p of the values sort before it.

    Args:
        xs: Data points; the list itself is not modified.
        p: Desired quantile as a fraction in [0, 1].

    Returns:
        The element of xs found at the computed position in sorted order.

    Raises:
        ValueError: If p is outside [0, 1].
        IndexError: If xs is empty.
    """
    if not 0 <= p <= 1:
        # A negative p would otherwise silently index from the end of the sorted list.
        raise ValueError(f"p must be between 0 and 1, got {p}")
    # p == 1.0 maps to len(xs), one past the end, so clamp to the last element.
    p_index = min(int(p * len(xs)), len(xs) - 1)
    return sorted(xs)[p_index]
# Example: the 75th percentile of num_friends
quantile(num_friends, 0.75) # roughly 75% of the data points sort below the returned value (9 here)

9

## 3. Dispersion
Measure of how spread out our data is.
- Range
- Variance
- Standard Deviation
- Interquartile Range

### 3.1 Range
Difference between largest and smallest value.

In [9]:
def data_range(xs: List[float]) -> float:
    """Return the spread of xs: its largest value minus its smallest."""
    highest = max(xs)
    lowest = min(xs)
    return highest - lowest
# Use this
data_range(num_friends)

99.0

### 3.2 Variance
Population Variance:
$\sigma^2 = \frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2$

- $x_i$ = each data point
- $\mu$ = population mean
- $N$ = number of data points

Sample Variance:
$s^2 = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2$

- $x_i$ = each sample point
- $\bar{x}$ = sample mean
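- $n$ = sample size; dividing by $n-1$ instead of $n$ (Bessel's correction) compensates for measuring deviations from the sample mean $\bar{x}$ rather than the true population mean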

### 3.3 Standard Deviation
Square root of the variance.

In [10]:
### 3.4 Interquartile range
def interquartile_range(xs: List[float]) -> float:
    """Return the difference between the 75th- and 25th-percentile values of xs."""
    upper_quartile = quantile(xs, 0.75)
    lower_quartile = quantile(xs, 0.25)
    return upper_quartile - lower_quartile
# Use it
interquartile_range(num_friends)

6

## 4. Correlation
Measures the linear relationship between two metrics.

### 4.1 Covariance
Measures how two variables vary in tandem from their means.

Population covariance:
$\mathrm{Cov}(X, Y) = \frac{1}{N} \sum_{i=1}^{N} (x_i - \mu_X)(y_i - \mu_Y)$

- $x_i, y_i$ = paired data points
- $\mu_X, \mu_Y$ = means of X and Y
- $N$ = number of data pairs

Sample covariance:
$\mathrm{Cov}(X, Y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$
- $n$ = sample size
- $\bar{x}, \bar{y}$ = sample means

In [11]:
# Sample covariance
def covariance(xs: List[float], ys: List[float]) -> float:
    """Return the sample covariance of xs and ys.

    Sums the products of paired deviations from each list's mean and divides
    by len(xs) - 1. The two lists must have the same length.
    """
    assert len(xs) == len(ys),"xs and ys must have same number of elements"
    x_bar, y_bar = np.mean(xs), np.mean(ys)
    # Deviation of every point from its own list's mean
    x_deviations = [x - x_bar for x in xs]
    y_deviations = [y - y_bar for y in ys]
    cross_products = sum(dx * dy for dx, dy in zip(x_deviations, y_deviations))
    return cross_products / (len(xs) - 1)
# Use it
print(covariance(num_friends, daily_minutes))

22.425435139573057


### 4.2 Correlation
Correlation is unitless and always lies between -1 (perfect anticorrelation) and 1 (perfect correlation). It is highly sensitive to outliers: a single extreme value can change it substantially.

$r_{X,Y} \;=\; \frac{\mathrm{Cov}(X,Y)}{\sigma_X \, \sigma_Y}$
- $\mathrm{Cov}(X,Y)$ = covariance of X and Y
- $\sigma_X, \sigma_Y$ = standard deviations of X and Y

In [12]:
def correlation(xs: List[float], ys: List[float]) -> float:
    """Measure how much xs and ys vary in tandem about their means.

    Computes the sample correlation: the sample covariance divided by the
    product of the two sample standard deviations.

    Args:
        xs: Observations of the first variable.
        ys: Observations of the second variable, paired by index with xs.

    Returns:
        The correlation coefficient, or 0 when either input has fewer than
        two points or shows no variation.
    """
    if len(xs) < 2 or len(ys) < 2:
        return 0  # a sample standard deviation needs at least two points
    # ddof=1 yields the sample standard deviation, matching the (n - 1)
    # denominator used by covariance(). NumPy's default (ddof=0) would inflate
    # the result by n / (n - 1) and allow values outside [-1, 1].
    stdev_x = np.std(xs, ddof=1)
    stdev_y = np.std(ys, ddof=1)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(xs, ys) / stdev_x / stdev_y
    return 0  # if no variation, correlation is zero
    
# Use it
print(np.round(correlation(num_friends, daily_minutes),2)) # Moderate positive correlation

0.25


### 4.3 Simpson's Paradox
Correlations can be misleading when confounding variables are ignored. For example, the overall dataset may show a positive correlation while each group, analyzed separately, shows the opposite. A groupwise analysis may be more appropriate, depending on the research question.