# Assignment 9.2

> Replace all TODOs with your code. Do not change any other code.

In [None]:
# Do not edit this cell

from typing import List

## Descriptive statistics

In this assignment, we will write functions that calculate basic statistics from scratch, without using numpy.

### Task 1

Let's start simple: write a function `mean` that calculates the average of the list.

$$\mu = \frac{{\sum_{i=1}^n x_i}}{{n}}$$

In [1]:
from typing import List

def mean(li: List[float]) -> float:
    """Return the arithmetic mean of ``li``.

    Raises:
        ValueError: If ``li`` is empty.
    """
    if li:
        total = sum(li)
        return total / len(li)
    raise ValueError("List cannot be empty")

# Sanity checks for mean(); both expected values are exact in floating point.
assert mean([1., 2., 3.]) == 2.
assert mean([1., 1., 2., 0.]) == 1.


### Task 2

Now let's calculate variance (dispersion). You may use the `mean` function implemented before.

$$V = \frac{{\sum_{i=1}^n (x_i - \mu)^2}}{{n}}$$

In [2]:
from typing import List

def mean(li: List[float]) -> float:
    """Average the values in ``li``.

    Raises:
        ValueError: If ``li`` is empty.
    """
    if li:
        running_sum = sum(li)
        return running_sum / len(li)
    raise ValueError("List cannot be empty")

def variance(li: List[float]) -> float:
    """Return the population variance of ``li``.

    This is the mean of the squared deviations from ``mean(li)``; the sum is
    divided by ``n``, not ``n - 1``.

    Raises:
        ValueError: If ``li`` is empty.
    """
    if li:
        center = mean(li)
        squared_deviations = [(value - center) ** 2 for value in li]
        return sum(squared_deviations) / len(li)
    raise ValueError("List cannot be empty")

# Sanity checks for variance(): identical values have zero spread, and
# [1, 2, 3, 4] has squared deviations summing to 5, so 5 / 4 == 1.25.
assert variance([1., 1., 1.]) == 0.
assert variance([1., 2., 3., 4.]) == 1.25


### Task 3

The standard deviation is easy once you get the variance:

$$\sigma = \sqrt{V}$$

In [3]:
from typing import List
import math

def mean(li: List[float]) -> float:
    """Return the sum of ``li`` divided by its length.

    Raises:
        ValueError: If ``li`` is empty.
    """
    if li:
        summed = sum(li)
        return summed / len(li)
    raise ValueError("List cannot be empty")

def variance(li: List[float]) -> float:
    """Return the population variance of ``li`` (sum of squared deviations / n).

    Raises:
        ValueError: If ``li`` is empty.
    """
    if li:
        avg = mean(li)
        squares = [(item - avg) ** 2 for item in li]
        return sum(squares) / len(li)
    raise ValueError("List cannot be empty")

def std(li: List[float]) -> float:
    """Return the standard deviation of ``li``: the square root of ``variance(li)``."""
    spread = variance(li)
    return math.sqrt(spread)

# Sanity checks for std(): zero spread for identical values, and the square
# root of the known variance 1.25 for [1, 2, 3, 4].
assert std([1., 1., 1.]) == 0.
assert std([1., 2., 3., 4.]) == math.sqrt(1.25)


### Task 4

**Median**

The median is the middle value in a sorted dataset. If the dataset has an odd number of values, the median is the value at the center. If the dataset has an even number of values, the median is the average of the two middle values.

In [4]:
from typing import List

def median(li: List[float]) -> float:
    """Return the median of ``li``.

    The input is left untouched; a sorted copy is examined. With an odd
    number of values the middle one is returned; with an even number, the
    average of the two middle values.

    Raises:
        ValueError: If ``li`` is empty.
    """
    ordered = sorted(li)
    if not ordered:
        # Without this guard an empty input takes the even-length branch and
        # fails with an unhelpful IndexError from ordered[-1].
        raise ValueError("List cannot be empty")
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2

# Sanity checks for median(): one odd-length input and one unsorted
# even-length input whose two middle values (2 and 3) average to 2.5.
assert median([1., 1., 1.]) == 1.
assert median([1., 4., 3., 2.]) == 2.5


## Measure performance

Sometimes, apart from theoretical, algorithmic complexity, it's a good idea to compare the runtime of two algorithms empirically, i.e., run the code many times and time it.

In Python's standard library, we have the [timeit](https://docs.python.org/3/library/timeit.html) module, which does exactly that.

Let's compare the runtime of your implementations and numpy. Use the provided setup code:

In [5]:
import timeit
import numpy as np
import random

def mean(li):
    """Return the arithmetic mean of ``li``.

    Raises:
        ValueError: If ``li`` is empty.
    """
    count = len(li)
    if count == 0:
        # Report an empty input explicitly rather than letting the division
        # below fail with ZeroDivisionError.
        raise ValueError("List cannot be empty")
    return sum(li) / count

def variance(li):
    """Return the population variance of ``li`` (divides by n, not n - 1).

    Raises:
        ValueError: If ``li`` is empty.
    """
    if len(li) == 0:
        # Reject empty input up front with a clear message instead of
        # relying on whatever error the mean computation happens to raise.
        raise ValueError("List cannot be empty")
    mu = mean(li)
    return sum((x - mu) ** 2 for x in li) / len(li)

def std(li):
    """Return the standard deviation of ``li``: ``variance(li)`` raised to the power 0.5."""
    spread = variance(li)
    return pow(spread, 0.5)

def median(li):
    """Return the median of ``li`` without modifying it.

    With an odd number of values the middle one of the sorted copy is
    returned; with an even number, the average of the two middle values.

    Raises:
        ValueError: If ``li`` is empty.
    """
    ordered = sorted(li)
    if not ordered:
        # An empty input would otherwise reach ordered[-1] in the even-length
        # branch and surface as a confusing IndexError.
        raise ValueError("List cannot be empty")
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2

# Setup snippet for timeit: builds the benchmark inputs, a NumPy array `arr`
# and a plain Python list `li`, each holding 10,000 random values scaled by
# 100. The two are generated independently, so they do not hold the same data.
# NOTE(review): this snippet does not define mean/variance/std/median, so a
# timed statement that calls them also needs those names supplied (in the
# setup string or through timeit's `globals` argument).
setup = '''
import random
import numpy as np
arr = np.random.rand(10_000) * 100
li = [random.random() * 100 for _ in range(10_000)]
'''

# Maps a label to the callable it names: the custom implementations defined
# in this cell and their NumPy counterparts.
funcs = {
    'mean': mean,
    'variance': variance,
    'std': std,
    'median': median,
    'np_mean': np.mean,
    'np_var': np.var,
    'np_std': np.std,
    'np_median': np.median
}

### Task 5

Complete the Python statements to compare your functions to numpy. Use `li` for your functions and `arr` for the numpy functions.

In [None]:
# Statements to be timed, kept as source strings for timeit. The custom
# functions are called on the Python list `li`, the NumPy functions on the
# array `arr`.
stmt_mean_custom = 'mean(li)'
stmt_mean_np = 'np.mean(arr)'

stmt_var_custom = 'variance(li)'
stmt_var_np = 'np.var(arr)'
stmt_std_custom = 'std(li)'
stmt_std_np = 'np.std(arr)'
stmt_median_custom = 'median(li)'
stmt_median_np = 'np.median(arr)'


### Task 6

Measure the average execution time of your statements with the `timeit` module. As your submission, fill out the table with the results (rounded to 2 decimal places).

In [6]:
import timeit

# Setup code including definitions of custom functions.
# timeit executes this snippet once before the timed loop, so generating the
# data and defining the functions is not part of the measurement. The custom
# functions are defined inside the string because the timed statements are
# evaluated in timeit's own namespace, not in this notebook's globals.
# `arr` and `li` are generated independently and do not hold the same values.
setup = '''
import random
import numpy as np
arr = np.random.rand(10_000) * 100
li = [random.random() * 100 for _ in range(10_000)]

def mean(li):
    return sum(li) / len(li)

def variance(li):
    mu = mean(li)
    return sum((x - mu) ** 2 for x in li) / len(li)

def std(li):
    return variance(li) ** 0.5

def median(li):
    sorted_li = sorted(li)
    n = len(sorted_li)
    if n % 2 == 0:
        mid = n // 2
        return (sorted_li[mid - 1] + sorted_li[mid]) / 2
    else:
        return sorted_li[n // 2]
'''

# Function to measure execution time and calculate average time
def measure_time(stmt, setup, number=10_000):
    """Return the mean time in seconds of one execution of ``stmt``.

    ``setup`` is run once by timeit before the timed loop; ``stmt`` is then
    executed ``number`` times and the total elapsed time is divided by
    ``number``.
    """
    timer = timeit.Timer(stmt=stmt, setup=setup)
    elapsed = timer.timeit(number=number)
    return elapsed / number

# Statements for custom and numpy functions
stmt_mean_custom = 'mean(li)'
stmt_mean_np = 'np.mean(arr)'
stmt_var_custom = 'variance(li)'
stmt_var_np = 'np.var(arr)'
stmt_std_custom = 'std(li)'
stmt_std_np = 'np.std(arr)'
stmt_median_custom = 'median(li)'
stmt_median_np = 'np.median(arr)'

# Measure execution time for each function
# Every measure_time call re-runs `setup`, so each statement is timed on its
# own freshly generated random data.
mean_custom_time = measure_time(stmt=stmt_mean_custom, setup=setup)
var_custom_time = measure_time(stmt=stmt_var_custom, setup=setup)
std_custom_time = measure_time(stmt=stmt_std_custom, setup=setup)
median_custom_time = measure_time(stmt=stmt_median_custom, setup=setup)

mean_np_time = measure_time(stmt=stmt_mean_np, setup=setup)
var_np_time = measure_time(stmt=stmt_var_np, setup=setup)
std_np_time = measure_time(stmt=stmt_std_np, setup=setup)
median_np_time = measure_time(stmt=stmt_median_np, setup=setup)

# Print results
# NOTE(review): measure_time divides the total by `number`, so the figures
# printed below are seconds per single call (averaged over 10,000 runs), not
# the time for 10,000 calls that the header text and the results table
# describe. Multiply by 10,000 before filling in the table.
print("Average execution time for 10,000 operations, seconds:")
print("Function\tCustom\t\tNumpy")
print(f"mean\t\t{mean_custom_time:.6f}\t{mean_np_time:.6f}")
print(f"variance\t{var_custom_time:.6f}\t{var_np_time:.6f}")
print(f"std\t\t{std_custom_time:.6f}\t{std_np_time:.6f}")
print(f"median\t\t{median_custom_time:.6f}\t{median_np_time:.6f}")


Average execution time for 10,000 operations, seconds:
Function	Custom		Numpy
mean		0.000050	0.000018
variance	0.001846	0.000061
std		0.001705	0.000064
median		0.001745	0.000155


Time per 10000 executions, secs

| Func       | Custom | Numpy |
| ---------- | ------ | ----- |
| mean       |        |       |
| var        |        |       |
| std        |        |       |
| median     |        |       |