In [None]:
import sys

# Add the parent directory and the Data_Science folder (both given as
# relative paths) to the module search path, in that order.
sys.path.extend(["../", "Data_Science"])


In [2]:
import random
from typing import TypeVar, List, Tuple

# Splitting Data Sets

In [3]:
X = TypeVar("X")

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split ``data`` into two lists sized by the fractions [prob, 1 - prob].

    The input list is left untouched: a copy is shuffled with
    ``random.shuffle`` and then cut at index ``int(len(data) * prob)``.

    Args:
        data: the items to split.
        prob: fraction of the items that goes into the first list;
            must lie in the closed interval [0, 1].

    Returns:
        A ``(first, second)`` tuple where ``first`` holds
        ``int(len(data) * prob)`` items and ``second`` holds the rest.

    Raises:
        ValueError: if ``prob`` is outside [0, 1].
    """
    if not 0 <= prob <= 1:
        # A negative prob would turn into a negative slice index and silently
        # yield a split of the wrong size (e.g. -0.25 behaves like 0.75);
        # a prob above 1 would silently put everything in the first list.
        raise ValueError(f"prob must be between 0 and 1, got {prob}")

    shuffled = data[:]  # shallow copy so the caller's list keeps its order
    random.shuffle(shuffled)
    cut = int(len(shuffled) * prob)
    return shuffled[:cut], shuffled[cut:]

# Sanity check: split the integers 0..999 into a 75% / 25% partition.
data = list(range(1000))
train, test = split_data(data, 0.75)

# Each part must have the expected size ...
assert (len(train), len(test)) == (750, 250)

# ... and together the parts must contain exactly the original items.
assert sorted(train + test) == data


## Pairing with output variables

In [4]:
Y = TypeVar("Y")

def train_test_split(xs: List[X], ys: List[Y], test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The index positions (not the values) are split with ``split_data``, so
    ``xs[i]`` and ``ys[i]`` always end up in the same partition.

    Args:
        xs: input values.
        ys: output values; ``ys[i]`` is paired with ``xs[i]``.
        test_pct: fraction of the pairs assigned to the test set
            (``1 - test_pct`` is passed to ``split_data`` as the training
            fraction).

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: if ``xs`` and ``ys`` have different lengths.
    """
    if len(xs) != len(ys):
        # Without this check a longer ys is silently truncated and a shorter
        # ys fails later with an IndexError that hides the real problem.
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
        )

    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)
    return ([xs[i] for i in train_idxs],
            [xs[i] for i in test_idxs],
            [ys[i] for i in train_idxs],
            [ys[i] for i in test_idxs])

In [5]:
# Toy paired data set: every output is twice its input.
xs = list(range(1000))
ys = [2 * x for x in xs]

# Hold out 25% of the pairs for testing.
# NOTE(review): `t_test` receives the test outputs; the name looks like a typo
# for `y_test` -- left unchanged in case other cells reference it. Confirm.
x_train, x_test, y_train, t_test = train_test_split(xs, ys, 0.25)

# Print the size of each of the four parts.
print(*map(len, (x_train, x_test, y_train, t_test)))

750 250 750 250


# Correctness

In [7]:
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of predictions that were correct.

    Computed as (tp + tn) / (tp + fp + fn + tn), where the arguments are the
    counts of true positives, false positives, false negatives and true
    negatives.
    """
    return (tp + tn) / (tp + fp + fn + tn)

# 70 + 981070 correct predictions out of 1,000,000 in total.
assert accuracy(tp=70, fp=4930, fn=13930, tn=981070) == 0.98114

In [11]:
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were correct.

    Computed as tp / (tp + fp). ``fn`` and ``tn`` are accepted so that all
    metric functions share one signature, but they are not used here.

    Returns 0.0 when there are no positive predictions (tp + fp == 0)
    instead of raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # No positive predictions at all: the ratio is undefined, report 0.0.
        return 0.0
    return tp / predicted_positive

assert precision(70, 4930, 13930, 981070) == 0.014

def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that were identified.

    Computed as tp / (tp + fn). ``fp`` and ``tn`` are accepted so that all
    metric functions share one signature, but they are not used here.

    Returns 0.0 when there are no actual positives (tp + fn == 0)
    instead of raising ``ZeroDivisionError``.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        # No actual positives at all: the ratio is undefined, report 0.0.
        return 0.0
    return tp / actual_positive

assert recall(70, 4930, 13930, 981070) == 0.005

def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as 2 * p * r / (p + r), where ``p`` and ``r`` come from
    ``precision`` and ``recall`` called with the same four counts.

    Returns 0.0 when precision and recall are both zero (for example when
    tp == 0) instead of raising ``ZeroDivisionError``.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    if p + r == 0:
        # Both components are zero, so the harmonic mean is undefined; use 0.0.
        return 0.0
    return 2 * p * r / (p + r)

print(f1_score(70, 4930, 13930, 981070))

0.00736842105263158
