In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from a CSV path that is relative to the current working directory.
df = pd.read_csv('./data/metabric.csv') # METABRIC cancer data
# Preview the first rows; as the last expression of the cell it is rendered as the cell output.
df.head()

In [None]:
# Q1
# Column analysed in this question: the 'Age at Diagnosis' values of every row.
x = df['Age at Diagnosis']

# Indicator matrix built with NumPy broadcasting (no Python-level loops)
def L(x):
    """Build the indicator matrix of ``x`` against its sorted unique values.

    Parameters
    ----------
    x : array-like
        One-dimensional data, e.g. a ``pandas.Series``. Observations are
        addressed by position, so the index labels of a Series are irrelevant.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_unique, len(x))``. Entry ``[j, i]`` is 1.0
        when the i-th observation is less than or equal to the j-th smallest
        unique value and 0.0 otherwise. The mean of row ``j`` is therefore the
        empirical CDF evaluated at the j-th unique value.

    Notes
    -----
    Comparisons involving NaN are False, so a NaN observation yields a column
    of zeros and a NaN unique value (sorted last) yields a row of zeros.
    """
    # Work on the raw values: label-based ``x[i]`` breaks as soon as the
    # Series index is not exactly 0..n-1 (e.g. after filtering rows).
    values = np.asarray(x)
    Z = np.sort(pd.unique(values))  # sorted unique values of x
    # Compare every unique value (rows) with every observation (columns).
    return (values[np.newaxis, :] <= Z[:, np.newaxis]).astype(float)

# Evaluate L on the age column; as the last expression of the cell the resulting array is displayed.
L(x)

In [None]:
# Q2
def compute_quantile(data, q=0.5):
    """Return the ``q``-quantile of ``data`` using linear interpolation.

    The quantile sits at fractional position ``(n - 1) * q`` of the sorted
    values; when that position falls between two observations the result is
    interpolated linearly between them.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric data. NaN entries are ignored.
    q : float, default 0.5
        Quantile level in the closed interval [0, 1].

    Returns
    -------
    float
        The requested quantile.

    Raises
    ------
    ValueError
        If ``q`` is outside [0, 1], or if ``data`` contains no non-NaN values.
    """
    # Reject bad levels up front: a negative position would silently index
    # from the end of the array and a level above 1 would run off the end.
    if not 0.0 <= q <= 1.0:
        raise ValueError(f"q must be in [0, 1], got {q!r}")

    values = np.asarray(data, dtype=float)
    # np.sort places NaN last, which would make missing values count as the
    # largest observations; drop them before ranking.
    values = np.sort(values[~np.isnan(values)])
    if values.size == 0:
        raise ValueError("cannot compute a quantile of empty (or all-NaN) data")

    pos = (values.size - 1) * q
    lower = int(np.floor(pos))
    upper = int(np.ceil(pos))
    if lower == upper:
        # The position lands exactly on an observation.
        return values[lower]
    return values[lower] + (pos - lower) * (values[upper] - values[lower])

# Print each level next to the hand-rolled quantile and NumPy's percentile
# (which takes the level on a 0-100 scale) so the two can be compared by eye.
for level in (0.1, 0.25, 0.5, 0.75, 0.9):
    own_value = compute_quantile(df['Age at Diagnosis'], level)
    numpy_value = np.percentile(df['Age at Diagnosis'], level * 100)
    print(level, own_value, numpy_value)

In [None]:
# Q3
def calc_IQR(data, whis=1.5):
    """Return the interquartile range and the outlier fences of ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric data, passed straight to ``compute_quantile``.
    whis : float, default 1.5
        Multiple of the IQR added beyond the quartiles to place the fences.

    Returns
    -------
    list
        ``[iqr, left_whisker, right_whisker]`` where ``iqr = Q3 - Q1``,
        ``left_whisker = Q1 - whis * iqr`` and
        ``right_whisker = Q3 + whis * iqr``. The two limits are not clipped to
        the range of the data.
    """
    q1 = compute_quantile(data, 0.25)
    q3 = compute_quantile(data, 0.75)
    iqr = q3 - q1
    reach = whis * iqr  # distance from each quartile to its fence
    return [iqr, q1 - reach, q3 + reach]

# Print [IQR, lower limit, upper limit] for the age-at-diagnosis column.
print (calc_IQR(df["Age at Diagnosis"]))

In [None]:
# Q4
def five_num_summary(data, whis=1.5):
    """Return the five-number summary of ``data`` together with its whiskers.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional numeric data; must provide ``.min()`` and ``.max()``.
    whis : float, default 1.5
        Multiple of the IQR used to place the whisker limits.

    Returns
    -------
    list
        Seven values, in order:
        ``[min, whisker_low, Q1, median, Q3, whisker_high, max]``.
        ``whisker_low`` is ``Q1 - whis * IQR`` clipped so it is never below the
        minimum, and ``whisker_high`` is ``Q3 + whis * IQR`` clipped so it is
        never above the maximum. When a limit is not clipped it is the fence
        value itself, which need not coincide with an observed data point.
    """
    q1 = compute_quantile(data, 0.25)
    median = compute_quantile(data, 0.5)
    q3 = compute_quantile(data, 0.75)
    # Evaluate the extremes once instead of on every use.
    lowest, highest = data.min(), data.max()
    reach = whis * (q3 - q1)  # distance from each quartile to its fence
    whisker_low = max(q1 - reach, lowest)
    whisker_high = min(q3 + reach, highest)
    return [lowest, whisker_low, q1, median, q3, whisker_high, highest]

# Print [min, lower whisker, Q1, median, Q3, upper whisker, max] for the age-at-diagnosis column.
print(five_num_summary(df["Age at Diagnosis"]))

In [None]:
# Q5
# Summary computed over every row of the age column (all cancer types pooled).
print(five_num_summary(df["Age at Diagnosis"]))
# Box plot of the same column split by 'Cancer Type'. The numbers printed above
# are pooled over all rows, so they are not the per-type statistics of any single box.
sns.boxplot(x="Cancer Type", y="Age at Diagnosis", data=df)

In [None]:
# Q6
def find_outliers(data, whis=1.5):
    """Flag the observations that fall outside the IQR fences.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional numeric data; must support element-wise comparison
        with a scalar.
    whis : float, default 1.5
        Multiple of the IQR added beyond the quartiles to place the fences.

    Returns
    -------
    Boolean mask of the same type and length as ``data``
        True where a value is strictly below ``Q1 - whis * IQR`` or strictly
        above ``Q3 + whis * IQR``, False otherwise.
    """
    q1 = compute_quantile(data, 0.25)
    q3 = compute_quantile(data, 0.75)
    reach = whis * (q3 - q1)  # distance from each quartile to its fence
    return (data < q1 - reach) | (data > q3 + reach)

# Summarise the outliers instead of printing one True/False line per row:
# report how many values are flagged, then display the flagged values.
ages = df["Age at Diagnosis"]
outlier_mask = find_outliers(ages)
print(f"{int(outlier_mask.sum())} of {len(ages)} ages are flagged as outliers")
# Last expression of the cell: the flagged ages, keyed by their original row index.
ages[outlier_mask]