In [1]:
import altair as alt
import numpy as np
import pandas as pd
import math
import vega
import random
import time
from IPython.display import display
import ipywidgets as widgets
import asyncio

In [2]:
# Read the raw penguin table from disk.
dataframe = pd.read_csv("./data/penguins.csv")

# Shuffle every row with a fixed seed so the batches consumed later arrive in
# a reproducible random order, then renumber the index from zero.
data = dataframe.sample(n=len(dataframe), random_state=42).reset_index(drop=True)

# Reduce each row to a two-element list: [first column, second-to-last column].
# Downstream cells use the first entry as the species key and the second as
# the measured value.
data = [[row[0], row[len(row) - 2]] for row in data.values]

In [3]:
# Load summary records from JSON; this frame is passed to getBoxPlot in a later cell.
# NOTE(review): presumably it already carries the Species/lower/q1/median/q3/upper/outliers
# fields that getBoxPlot encodes — confirm against ./data/summary.json.
data2 = pd.read_json("./data/summary.json")

In [4]:
def getBoxPlot(summaryData):
    """Build a layered, horizontal box plot with one row per species.

    The encodings read these fields from ``summaryData``: ``Species`` (row and
    bar colour), ``lower``/``upper`` (whisker rule), ``q1``/``q3`` (box),
    ``median`` (white tick) and ``outliers`` (flattened, one point per entry).

    Returns:
        The four layers combined with ``+`` in drawing order: whisker rule,
        box, median tick, outlier points.
    """
    # Every layer shares the same data source and the same y encoding.
    perSpecies = alt.Chart(summaryData).encode(y="Species:N")

    # Whiskers: a thin rule spanning lower..upper.
    whiskers = perSpecies.mark_rule().encode(x="lower:Q", x2="upper:Q")

    # Box: a fixed-height bar spanning q1..q3, coloured by species.
    box = perSpecies.mark_bar(height=14).encode(
        x="q1:Q",
        x2="q3:Q",
        color="Species:N",
    )

    # Median: a white tick as tall as the box so it reads as a cut through it.
    medianTick = perSpecies.mark_tick(color="white", size=14).encode(x="median:Q")

    # Outliers: flatten the per-species list so each value becomes its own point.
    outlierPoints = (
        perSpecies.mark_point()
        .transform_flatten(['outliers'])
        .encode(x="outliers:Q")
    )

    return whiskers + box + medianTick + outlierPoints

In [5]:
def quantileSorted(arrSorted, q):
    """Estimate the ``q``-quantile of an ascending-sorted sequence.

    The fractional position ``len(arrSorted) * q - 0.5`` is computed and the
    elements at its ceiling and floor indices are averaged (a plain mean, not
    weighted by the fractional part).

    Args:
        arrSorted: Non-empty sequence of numbers, sorted ascending.
        q: Requested quantile; values near 0 or 1 resolve to the first or
            last element respectively.

    Returns:
        The mean of the two neighbouring elements around the position.

    Raises:
        IndexError: If ``arrSorted`` is empty.
    """
    length = len(arrSorted)
    base = length * q - 0.5
    lastIndex = length - 1
    # Clamp both neighbours into [0, lastIndex]. Without this, a position
    # below zero floors to -1 (silently wrapping to the LAST element) and a
    # position above lastIndex raises IndexError, e.g. q=0.75 on one element.
    hiIndex = max(0, min(lastIndex, math.ceil(base)))
    loIndex = max(0, min(lastIndex, math.floor(base)))
    return (arrSorted[hiIndex] + arrSorted[loIndex]) / 2

In [6]:

def calculateSummary(key, sortedArr):
    """Compute box-plot statistics for one group of values.

    Quartiles come from ``quantileSorted``. Values outside the fences
    ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr`` are reported as outliers; the
    whiskers (``lower``/``upper``) are the smallest and largest values that
    lie inside the fences.

    Args:
        key: Group label, stored under ``"Species"`` in the result.
        sortedArr: Values of the group, sorted ascending (may be empty).

    Returns:
        A dict with keys ``Species``, ``lower``, ``q1``, ``median``, ``q3``,
        ``upper`` and ``outliers``. ``outliers`` is always a list; for an
        empty group every statistic is 0 and ``outliers`` is ``[]``.
    """
    if len(sortedArr) <= 0:
        return {
            "Species": key,
            "lower": 0,
            "q1": 0,
            "median": 0,
            "q3": 0,
            "upper": 0,
            # Keep the same type as the non-empty branch (a list), so the
            # chart's flatten transform sees an empty array, not a scalar.
            "outliers": [],
        }
    q1 = quantileSorted(sortedArr, 0.25)
    median = quantileSorted(sortedArr, 0.5)
    q3 = quantileSorted(sortedArr, 0.75)

    iqr = q3 - q1
    top = q3 + 1.5 * iqr
    bottom = q1 - 1.5 * iqr

    inliers = []
    outliers = []
    for curNum in sortedArr:
        if (curNum < bottom) or (curNum > top):
            outliers.append(curNum)
        else:
            inliers.append(curNum)

    # Whiskers must be derived from in-fence values only. Seeding them with
    # sortedArr[0] would make a low outlier the lower whisker.
    # Fall back to the quartiles in the (defensive) case of no inliers.
    lower = min(inliers) if inliers else q1
    upper = max(inliers) if inliers else q3

    return {
        "Species": key,
        "lower": lower,
        "q1": q1,
        "median": median,
        "q3": q3,
        "upper": upper,
        "outliers": outliers,
    }


In [7]:
def group(data):
    """Bucket ``[species, value]`` pairs by species.

    Args:
        data: Iterable of items whose element 0 is the species name and whose
            element 1 is the value to collect.

    Returns:
        A dict with the keys 'Adelie', 'Chinstrap' and 'Gentoo' (always
        present, possibly empty), each mapping to the values seen for that
        species in input order.

    Raises:
        KeyError: If an item names a species other than the three above.
    """
    buckets = {name: [] for name in ('Adelie', 'Chinstrap', 'Gentoo')}
    for row in data:
        buckets[row[0]].append(row[1])
    return buckets

In [8]:
def mergeSortedData(oldData, newData):
    """Merge two ascending sequences into a single ascending list.

    If either input is empty the other one is returned as-is (the same
    object, not a copy). Otherwise a new list is built; on ties the element
    from ``oldData`` is placed first.

    Args:
        oldData: Ascending-sorted sequence.
        newData: Ascending-sorted sequence.

    Returns:
        The merged ascending list, or one of the inputs when the other is
        empty.
    """
    newCount = len(newData)
    if not newCount:
        return oldData
    oldCount = len(oldData)
    if not oldCount:
        return newData

    merged = []
    i = j = 0
    # Standard two-pointer merge while both inputs still have elements.
    while i < oldCount and j < newCount:
        if oldData[i] <= newData[j]:
            merged.append(oldData[i])
            i += 1
        else:
            merged.append(newData[j])
            j += 1

    # Exactly one input is exhausted here, so one of these slices is empty
    # and the other holds the remaining (already ordered) tail.
    merged.extend(oldData[i:])
    merged.extend(newData[j:])
    return merged


In [9]:
mergeSortedData([1, 3, 4], [2, 6])

[1, 2, 3, 4, 6]

In [10]:

    
def mergeData(dataMerged, dataSegment):
    """Fold one batch of grouped values into the running sorted groups.

    For every key of ``dataMerged`` the matching list in ``dataSegment`` is
    sorted in place and merged into the accumulated list with
    ``mergeSortedData``.

    Args:
        dataMerged: Dict of key -> ascending list accumulated so far.
            Updated in place.
        dataSegment: Dict of key -> list of new values. Must contain every
            key of ``dataMerged``; its lists are sorted in place.

    Returns:
        The same ``dataMerged`` dict, after the update.
    """
    for species, accumulated in dataMerged.items():
        incoming = dataSegment[species]
        incoming.sort()  # the merge step requires both sides to be ordered
        dataMerged[species] = mergeSortedData(accumulated, incoming)
    return dataMerged


In [11]:
# Bordered, fixed-height output area; the streaming loop in a later cell
# renders each refreshed chart into it.
out = widgets.Output(
    layout={
        'border': '1px solid black',
        "height": '200px',
    },
)
# Bare last expression so the widget itself is shown as this cell's output.
out

Output(layout=Layout(border='1px solid black', height='200px'))

In [12]:
def summaryData(dataGrouped):
    """Compute one box-plot summary per group.

    Args:
        dataGrouped: Dict of key -> ascending-sorted list of values.

    Returns:
        A list with one ``calculateSummary`` result per key, in the dict's
        iteration order.
    """
    # Read from the argument, not from a notebook-level global, so the result
    # depends only on what the caller passes in.
    return [
        calculateSummary(key, masses)
        for key, masses in dataGrouped.items()
    ]

In [13]:
# Number of rows consumed per refresh of the chart.
batchSize = 30
itrNum = math.ceil(len(data) / batchSize)
# Running, per-species sorted values; reset here so re-running the cell
# starts the stream from scratch.
dataMerged = {'Adelie': [], 'Chinstrap': [], 'Gentoo': []}

for itr in range(itrNum):
    # Take the next batch, bucket it by species and fold it into the totals.
    dataSegment = data[itr * batchSize: (itr+1) * batchSize]
    dataGrouped = group(dataSegment)
    dataMerged = mergeData(dataMerged, dataGrouped)

    # Wrap the fresh per-species summaries as inline chart data.
    summary = alt.Data(values=summaryData(dataMerged))

    time.sleep(1)  # pace the updates so each intermediate plot is visible
    with out:
        # wait=True defers the clear until the new chart is ready, so the
        # widget is replaced in one step instead of blanking between frames.
        out.clear_output(wait=True)
        display(getBoxPlot(summary))

In [14]:
# Build the box plot from the summary frame loaded from ./data/summary.json.
a = getBoxPlot(data2)
# NOTE(review): this is a bare attribute access, so the cell shows the bound
# method object rather than the chart or its repr string; it looks like
# leftover debugging — confirm whether `a` or `repr(a)` was intended.
a.__repr__

<bound method TopLevelMixin.__repr__ of alt.LayerChart(...)>

In [15]:
# Render the layered chart built from data2 as this cell's rich output.
display(a)