# Stock Data Task 3

## Quantile Analysis & Outlier Detection

In [124]:
import pandas as pd 
import numpy as np 
import math
from statistics import mean

First, we write functions to determine the quartiles and determine the boundaries for the outliers.

In [125]:
# A function to determine whether result of division is an integer
def is_integer (x, tol = 0.001):
    """Return True when ``x`` lies within ``tol`` of a whole number.

    The comparison is made against the *nearest* integer, so values that
    land slightly below a whole number because of floating-point error
    (e.g. 2.9999999999) are accepted just like values slightly above it.

    Parameters
    ----------
    x : float
        The number to test (typically the result of a division).
    tol : float, default 0.001
        Maximum absolute distance from the nearest integer.

    Returns
    -------
    bool
    """
    # round() picks the nearest integer; floor() would only catch values
    # that overshoot an integer, never ones that undershoot it.
    return abs(x - round(x)) < tol

In [126]:
# First or third quartile of a pandas Series
def quartiles (lt, type = "first"):
    """Return the first or third quartile of a pandas Series.

    With ``n`` non-missing observations sorted in ascending order, the
    position is ``L = n/4`` (first quartile) or ``L = 3n/4`` (third quartile):

    * if ``L`` is a whole number, the quartile is the average of the L-th and
      (L+1)-th values (1-based);
    * otherwise ``L`` is rounded up and the quartile is the L-th value.

    Parameters
    ----------
    lt : pandas.Series
        Numeric observations. Missing values are ignored.
    type : {"first", "third"}, default "first"
        Which quartile to compute.

    Returns
    -------
    scalar
        The requested quartile.

    Raises
    ------
    ValueError
        If ``type`` is not "first" or "third", or if ``lt`` holds no
        non-missing observations.
    """
    if type == "first":
        numerator = 1
    elif type == "third":
        numerator = 3
    else:
        raise ValueError("Argument 'type' must be either 'first' (for first quartile) or 'third' (for third quartile)")

    # Missing values are not observations: sort_values() would park them at
    # the end and they would inflate n, shifting the quartile position.
    lst = lt.dropna().sort_values()
    n = len(lst)
    if n == 0:
        raise ValueError("Cannot compute a quartile of a series with no non-missing values")

    # L = numerator * n / 4, kept as an exact integer quotient and remainder
    # so no floating-point tolerance is needed to decide whether L is whole.
    L, remainder = divmod(numerator * n, 4)

    if remainder == 0:
        # Since arrays are 0 indexed, Lth value is at index L-1.
        # Plain arithmetic is used instead of statistics.mean, whose result
        # type follows the inputs and can truncate for NumPy integer values.
        return (lst.iloc[L - 1] + lst.iloc[L]) / 2

    # L is fractional: rounding up gives position floor(L) + 1, i.e. index floor(L).
    return lst.iloc[L]

In [127]:
# Returns the lower and upper boundary for a given pandas series
def outlier_boundaries (ser, k = 1.5):
    """Return the lower and upper outlier fences for a pandas Series.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 come
    from this notebook's ``quartiles`` function and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    ser : pandas.Series
        The observations to analyse.
    k : float, default 1.5
        Multiplier applied to the interquartile range. The default keeps the
        original behaviour of this function.

    Returns
    -------
    list
        ``[lower_bdy, upper_bdy]``.
    """
    q1 = quartiles(ser)
    q3 = quartiles(ser, type = "third")

    iqr = q3 - q1
    lower_bdy = q1 - k * iqr
    upper_bdy = q3 + k * iqr
    return [lower_bdy, upper_bdy]

Now that we have written the necessary functions, we will use them to identify outliers for OPEN, HIGH, LOW, CLOSE in the stock market data.

In [128]:
# Load the price data, then turn the frame's index into an ordinary column
# called "Day" (the `names=` argument needs pandas >= 1.5).
stocks = (
    pd.read_csv("spx.csv")
    .reset_index(names = "Day")
)

In [129]:
# Pull the four price columns out of `stocks` as individual Series.
# NOTE(review): the name `open` shadows Python's built-in open() in this
# notebook's namespace; it is kept because later cells refer to it.
open, high, low, close = (stocks[col] for col in ("Open", "High", "Low", "Close"))

In [130]:
# Compute the [lower, upper] outlier fences for each price column in turn
# and unpack them into one pair of names per column.
(
    (open_lower_bdy, open_upper_bdy),
    (high_lower_bdy, high_upper_bdy),
    (low_lower_bdy, low_upper_bdy),
    (close_lower_bdy, close_upper_bdy),
) = (outlier_boundaries(stocks[col]) for col in ("Open", "High", "Low", "Close"))

Now that we have gotten the boundaries, we will identify outliers in the spx.csv data. 

In [131]:
# Boolean masks, one per price column: True where the value lies strictly
# below the lower boundary or strictly above the upper boundary.
ix_open = stocks["Open"].lt(open_lower_bdy) | stocks["Open"].gt(open_upper_bdy)
ix_high = stocks["High"].lt(high_lower_bdy) | stocks["High"].gt(high_upper_bdy)
ix_low = stocks["Low"].lt(low_lower_bdy) | stocks["Low"].gt(low_upper_bdy)
ix_close = stocks["Close"].lt(close_lower_bdy) | stocks["Close"].gt(close_upper_bdy)

In [132]:
# Keep every row in which at least one of the four prices is an outlier.
outliers = stocks.loc[ix_open | ix_high | ix_low | ix_close]

Now that the outliers have been identified, we will output the results to a CSV file.

In [135]:
# Write the flagged rows to disk: column names included, row index omitted.
outliers.to_csv("outliers.csv", index = False, header = True)