In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from ipywidgets import interact
import ipywidgets as widgets
import warnings
from stock_segmentation import *

# Install a blanket 'ignore' filter: every warning raised after this point is
# suppressed for the rest of the session, including ones that may matter.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

Global settings:

In [2]:
# strptime/strftime pattern for the date strings: month/day/four-digit year.
date_format = "%m/%d/%Y"

# Location of the input CSV. To work from a local copy instead, point this
# at "./stock_data_cim.csv".
data_url = "https://dl.dropbox.com/s/vkomzp2p1jiu8zv/stock_data.csv"

Read the data (dates must be in the format "mm/dd/yyyy"):

In [3]:
# Load the table; the first CSV column becomes the index and is expected to
# hold date strings matching `date_format`.
df = pd.read_csv(data_url, index_col=0)

# Parse each index label into a datetime, then express every row as a whole
# number of days elapsed since the first row.
dates = [dt.datetime.strptime(label, date_format) for label in df["Open"].index]
df["DayIndex"] = [(day - dates[0]).days for day in dates]

# Regression inputs: day offsets (as floats) against opening prices.
x = df["DayIndex"].values.astype(float)
y = df["Open"].values

Optimal smoothing parameter:

In [4]:
# Search for the smoothing parameter; element [0] of the helper's result is
# unpacked into (smooth_min, smooth_opt, smooth_max). The recorded run took
# roughly 30 s of wall time, hence the %time magic.
# NOTE(review): the meaning of the positional 1000 and of granularity_penalty
# is defined in stock_segmentation, not here — confirm before tuning.
%time smooth_min, smooth_opt, smooth_max = optimal_piecewise_linear(x, y, 1000, granularity_penalty=0.05, verbose=True)[0]

Estimating minimal smoothing. Step 2, smoothing 20000
Estimating minimal smoothing. Step 4, smoothing 133333.33333333334
Estimating minimal smoothing. Step 6, smoothing 59259.25925925926
Estimating minimal smoothing. Step 8, smoothing 26337.448559670782
Minimal smoothing: 26337.448559670782, maximal smoothing: 2948170.81
Maximal granularity: 76, maximal MAD: 12.03
Step 0. Min penalty 0.15530276229035878
Step 1. Min penalty 0.15435192394962696
CPU times: user 29.9 s, sys: 0 ns, total: 29.9 s
Wall time: 29.9 s


In [5]:
# Plot the result for the automatically selected smoothing (presumably the
# piecewise-linear fit over the price series — see stock_segmentation).
# The trailing ';' suppresses the echo of the call's return value.
plot_regression(x, y, smooth_opt, legend_position="top_right");

Select the smoothing parameter by hand:

In [6]:
@interact(
    log_smooth=widgets.FloatSlider(
        value=np.log10(smooth_opt),
        min=np.log10(smooth_min),
        max=np.log10(smooth_max),
        step=0.01,
        continuous_update=False,
    )
)
def p(log_smooth):
    """Redraw the regression plot for the smoothing value 10**log_smooth.

    The slider works on a log10 scale between smooth_min and smooth_max and
    starts at smooth_opt; with continuous_update=False the plot is refreshed
    only when the slider is released.
    """
    smoothing = 10 ** log_smooth
    plot_regression(x, y, smoothing)

Save segments (choose your smoothing instead of *"smooth_opt"*):

In [7]:
# First pass: fit with the chosen smoothing and keep only the first element
# of the helper's result.
rs = piecewise_linear(x, y, smoothing=smooth_opt, refine=True, verbose=False)[0]

# Collect item 1 of every entry plus item 2 of the last entry
# (presumably each segment's start and the final segment's end — confirm
# against stock_segmentation).
breaking_points = [*(segment[1] for segment in rs), rs[-1][2]]

# Optimise the breakpoints, then refit using them explicitly.
breaking_points = refine_optimize(x, y, breaking_points)
rs, xs, ys = piecewise_linear(
    x,
    y,
    breaking_points=breaking_points,
    refine=True,
    verbose=False,
)

In [8]:
# One row per segment: the two price columns come from `ys`; the date columns
# are built by turning consecutive `xs` day offsets back into calendar dates
# relative to the first date in the data.
res_df = pd.DataFrame(
    np.array(ys),
    columns=["price_start", "price_end"],
).assign(
    date_start=[
        (dates[0] + dt.timedelta(days=offset)).strftime(date_format)
        for offset in xs[:-1]
    ],
    date_end=[
        (dates[0] + dt.timedelta(days=offset)).strftime(date_format)
        for offset in xs[1:]
    ],
)

# Persist the table (the row index is written as the first CSV column), then
# display it as the cell output.
res_df.to_csv("stock_segments.csv")
res_df

Unnamed: 0,price_start,price_end,date_start,date_end
0,36.580108,43.044155,08/08/1997,10/07/1997
1,43.044155,35.473582,10/07/1997,11/12/1997
2,35.473582,22.614718,11/12/1997,02/05/1998
3,22.614718,20.66247,02/05/1998,07/28/1998
4,20.66247,6.290056,07/28/1998,03/11/1999
5,6.290056,10.898328,03/11/1999,04/30/1999
6,10.898328,8.619993,04/30/1999,01/31/2000
7,8.619993,18.55155,01/31/2000,06/26/2001
8,18.55155,19.871284,06/26/2001,02/11/2002
9,19.871284,29.030754,02/11/2002,12/01/2003
