<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Statistics (2)

Dr. Yves J. Hilpisch

The Python Quants GmbH

<a href='http://fpq.io'>http://fpq.io</a> | <a href='mailto:team@tpq.io'>team@tpq.io</a>

## Principal Component Analysis

In [None]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from pandas_datareader import data as web
from sklearn.decomposition import KernelPCA

### DAX Index and its 30 Stocks

In [None]:
# Ticker symbols: the 30 DAX stocks (each with the '.DE' suffix)
# followed by the DAX index itself ('^GDAXI') as the last entry.
symbols = [ticker + '.DE' for ticker in (
    'ADS ALV BAS BAYN BEI BMW CBK CON DAI DB1 '
    'DBK DPW DTE EOAN FME FRE HEI HEN3 IFX LHA '
    'LIN LXS MRK MUV2 RWE SAP SDF SIE TKA VOW3').split()]
symbols.append('^GDAXI')

In [None]:
%%time
# Load the cached price data if available; otherwise download and cache it.
try:
    # the context manager closes the store on exit; a bare ``h5.close``
    # (no parentheses) would only reference the method and leave it open
    with pd.HDFStore('dax.h5') as h5:
        data = h5['dax']
except (KeyError, OSError):
    # KeyError: the store has no 'dax' node yet; OSError: file cannot be opened
    data = pd.DataFrame()
    for sym in symbols:
        # closing prices, one column per symbol
        data[sym] = web.DataReader(sym, data_source='yahoo')['Close']
    # keep only the dates for which every symbol has a value
    data = data.dropna()
    with pd.HDFStore('dax.h5') as h5:
        h5['dax'] = data

In [None]:
# move the index column out of ``data`` into its own one-column DataFrame
dax = data.pop('^GDAXI').to_frame()

In [None]:
# preview: first five rows of the first six columns
data.iloc[:5, :6]

### Applying PCA

In [None]:
def scale_function(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    return centered / x.std()

In [None]:
# kernel PCA with default settings, fitted to the column-wise standardized data
pca = KernelPCA()
pca = pca.fit(data.apply(scale_function))

In [None]:
# how many entries the fitted model's ``lambdas_`` array holds
len(pca.lambdas_)

In [None]:
# first ten ``lambdas_`` values, rounded to whole numbers
pca.lambdas_[:10].round()

In [None]:
def get_we(x):
    """Return ``x`` divided by its sum, i.e. relative weights that add up to one."""
    total = x.sum()
    return x / total

In [None]:
# relative weight of each of the first ten ``lambdas_`` values
get_we(pca.lambdas_)[:10]

In [None]:
# combined relative weight of the first five ``lambdas_`` values
get_we(pca.lambdas_)[:5].sum()

### Constructing a PCA Index

In [None]:
# Refit with a single component on the standardized data and store the
# projection as column 'PCA_1' next to the index values in ``dax``.
pca = KernelPCA(n_components=1).fit(data.apply(scale_function))
# NOTE(review): the model is fit on standardized data, but transform() gets
# the negated *raw* data; the minus sign presumably flips the component's
# orientation to match the index -- confirm both are intended
dax['PCA_1'] = pca.transform(-data)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# standardize every column of ``dax`` so the series share one scale, then plot
dax.apply(scale_function).plot(figsize=(8, 4))

In [None]:
# Five-component model: fit on the standardized data, project ``data`` and
# combine the five component series into a single column 'PCA_5'.
pca = KernelPCA(n_components=5).fit(data.apply(scale_function))
# NOTE(review): transform() receives the raw (unstandardized) data -- confirm
pca_components = pca.transform(data)
# normalize ``lambdas_`` so the values sum to one and can serve as weights
weights = get_we(pca.lambdas_)
# weighted sum of the component series
dax['PCA_5'] = np.dot(pca_components, weights)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# standardize every column of ``dax`` so the series share one scale, then plot
dax.apply(scale_function).plot(figsize=(8, 4))

In [None]:
import matplotlib as mpl
# convert the index timestamps of ``data`` to matplotlib date numbers
# NOTE(review): ``mpl.dates`` is only reachable if matplotlib.dates was already
# imported (presumably via the earlier pyplot import) -- confirm
mpl_dates = mpl.dates.date2num(data.index.to_pydatetime())
mpl_dates

In [None]:
# Scatter of the index values against PCA_5, points colored by date number.
plt.figure(figsize=(8, 4))
plt.scatter(dax['PCA_5'], dax['^GDAXI'], c=mpl_dates, cmap=plt.cm.coolwarm)
# first-degree least-squares fit of '^GDAXI' on 'PCA_5',
# evaluated at the observed PCA_5 values
lin_reg = np.polyval(np.polyfit(dax['PCA_5'],
                                dax['^GDAXI'], 1),
                                dax['PCA_5'])
plt.plot(dax['PCA_5'], lin_reg, 'r', lw=3)
plt.grid(True)
plt.xlabel('PCA_5')
plt.ylabel('^GDAXI')
# color bar labeled with dates: a tick every 250 days, formatted day/month/year
plt.colorbar(ticks=mpl.dates.DayLocator(interval=250),
                format=mpl.dates.DateFormatter('%d %b %y'))

In [None]:
# split point for the two sub-period regressions
cut_date = '2011/7/1'
# observations strictly before the cut date
early_pca = dax.loc[dax.index < cut_date, 'PCA_5']
# linear fit of the index on PCA_5 for the early period, evaluated in-sample
early_reg = np.polyval(
    np.polyfit(early_pca, dax.loc[dax.index < cut_date, '^GDAXI'], 1),
    early_pca)

In [None]:
# observations on or after the cut date
late_pca = dax.loc[dax.index >= cut_date, 'PCA_5']
# linear fit of the index on PCA_5 for the late period, evaluated in-sample
late_reg = np.polyval(
    np.polyfit(late_pca, dax.loc[dax.index >= cut_date, '^GDAXI'], 1),
    late_pca)

In [None]:
# Same date-colored scatter as before, now with one regression line per
# sub-period (before / from the cut date) instead of a single line.
plt.figure(figsize=(8, 4))
plt.scatter(dax['PCA_5'], dax['^GDAXI'], c=mpl_dates, cmap=plt.cm.coolwarm)
plt.plot(early_pca, early_reg, 'r', lw=3)
plt.plot(late_pca, late_reg, 'r', lw=3)
plt.grid(True)
plt.xlabel('PCA_5')
plt.ylabel('^GDAXI')
# color bar labeled with dates: a tick every 250 days, formatted day/month/year
plt.colorbar(ticks=mpl.dates.DayLocator(interval=250),
                format=mpl.dates.DateFormatter('%d %b %y'))

## Bayesian Regression &mdash; Basic Example (1)

In [None]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [None]:
# Fix the seed so the simulated data set (and everything derived from it)
# is identical on every Restart & Run All.
np.random.seed(1000)
# 500 evenly spaced points on [0, 10]
x = np.linspace(0, 10, 500)
# noisy line: intercept 4, slope 2, Gaussian noise with standard deviation 2
y = 4 + 2 * x + np.random.standard_normal(len(x)) * 2

In [None]:
# scatter plot of the simulated sample
plt.figure(figsize=(10, 6))
plt.scatter(x, y, marker='o')
plt.gca().set(xlabel='x', ylabel='y')
plt.grid(True);

In [None]:
# ordinary least-squares line through (x, y); coefficients come back
# highest degree first, i.e. [slope, intercept]
reg = np.polyfit(x, y, deg=1)

In [None]:
# fitted coefficients of the first-degree polynomial
reg

In [None]:
# sample together with the fitted regression line
plt.figure(figsize=(10, 6))
plt.scatter(x, y, marker='o')
# evaluate the fitted polynomial (slope * x + intercept) at the sample points
plt.plot(x, np.polyval(reg, x), 'r', lw=3.0)
plt.gca().set(xlabel='x', ylabel='y')
plt.grid(True);

In [None]:
from pymc3 import Model, Normal, Uniform, HalfNormal

In [None]:
# model specifications in PyMC3 are wrapped in a with-statement
with Model() as model:
    # priors for intercept, slope and noise scale
    alpha = Normal('alpha', mu=0, sd=20)
    beta = Normal('beta', mu=0, sd=20)
    sigma = HalfNormal('sigma', sd=1)

    # linear regression: expected value of y given x
    y_est = alpha + beta * x

    # likelihood of the observed y values
    y_obs = Normal('y', mu=y_est, sd=sigma, observed=y)

In [None]:
from pymc3 import find_MAP

In [None]:
# maximum a posteriori (MAP) estimate for ``model``
map_estimate = find_MAP(model=model)

In [None]:
# show the MAP estimate
map_estimate

In [None]:
from pymc3 import NUTS, sample

In [None]:
# inference
with model: 
    start = find_MAP()
      # find starting value by optimization
    step = NUTS(state=start)
      # instantiate MCMC sampling algorithm
    trace = sample(250, step, start=start, progressbar=True)
      # draw 250 posterior samples using NUTS sampling

In [None]:
# first element of the trace
trace[0]

In [None]:
# second element of the trace
trace[1]

In [None]:
from pymc3 import traceplot

In [None]:
# trace plot with reference lines at alpha=4, beta=2, sigma=2
# (presumably the values used to simulate the data -- verify against the data cell)
traceplot(trace, lines={'alpha': 4, 'beta': 2, 'sigma': 2});

In [None]:
# Sample together with one regression line per posterior draw.
plt.figure(figsize=(10, 6))
plt.scatter(x, y, marker='o', alpha=0.5)
plt.xlabel('x')
plt.ylabel('y')
# look the sample arrays up once instead of on every loop iteration
alpha_samples, beta_samples = trace['alpha'], trace['beta']
for i in range(len(trace)):
    plt.plot(x, alpha_samples[i] + beta_samples[i] * x)

In [None]:
# Sample together with one regression line per posterior draw,
# zoomed in on x in [0, 2] and y in [0, 10].
plt.figure(figsize=(10, 6))
plt.scatter(x, y, marker='o', alpha=0.5)
plt.xlabel('x')
plt.ylabel('y')
# look the sample arrays up once instead of on every loop iteration
alpha_samples, beta_samples = trace['alpha'], trace['beta']
for i in range(len(trace)):
    plt.plot(x, alpha_samples[i] + beta_samples[i] * x)
plt.xlim(0, 2)
plt.ylim(0, 10)

## Bayesian Regression &mdash; Basic Example (2)

Cf. http://pymc-devs.github.io/pymc3/notebooks/getting_started.html

### The Data

In [None]:
# initialize NumPy's global random number generator with a fixed seed
# so the simulated data below is reproducible
np.random.seed(123)

In [None]:
# ground-truth parameter values used to simulate the data set
alpha = 1
sigma = 1
beta = [1, 2.5]

In [None]:
# size of dataset (number of simulated observations)
size = 100

In [None]:
# two predictor variables on evenly spaced grids:
# X1 covers [0, 1], X2 covers [0, 0.2]
X1 = np.linspace(0, 1, num=size)
X2 = np.linspace(0, 0.2, num=size)

In [None]:
# simulate the outcome variable: linear in X1 and X2 plus
# Gaussian noise scaled by sigma
Y = alpha + beta[0]*X1 + beta[1]*X2 + np.random.randn(size)*sigma

In [None]:
# outcome against each predictor, one panel per predictor (stacked vertically)
plt.subplot(211)
plt.plot(X1, Y, '.')
plt.subplot(212)
plt.plot(X2, Y, '.')
# x-limits of the lower panel set to (0, 1)
# (presumably to match the range of the upper panel -- confirm)
plt.xlim(0, 1);

### The Model

In [None]:
from pymc3 import Model, Normal, HalfNormal

In [None]:
with Model() as basic_model:
    # priors of the unknown parameters: intercept, two slopes, noise scale
    alpha = Normal('alpha', mu=0, sd=10)
    beta = Normal('beta', mu=0, sd=10, shape=2)
    sigma = HalfNormal('sigma', sd=1)

    # linear predictor: expected value of the outcome given X1 and X2
    mu = alpha + beta[0] * X1 + beta[1] * X2

    # likelihood (sampling distribution) of the observed outcomes Y
    Y_obs = Normal('Y_obs', mu=mu, sd=sigma, observed=Y)

### Model Fitting

In [None]:
from pymc3 import find_MAP

In [None]:
# timed MAP estimate using find_MAP's default optimizer
%time map_estimate = find_MAP(model=basic_model)

In [None]:
# show the MAP estimate
print(map_estimate)

In [None]:
from scipy import optimize

In [None]:
# timed MAP estimate again, this time with SciPy's Powell optimizer
%time map_estimate = find_MAP(model=basic_model, fmin=optimize.fmin_powell)

In [None]:
# show the MAP estimate
print(map_estimate)

### Sampling 

In [None]:
from pymc3 import NUTS, sample, trace

In [None]:
%%time
# Posterior sampling for ``basic_model``; rebinds the name ``trace`` to the result.
with basic_model:
    # obtain starting values via MAP (Powell optimizer)
    start = find_MAP(fmin=optimize.fmin_powell)
    
    # instantiate the NUTS sampler, scaled at the MAP point
    step = NUTS(scaling=start)
    
    # draw 250 posterior samples, starting from the MAP point
    trace = sample(250, step, start=start)

In [None]:
from pymc3 import traceplot

In [None]:
# trace plot of the sampled variables
traceplot(trace);

### Summary

In [None]:
from pymc3 import summary

In [None]:
# names of the variables stored in the trace
trace.varnames

In [None]:
# summary of the trace
summary(trace)

## Bayesian Regression &mdash; Stochastic Volatility

In [None]:
from pandas_datareader import data as web

In [None]:
# Load the cached '^GSPC' data if available; otherwise download and cache it.
try:
    # the context manager closes the store on exit; a bare ``h5.close``
    # (no parentheses) would only reference the method and leave it open
    with pd.HDFStore('spx.h5') as h5:
        data = h5['spx']
except (KeyError, OSError):
    # KeyError: the store has no 'spx' node yet; OSError: file cannot be opened
    data = web.DataReader('^GSPC', data_source='yahoo',
                          start='2008-1-1', end='2009-12-31')
    with pd.HDFStore('spx.h5') as h5:
        h5['spx'] = data

In [None]:
# closing values over the sample period
data['Close'].plot(figsize=(10, 6));

In [None]:
# Log returns of the closing values. The first observation has no predecessor,
# so shift(1) yields NaN there; drop it so that no missing value is passed on
# as observed data or counted in the length of the volatility process.
rets = np.log(data['Close'] / data['Close'].shift(1)).dropna()

In [None]:
# histogram of the log returns with 35 bins
rets.hist(bins=35);

In [None]:
from pymc3 import Exponential, StudentT, exp, Deterministic

In [None]:
from pymc3.distributions.timeseries import GaussianRandomWalk

In [None]:
%%time
# Stochastic volatility model for the log returns.
with Model() as sp500_model:
    # Exponential prior with parameter 1/10, test value 5.0;
    # passed below as the first argument of the StudentT likelihood
    nu = Exponential('nu', 1./10, testval=5.0)
    # Exponential prior with parameter 1/0.02, test value 0.1
    sigma = Exponential('sigma', 1./.02, testval=.1)
    # latent random walk with one value per return
    # NOTE(review): sigma**-2 is presumably the precision of the walk's
    # increments -- confirm against the GaussianRandomWalk signature
    s = GaussianRandomWalk('s', sigma**-2, shape=len(rets))
    # tracked deterministic transform exp(-2*s) of the latent walk
    volatility_process = Deterministic('volatility_process', exp(-2*s))
    # Student-t likelihood of the returns with lam = 1/volatility_process
    r = StudentT('r', nu, lam=1/volatility_process, observed=rets)

In [None]:
%%time
# Two-stage sampling: a first short NUTS run, then a second run that is
# re-scaled at and started from the last sample of the first.
with sp500_model:
    # MAP estimate over the latent walk ``s`` only, using L-BFGS-B
    start = find_MAP(vars=[s], fmin=optimize.fmin_l_bfgs_b)
    
    # first run: 100 draws with NUTS scaled at the MAP point
    step = NUTS(scaling=start)
    trace = sample(100, step, progressbar=True)
    
    # start next run at the last sample's position
    step = NUTS(scaling=trace[-1], gamma=0.25)
    # second run: 200 draws in a single job; ``trace`` is replaced by this run
    trace = sample(200, step, start=trace[-1], progressbar=True, njobs=1)

In [None]:
# trace plot restricted to nu and sigma
traceplot(trace, [nu, sigma]);

In [None]:
# Log returns overlaid with sampled paths derived from the latent process.
fig, ax = plt.subplots(figsize=(15, 8))
rets.plot(ax=ax, alpha=0.5)
# every 30th sample of ``s``, transformed as 1 / exp(s); transposed so that
# each sample is drawn as its own line over the return dates
ax.plot(rets.index, 1 / np.exp(trace['s', ::30].T), 'r', alpha=.3);
ax.set(title='volatility process', xlabel='time', ylabel='volatility');
ax.legend(['S&P500', 'stochastic volatility process']);

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="mailto:yves@tpq.io">yves@tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="http://hilpisch.com" target="_blank">http://hilpisch.com</a> 

**Quant Platform** &mdash; <a href="http://quant-platform.com" target="_blank">http://quant-platform.com</a>

**Python for Finance** &mdash; <a href="http://python-for-finance.com" target="_blank">http://python-for-finance.com</a>

**Derivatives Analytics with Python** &mdash; <a href="http://derivatives-analytics-with-python.com" target="_blank">http://derivatives-analytics-with-python.com</a>

**Python Trainings** &mdash; <a href="http://training.tpq.io" target="_blank">http://training.tpq.io</a>