In [None]:
import dautil as dl
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML

In [None]:
# Formulas shown by this cell: the point-biserial correlation coefficient
# and the standard deviation s_n that appears in its denominator.
point_biserial_formula = r'r_{pb} = \frac{M_1 - M_0}{s_n} \sqrt{ \frac{n_1 n_0}{n^2}}'
std_dev_formula = r's_n = \sqrt{\frac{1}{n} \sum_{i=1}^n (X_i - \overline{X})^2}'

lr = dl.nb.LatexRenderer(chapter=3, start=21)
lr.render(point_biserial_formula)
# Kept as the final bare expression so the cell output is unchanged.
lr.render(std_dev_formula)

In [None]:
# Load the weather data, discard rows with missing values and replace the
# RAIN column by a boolean flag that is True where RAIN > 0.
df = (
    dl.data.Weather.load()
    .dropna()
    .assign(RAIN=lambda frame: frame['RAIN'] > 0)
)

# Point-biserial correlation between the boolean rain flag and temperature,
# computed once over the whole data set (used as the reference line later).
rain_flags = df['RAIN'].values
temperatures = df['TEMP'].values
stats_corr = stats.pointbiserialr(rain_flags, temperatures)

In [None]:
# Rolling point-biserial correlation between the rain flag and temperature.
# The window is N rows long (two years if the data has one row per day).
N = 2 * 365
rain = df['RAIN'].values
temp = df['TEMP'].values

# Window i covers the contiguous rows i .. i + N - 1. Plain slicing is used
# instead of np.roll(values, i)[:N]: a positive roll wraps rows from the END
# of the series into the front of the window, so the windows were neither
# contiguous in time nor aligned with the index labels assigned below.
corrs = [stats.pointbiserialr(rain[i:i + N], temp[i:i + N])[0]
         for i in range(len(df.index) - N)]

# Each correlation is labelled with the first row after its window, then the
# series is averaged per calendar year. The aggregation must be explicit:
# resample() on its own returns a Resampler object, which has neither the
# .index nor the .values that the plotting cell relies on.
# NOTE(review): recent pandas versions deprecate the 'A' alias in favour of
# 'YE' — switch if the installed pandas rejects or warns about 'A'.
corrs = pd.DataFrame(corrs,
                     index=df.index[N:],
                     columns=['Correlation']).resample('A').mean()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): presumably switches matplotlib to seaborn-like styling via
# dautil — confirm against the dautil documentation.
dl.options.mimic_seaborn()
# Context object keyed by this notebook's name.
# NOTE(review): presumably used by dautil to store/look up per-notebook
# settings — confirm.
context = dl.nb.Context('correlating_pointbiserial')
# Last expression of the cell, so the created widget object becomes the cell
# output.
dl.nb.RcWidget(context)

In [None]:
# Plot the resampled rolling correlation and overlay the whole-data-set
# correlation as a horizontal reference line spanning the same x-range.
ax = plt.gca()
years = corrs.index.values

ax.plot(years, corrs.values)
ax.hlines(stats_corr[0], years[0], years[-1],
          label='Correlation using the whole data set')
ax.set_title('Rolling Point-biserial Correlation of Rain and Temperature with a 2 Year Window')
ax.set_xlabel('Year')
ax.set_ylabel('Correlation')
ax.legend(loc='best')

# Final expression of the cell: the watermark HTML is the cell's output.
HTML(dl.report.HTMLBuilder().watermark())