# Predicting stock price moves with Logistic Regression

## Imports & Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pathlib import Path
import sys, os
from time import time

import pandas as pd
import numpy as np

from scipy.stats import spearmanr

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
pip install utils

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Add the parent of the first sys.path entry to the module search path
# (at position 1) so that modules located one directory up can be imported.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [5]:
# Sequential data such as time series needs a purpose-built cross-validation
# scheme: each test window must lie strictly after its training window so that
# no look-ahead bias or leakage is introduced.
class MultipleTimeSeriesCV:
    """Walk-forward cross-validator for panel data indexed by symbol and date.

    Generates ``(train_idx, test_idx)`` pairs of positional row indices.
    Assumes the MultiIndex of ``X`` contains a level named 'date' (and, by
    convention, 'symbol'). Overlapping outcomes are purged by leaving
    ``lookahead - 1`` days between the end of each training window and the
    start of its test window.

    Parameters
    ----------
    n_splits : int
        Number of train/test pairs to generate.
    train_period_length : int
        Base length of each training window in unique dates; the window
        actually spans ``train_period_length + lookahead - 1`` dates.
    test_period_length : int
        Length of each test window in unique dates.
    lookahead : int
        Forecast horizon in periods. Must be set before calling ``split``.
    shuffle : bool
        If True, the training indices of every split are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` pairs, most recent test window first.

        ``X`` must carry a MultiIndex with a 'date' level. ``y`` and ``groups``
        are accepted for scikit-learn compatibility and ignored. The yielded
        values are pandas Index objects holding row positions of ``X``.

        Raises
        ------
        TypeError
            If ``lookahead`` was left at ``None``.
        IndexError
            If ``X`` has too few unique dates for the requested windows.
        """
        if self.lookahead is None:
            raise TypeError('lookahead must be an integer number of periods, '
                            'got None')

        unique_dates = X.index.get_level_values('date').unique()
        # Newest date first: every window offset below counts backwards from
        # the most recent observation.
        days = sorted(unique_dates, reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            # Leave lookahead - 1 days between training and test data.
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = (train_end_idx + self.train_length
                               + self.lookahead - 1)
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        # One row per observation of X in its original order, so the resulting
        # integer index can be used positionally (e.g. with .iloc).
        dates = X.reset_index()[['date']]
        for train_start, train_end, test_start, test_end in split_idx:
            # Windows are half-open: (start date, end date].
            train_idx = dates[(dates['date'] > days[train_start])
                              & (dates['date'] <= days[train_end])].index
            test_idx = dates[(dates['date'] > days[test_start])
                             & (dates['date'] <= days[test_end])].index
            if self.shuffle:
                # Re-order through a permutation; shuffling a temporary list
                # copy would leave train_idx untouched.
                train_idx = train_idx[np.random.permutation(len(train_idx))]

            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; all arguments are ignored."""
        return self.n_splits

In [6]:
# Use seaborn's 'darkgrid' theme for the plots in this notebook.
sns.set_style('darkgrid')
# Shorthand for label-based slicing of MultiIndex data.
idx = pd.IndexSlice

In [7]:
# Presumably the number of trading days per year; used below to size the
# number of cross-validation splits.
YEAR = 252

## Load Data

In [8]:
# Read the stored model data, discarding incomplete rows.
with pd.HDFStore('data.h5') as store:
    data = store['model_data'].dropna()

# Remove the raw OHLC price columns, then every column whose name mentions
# 'year' or 'lag'.
data = data.drop(columns=['open', 'close', 'low', 'high'])
data = data.drop(columns=[name for name in data.columns
                          if 'year' in name or 'lag' in name])

### Select Investment Universe

In [9]:
# Keep only observations whose dollar-volume rank is below 100.
data = data.loc[data['dollar_vol_rank'] < 100]

### Create Model Data

In [10]:
# Outcomes: every column whose name contains 'target'.
y = data.filter(like='target')
# Features: everything else, minus the volume-based columns and one dummy.
X = data.drop(columns=[*y.columns,
                       'dollar_vol', 'dollar_vol_rank', 'volume',
                       'consumer_durables'])

## Logistic Regression

### Define cross-validation parameters

In [11]:
# Window sizes (in unique dates) and forecast horizon for the walk-forward CV.
train_period_length = 63
test_period_length = 10
lookahead = 1
# As many test windows as fit into three YEARs of data.
n_splits = (3 * YEAR) // test_period_length

cv = MultipleTimeSeriesCV(
    n_splits=n_splits,
    train_period_length=train_period_length,
    test_period_length=test_period_length,
    lookahead=lookahead,
)

In [12]:
# Name of the outcome column that matches the chosen forecast horizon.
target = f'target_{lookahead}d'

In [13]:
# Binary label: 1 when the target column is positive, 0 otherwise.
is_positive = y[target].gt(0)
y.loc[:, 'label'] = is_positive.astype(int)
y['label'].value_counts()

1    56486
0    53189
Name: label, dtype: int64

In [14]:
# Grid of inverse regularization strengths: 11 log-spaced values, 1e-5 .. 1e5.
Cs = np.logspace(start=-5, stop=5, num=11)
Cs

array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])

In [15]:
# Column names for the per-fold score records collected during cross-validation.
cols = ['C', 'date', 'auc', 'ic', 'pval']

### Run cross-validation

In [None]:
%%time
# For every regularization strength, run the walk-forward cross-validation and
# collect per-fold scores, out-of-sample predictions and averaged coefficients.
log_coeffs, log_scores, log_predictions = {}, [], []
for C in Cs:
    print(C)
    # C is the inverse of the regularization strength (positive float,
    # default 1.0): the smaller C, the stronger the penalty and the more the
    # coefficients are shrunk towards zero.
    model = LogisticRegression(C=C,
                               fit_intercept=True,
                               random_state=42,
                               n_jobs=-1)

    # The penalty treats all coefficients alike, so the features are
    # standardized first to put them on a comparable scale.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)])
    ics = aucs = 0
    n_scored = 0  # folds that were actually evaluated (not skipped)
    start = time()
    coeffs = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
        X_test, y_test = X.iloc[test_idx], y.label.iloc[test_idx]
        # AUC is undefined for a single class and unreliable on tiny samples;
        # decide this before fitting so skipped folds cost nothing.
        if len(y_test) < 10 or len(np.unique(y_test)) < 2:
            continue
        X_train, y_train = X.iloc[train_idx], y.label.iloc[train_idx]
        pipe.fit(X=X_train, y=y_train)
        y_score = pipe.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_score=y_score, y_true=y_test)
        # Rank correlation between predicted probability and realized outcome.
        actuals = y[target].iloc[test_idx]
        ic, pval = spearmanr(y_score, actuals)

        log_predictions.append(y_test.to_frame('labels').assign(
            predicted=y_score, C=C, actuals=actuals))
        date = y_test.index.get_level_values('date').min()
        log_scores.append([C, date, auc, ic * 100, pval])
        coeffs.append(pipe.named_steps['model'].coef_)
        ics += ic
        aucs += auc
        n_scored += 1
        if i % 10 == 0:
            # Average over evaluated folds only; dividing by the fold counter
            # would bias the running means whenever a fold was skipped.
            print(f'\t{time()-start:5.1f} | {i:03} | {ics/n_scored:>7.2%} | {aucs/n_scored:>7.2%}')

    # Mean coefficient vector across the evaluated folds for this C.
    log_coeffs[C] = np.mean(coeffs, axis=0).squeeze()

1e-05
	 52.0 | 010 |  -0.31% |  50.42%
	 58.7 | 020 |   1.89% |  51.83%
	 65.5 | 030 |   2.84% |  52.01%
	 72.6 | 040 |   3.29% |  51.98%
	 79.7 | 050 |   3.97% |  52.44%
	 86.6 | 060 |   3.96% |  52.27%
	 94.4 | 070 |   4.73% |  52.59%
0.0001
	  8.1 | 010 |  -0.06% |  50.62%
	 17.6 | 020 |   2.23% |  52.01%
	 26.1 | 030 |   3.20% |  52.26%
	 33.4 | 040 |   3.34% |  52.08%
	 40.7 | 050 |   4.02% |  52.53%
	 47.9 | 060 |   4.02% |  52.33%
	 55.5 | 070 |   4.83% |  52.67%
0.001
	 10.4 | 010 |   0.42% |  50.96%
	 18.0 | 020 |   2.53% |  52.14%
	 26.0 | 030 |   3.58% |  52.48%
	 33.9 | 040 |   3.17% |  52.07%
	 41.4 | 050 |   3.83% |  52.49%
	 49.1 | 060 |   4.03% |  52.33%
	 56.6 | 070 |   4.88% |  52.70%
0.01
	  8.7 | 010 |   0.68% |  51.13%
	 16.5 | 020 |   2.39% |  51.97%
	 24.4 | 030 |   3.64% |  52.41%
	 32.2 | 040 |   3.12% |  51.94%
	 40.3 | 050 |   3.92% |  52.46%
	 48.8 | 060 |   4.16% |  52.30%
	 56.2 | 070 |   4.91% |  52.64%
0.1
	 20.0 | 010 |   0.65% |  51.11%
	 33.4 | 020 | 

### Evaluate Results

In [None]:
# Convert the collected results to DataFrames and persist them in the HDF5
# store. `key` is passed by keyword: positional use is deprecated in recent
# pandas releases.
log_scores = pd.DataFrame(log_scores, columns=cols)
log_scores.to_hdf('data.h5', key='logistic/scores')

# One row per C, one column per feature.
log_coeffs = pd.DataFrame(log_coeffs, index=X.columns).T
log_coeffs.to_hdf('data.h5', key='logistic/coeffs')

log_predictions = pd.concat(log_predictions)
log_predictions.to_hdf('data.h5', key='logistic/predictions')

In [None]:
# Reload the per-fold scores from the HDF5 store.
log_scores = pd.read_hdf('data.h5', key='logistic/scores')

In [None]:
# Print column dtypes and non-null counts of the score table.
log_scores.info()

In [None]:
# Summary statistics of the per-fold AUC for each value of C.
log_scores.groupby('C')['auc'].describe()

### Plot Validation Scores

In [None]:
def plot_ic_distribution(df, ax=None):
    """Plot the distribution of the information coefficient in ``df.ic``.

    Draws a density-scaled histogram with a KDE overlay, marks zero with a
    dashed vertical line and annotates the mean and median.

    Parameters
    ----------
    df : DataFrame
        Must provide an ``ic`` column.
    ax : matplotlib Axes, optional
        Axes to draw on; when omitted, the current axes are used.
    """
    # histplot replaces the deprecated distplot; kde/stat/cut follow the
    # seaborn migration guidance for reproducing its appearance (default
    # binning may differ slightly).
    ax = sns.histplot(df.ic, kde=True, stat='density',
                      kde_kws=dict(cut=3), ax=ax)
    mean, median = df.ic.mean(), df.ic.median()
    ax.axvline(0, lw=1, ls='--', c='k')
    # Text position is given in axes coordinates (upper-left corner).
    ax.text(x=.05, y=.9, s=f'Mean: {mean:8.2f}\nMedian: {median:5.2f}',
            horizontalalignment='left',
            verticalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Information Coefficient')
    sns.despine()
    plt.tight_layout()

In [None]:
# Two panels: AUC as a function of C (left) and the IC distribution for the
# best-performing C (right).
fig, axes= plt.subplots(ncols=2, figsize=(15, 5))

# Mean AUC per C across folds, drawn by seaborn.
sns.lineplot(x='C', y='auc', data=log_scores, estimator=np.mean, label='Mean', ax=axes[0])
# Mean and median AUC for each value of C.
by_alpha = log_scores.groupby('C').auc.agg(['mean', 'median'])
# Note: best_auc holds the *C value* with the highest mean AUC, not an AUC.
best_auc = by_alpha['mean'].idxmax()
# NOTE(review): xlim evaluates to (1e-5, 1e6) while the grid of C values above
# spans 1e-5 .. 1e5 - confirm the upper limit is intended.
by_alpha['median'].plot(logx=True, ax=axes[0], label='Median', xlim=(10e-6, 10e5))
# Vertical markers at the C values that maximize mean and median AUC.
axes[0].axvline(best_auc, ls='--', c='k', lw=1, label='Max. Mean')
axes[0].axvline(by_alpha['median'].idxmax(), ls='-.', c='k', lw=1, label='Max. Median')
axes[0].legend()
axes[0].set_ylabel('AUC')
axes[0].set_xscale('log')
axes[0].set_title('Area Under the Curve')

# IC distribution restricted to the folds scored with the best C.
plot_ic_distribution(log_scores[log_scores.C==best_auc], ax=axes[1])
axes[1].set_title('Information Coefficient')

fig.suptitle('Logistic Regression', fontsize=14)
sns.despine()
fig.tight_layout()
# Leave room for the suptitle; the trailing semicolon suppresses cell output.
fig.subplots_adjust(top=.9);