In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.metrics import accuracy_score

#### Overall working flow
- The simple strategy works as follows:
    - **Logistic regression** model is run on several factors.
    - A **binary** prediction for a positive or negative return is generated on testing data.
    - **Factors** include: 
        - simple daily return; 
        - past 20 days volatility;
        - 20-day moving average return; 
        - MACD; 
        - standardized volume
    - Each factor has its lag for 3 times; overall there are **15** factors.
    - Check if all the factors are useful by comparing **accuracy scores** on individual models
    
- Strategy construction and evaluation:
    - Building a strategy based on the **scale** of positive return probability
    - Check the **evaluation metrics** on the return like average return and sharpe ratio, etc
    - Run the three-factor regression on the portfolio return to check significant **alpha**
    

#### Step1:  Preprocessing--calculating required factors

In [2]:
training_window_size = 720
rolling_window_size = 45
testing_window_size = 45

In [3]:
def MACD(x):
    k = x['last'].ewm(span=12, adjust=False).mean()
    d = x['last'].ewm(span=26, adjust=False).mean()
    macd = k - d
    # use the signal for 9 days
    macd_s = macd.ewm(span=9, adjust=False).mean()
    return macd_s

In [4]:
df = pd.read_csv('data.csv')
# check if there is missing values
print(df.isna().sum())

ticker    0
date      0
last      0
volume    0
dtype: int64


In [5]:
# calculate the factors
df["return"] = df.groupby("ticker")["last"].pct_change(1)
df["volatility"] = df.groupby("ticker")["return"].rolling(20).std().reset_index(level=0, drop=True)
df["moving_avg"] = df.groupby("ticker")["return"].rolling(20).mean().reset_index(level=0, drop=True)
df['macd'] = df.groupby("ticker").apply(MACD).reset_index(level=0, drop=True)

In [6]:
# add the lagged terms of the factors
factor_ls = ['return', 'volatility', 'macd', 'volume', 'moving_avg']
for lag in range(1, 4):
    for factor in factor_ls:
        df[f'{factor}_{lag}'] = df.groupby(['ticker'])[factor].shift(lag)
# df.head(60)

In [7]:
# remove points with na values
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
# print(df.isna().sum(), df.head())

#### Step2: Strategy model specification
- **Attempts made during this step:**
    - try all the 15 factor
    - try 5 factors of `return`, `volatility`, `volumes`, `macd`, `moving average`.
- **Evaluation**
    - To compare the accuracy score on the return prediction.
- **Conclusion:**
    - The accuracy increases with fewer factors.
    - Logistic regression is essentially a linear model, the 3 lags of the indicators may have high collinearity, which will negatively affect the fitting result. 

In [8]:
# get the available date range to form training and testing dataset
dateRange = pd.unique(df.date)
# X_ls = df.columns[8:] # all the 15 factors
X_ls = ['volume_1','return_1', 'volatility_1', 'macd_1', 'moving_avg_1']

In [9]:
pred_df_ls = []
scaler = StandardScaler()
for i in range(training_window_size, len(dateRange)-testing_window_size, rolling_window_size):
    trainDate = dateRange[i - training_window_size: i]
    testDate = dateRange[i:i + testing_window_size]
    X_train, X_test = df.loc[df['date'].isin(trainDate)][X_ls], df.loc[df['date'].isin(testDate)]
    # normalizing the volume in each penal    
    X_train[['volume_1']] = scaler.fit_transform(X_train[['volume_1']])
    X_test[['volume_1']] = scaler.fit_transform(X_test[['volume_1']])

    y_train = np.where(df.loc[df['date'].isin(trainDate)]['return'] > 0, 1, 0)
    
    # model fitting and prediction
    model = LogisticRegression(C=1e8, random_state=0).fit(X_train, y_train)
    X_test['binaryPred'] = model.predict(X_test[X_ls])
    X_test['prediction'] = model.predict_proba(X_test[X_ls])[:, 1]
    pred_df_ls.append(X_test[['ticker', 'date', 'return', 'prediction', 'binaryPred']])
    
    print(f'Training from date {dateRange[i - training_window_size]} to date {dateRange[i-1]} starts.')
    print(f'Testing from date {dateRange[i]} to date {dateRange[i+testing_window_size-1]} finished.')
    
# extract prediction result for evaluation
pred_df = pd.concat(pred_df_ls)
pred_df.reset_index(drop=True, inplace=True)


Training from date 2013-02-07 to date 2016-01-18 starts.
Testing from date 2016-01-19 to date 2016-03-23 finished.
Training from date 2013-04-15 to date 2016-03-23 starts.
Testing from date 2016-03-24 to date 2016-05-31 finished.
Training from date 2013-06-20 to date 2016-05-31 starts.
Testing from date 2016-06-01 to date 2016-08-03 finished.
Training from date 2013-08-23 to date 2016-08-03 starts.
Testing from date 2016-08-04 to date 2016-10-11 finished.
Training from date 2013-10-30 to date 2016-10-11 starts.
Testing from date 2016-10-12 to date 2016-12-15 finished.
Training from date 2014-01-09 to date 2016-12-15 starts.
Testing from date 2016-12-16 to date 2017-02-22 finished.
Training from date 2014-03-17 to date 2017-02-22 starts.
Testing from date 2017-02-23 to date 2017-04-27 finished.
Training from date 2014-05-23 to date 2017-04-27 starts.
Testing from date 2017-04-28 to date 2017-07-04 finished.
Training from date 2014-07-28 to date 2017-07-04 starts.
Testing from date 2017-

In [10]:
# check accuracy score
y_test = np.where(pred_df['return'] > 0, 1, 0)
accuracy_score(y_test, pred_df['binaryPred'])

0.5069638473357941

#### Step3: Building daily portfolio
- To check if the strategy is robust, build daily portfolio using all the probabilities
    - Every day long stocks with high predicted probability of positive return; short stocks with low predicted probability of positive return
    - standardize daily return by dividing the book value

In [11]:
portfolioReturn = {'date':[], 'return':[]}
for date in pd.unique(pred_df.date):
    dailyPortfolio = pred_df.loc[pred_df['date']==date]
    dailyPortfolio['weight'] = dailyPortfolio['prediction'] - dailyPortfolio['prediction'].mean()
    
    dailyReturn = (dailyPortfolio['weight'] * dailyPortfolio['return']).sum() / dailyPortfolio['weight'].abs().sum()
    portfolioReturn['date'].append(date)
    portfolioReturn['return'].append(dailyReturn)

    # print(f'{date} portfolio finished building.')
portfolioReturn = pd.DataFrame(portfolioReturn)
# portfolioReturn

#### Performance evaluation
- Basic metrics: average return, standard deviation, sharpe ratio, maximum drawdown
- Regression: 3-factor regression

In [12]:
def compute_maximum_drawdown(df):
        # cumulative_return_pf = np.cumprod(1 + self.rt_sr) - 1
        def drawdown(x):
            max_retrace = 0
            peak = x.iloc[0]
            for i in range(len(x)):
                if x.iloc[i] > peak:
                    peak = x.iloc[i]
                dd = (peak - x.iloc[i]) / (1 + peak)
                if dd > max_retrace:
                    max_retrace = dd
            return max_retrace
        # cumulative return
        cum_rt = (1 + df['return']).cumprod() - 1
        return drawdown(cum_rt)

In [13]:
mktFactor = pd.read_csv('F-F_Research_Data_Factors_daily.csv', skiprows=4, skipfooter=1)
mktFactor.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
mktFactor['date']= mktFactor['date'].astype(str).str[:4] + '-' + \
                   mktFactor['date'].astype(str).str[4:6] + '-' + \
                   mktFactor['date'].astype(str).str[6:]
portfolioReturn = portfolioReturn.merge(mktFactor, on='date')
portfolioReturn.head()

Unnamed: 0,date,return,Mkt-RF,SMB,HML,RF
0,2016-01-19,0.004613,-0.19,-1.34,-0.06,0.0
1,2016-01-20,-0.003452,-0.94,1.88,-1.27,0.0
2,2016-01-21,0.000345,0.45,-0.52,-0.02,0.0
3,2016-01-22,0.007557,2.08,0.21,-0.2,0.0
4,2016-01-25,-0.003398,-1.71,-0.39,-0.99,0.0


In [14]:
avg_return = portfolioReturn['return'].mean()
return_std = portfolioReturn['return'].std()
sharpe = (portfolioReturn['return'] - portfolioReturn['RF'] / 100).mean() / portfolioReturn['return'].std()
maxDrawdown = compute_maximum_drawdown(portfolioReturn)
# portfolioReturn.to_csv('portfolioReturn.csv', index=False)
sharpe = sharpe * np.sqrt(252)
print('Average return is {:.4f}%.'.format(avg_return*100))
print('Standard deviation is {:.4f}.'.format(return_std))
print('Sharpe ratio is {:.4f}.'.format(sharpe))
print('Maximum Drawdown is {:.4f}%.'.format(maxDrawdown*100))

Average return is 0.0207%.
Standard deviation is 0.0042.
Sharpe ratio is 0.6290.
Maximum Drawdown is 9.1578%.


In [15]:
y = portfolioReturn['return'] - portfolioReturn['RF'] / 100
X = sm.add_constant(portfolioReturn[['Mkt-RF', 'SMB', 'HML']])
threeFactor = sm.OLS(y, X)
res = threeFactor.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     6.141
Date:                Fri, 20 Jan 2023   Prob (F-statistic):           0.000383
Time:                        15:56:31   Log-Likelihood:                 4951.6
No. Observations:                1219   AIC:                            -9895.
Df Residuals:                    1215   BIC:                            -9875.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0002      0.000      1.365      0.1

#### Final conclusion
- Although the average return and sharpe ratio are not satisfactory, the portfolio has relatively low risk due to low maximum drawdown and standard deviation.
- From the 3-factor regression, the portfolio return has high correlation with company size factor and company market value factor. This can be monitored during risk management to avoid market risk.
- Finally, the alpha of this portfolio is significant at 10% confidence value. This indicates that more exploration in such direction may be meaningful to create risk-free profits. 