In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize)

# default library imports

In [2]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the Smarket data set that ships with ISLP and display it.
# Columns (see output): Year, Lag1-Lag5, Volume, Today, Direction.
Smarket = load_data("Smarket")
Smarket

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [4]:
# List the column names to see which variables are available.
Smarket.columns

Index(['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today',
       'Direction'],
      dtype='object')

In [5]:
# NOTE: this call raises ValueError because the string-valued `Direction`
# column cannot be converted to float. The failure is kept as a demonstration.
Smarket.corr()

ValueError: could not convert string to float: 'Up'

- `Smarket.corr()` raises an error here because `Direction` is a qualitative (string-valued) variable, so it cannot be converted to a number.
- To work around this, I drop the `Direction` column and compute the correlation matrix on the remaining numeric columns.

In [6]:
# Correlation matrix of the numeric columns: the string-valued `Direction`
# column is removed first because `corr()` cannot convert it to float.
df = Smarket.drop("Direction", axis="columns")
df.corr()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


In [7]:
# Build the model: logistic regression of Direction on Lag1-Lag5 and Volume,
# fitted on the full data set.

# ModelSpec expects the predictor *names*, so pass the column labels rather
# than a copy of the data itself.
allvars = Smarket.columns.drop(["Year", "Today", "Direction"])
X = MS(allvars).fit_transform(Smarket)

# Boolean response: True on days whose Direction is "Up".
y = Smarket["Direction"] == "Up"

glm = sm.GLM(y, X, family=sm.families.Binomial())
result = glm.fit()

# Coefficient table (coef, std err, z, p-value) is the cell's output.
summarize(result)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-0.126,0.241,-0.523,0.601
Lag1,-0.0731,0.05,-1.457,0.145
Lag2,-0.0423,0.05,-0.845,0.398
Lag3,0.0111,0.05,0.222,0.824
Lag4,0.0094,0.05,0.187,0.851
Lag5,0.0103,0.05,0.208,0.835
Volume,0.1354,0.158,0.855,0.392


In [8]:
# Fitted coefficients at full precision (the summary table above is rounded).
result.params

intercept   -0.126000
Lag1        -0.073074
Lag2        -0.042301
Lag3         0.011085
Lag4         0.009359
Lag5         0.010313
Volume       0.135441
dtype: float64

In [9]:
# p-values for each coefficient at full precision.
result.pvalues

intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64

In [10]:
# Called without `exog`, predict() scores the data the model was fitted on.
# Since y is True for "Up", these are fitted probabilities of an "Up" day.
probs = result.predict()
# Show only the first ten values.
probs[:10]

array([0.50708413, 0.48146788, 0.48113883, 0.51522236, 0.51078116,
       0.50695646, 0.49265087, 0.50922916, 0.51761353, 0.48883778])

In [11]:
# Convert fitted probabilities to class labels using a 0.5 cutoff:
# "Up" when the probability exceeds 0.5, otherwise "Down".
labels = np.where(probs > 0.5, "Up", "Down")
labels

array(['Up', 'Down', 'Down', ..., 'Up', 'Up', 'Up'],
      shape=(1250,), dtype='<U4')

In [12]:
# Cross-tabulate predicted labels (rows) against the true Direction (columns).
# These are training-set predictions: the model was fitted on the same rows.
confusion_table(labels , Smarket['Direction'])

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,145,141
Up,457,507


In [13]:
# Training accuracy computed two ways, as a cross-check:
#   1. by hand from the confusion-table diagonal above
#      (145 correct "Down" + 507 correct "Up", out of 1250 rows);
#   2. as the mean of the element-wise match between labels and Direction.
# The literals are copied from the previous output; update them if it changes.
(145 + 507) / 1250 , np.mean(labels == Smarket['Direction'])

(0.5216, np.float64(0.5216))

In [14]:
# Hold out 2005: rows before 2005 form the training set, the rest the test set.
train = Smarket["Year"] < 2005
Smarket_train, Smarket_test = Smarket.loc[train], Smarket.loc[~train]
Smarket_test

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
998,2005,-0.134,0.008,-0.007,0.715,-0.431,0.78690,-0.812,Down
999,2005,-0.812,-0.134,0.008,-0.007,0.715,1.51080,-1.167,Down
1000,2005,-1.167,-0.812,-0.134,0.008,-0.007,1.72100,-0.363,Down
1001,2005,-0.363,-1.167,-0.812,-0.134,0.008,1.73890,0.351,Up
1002,2005,0.351,-0.363,-1.167,-0.812,-0.134,1.56910,-0.143,Down
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [15]:
# Refit the logistic regression on the pre-2005 rows only, then score the
# held-out 2005 rows.
X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]

glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
result = glm_train.fit()

# No mid-cell summarize() call: only a cell's last expression is displayed,
# so its return value would be discarded. To inspect the coefficients, run
# `summarize(result)` as the final line of its own cell.

# Predicted probability of "Up" for each test-period day.
probs = result.predict(exog=X_test)

In [16]:
# True Up/Down labels, split with the same boolean mask as the predictors.
D = Smarket["Direction"]
D_train = D.loc[train]
D_test = D.loc[~train]

In [17]:
# Label each test day "Up" when its predicted probability exceeds 0.5,
# otherwise "Down". The result takes its length from `probs`, so no test-set
# size is hard-coded and the cell stays correct if the split changes.
labels = np.where(probs > 0.5, "Up", "Down")

# Predicted (rows) vs. true (columns) direction on the held-out 2005 data.
confusion_table(labels, D_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [18]:
# Test-set accuracy and its complement, the test error rate.
np.mean(labels == D_test) , np.mean(labels != D_test)

(np.float64(0.4801587301587302), np.float64(0.5198412698412699))

In [19]:
# Smaller model: keep only Lag1 and Lag2 as predictors (plus the intercept
# that ModelSpec adds), refit on the pre-2005 rows and score 2005.
X = MS(['Lag1', 'Lag2']).fit_transform(Smarket)

X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]

glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
result = glm_train.fit()

# No mid-cell summarize() call: only a cell's last expression is displayed,
# so its return value would be discarded. To inspect the coefficients, run
# `summarize(result)` as the final line of its own cell.

# Predicted probability of "Up" for each test-period day.
probs = result.predict(exog=X_test)

In [20]:
# True Up/Down labels split by the train mask.
# NOTE(review): this repeats the earlier Direction split verbatim; neither
# `train` nor `Smarket` changed in between, so the values are identical and
# this cell is redundant.
D = Smarket.Direction
D_train , D_test = D.loc[train] , D.loc[~train]

In [21]:
# Label each test day "Up" when its predicted probability exceeds 0.5,
# otherwise "Down". The result takes its length from `probs`, so no test-set
# size is hard-coded and the cell stays correct if the split changes.
labels = np.where(probs > 0.5, "Up", "Down")

# Predicted (rows) vs. true (columns) direction on the held-out 2005 data.
confusion_table(labels, D_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,35,35
Up,76,106


In [22]:
# Test-set accuracy and error rate for the Lag1 + Lag2 model.
np.mean(labels == D_test) , np.mean(labels != D_test)

(np.float64(0.5595238095238095), np.float64(0.44047619047619047))

In [23]:
# Predict the probability of "Up" for two hypothetical days given as raw
# (Lag1, Lag2) values. A dedicated name is used for them so that `X_test`
# keeps holding the 2005 design matrix that lines up with `D_test`.
new_obs = pd.DataFrame({'Lag1': [1.2, 1.5],
                        'Lag2': [1.1, -0.8]})

# Fit the design spec on Smarket, then apply it to the new rows so they get
# the same columns (including the intercept) the fitted model expects.
model = MS(['Lag1', 'Lag2']).fit(Smarket)
test_data = model.transform(new_obs)
result.predict(test_data)


0    0.479146
1    0.496094
dtype: float64

In [24]:
# Linear discriminant analysis classifier (sklearn). store_covariance=True
# asks the estimator to keep its covariance estimate after fitting.
lda = LDA(store_covariance=True)

In [25]:
# Prepare the LDA inputs: same pre-2005 / 2005 split, Lag1 and Lag2 only.
train = Smarket.Year < 2005

# Response labels for each half of the split.
D = Smarket.Direction
D_train, D_test = D.loc[train], D.loc[~train]

# ModelSpec adds an 'intercept' column; it is dropped before fitting the
# sklearn estimator.
X = MS(['Lag1', 'Lag2']).fit_transform(Smarket)
X_train = X.loc[train].drop(columns=['intercept'])
X_test = X.loc[~train].drop(columns=['intercept'])

lda.fit(X_train, D_train)

0,1,2
,solver,'svd'
,shrinkage,
,priors,
,n_components,
,store_covariance,True
,tol,0.0001
,covariance_estimator,


In [26]:
# Per-class means of the predictors: one row per class, one column per
# feature (Lag1, Lag2). Rows presumably follow the order of lda.classes_.
lda.means_

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [27]:
# Class labels in the order used by the estimator's per-class attributes.
lda.classes_

array(['Down', 'Up'], dtype='<U4')

In [28]:
# Class prior probabilities. No priors were passed to LDA(), so these are
# presumably the class proportions in the training labels.
lda.priors_

array([0.49198397, 0.50801603])

In [29]:
# Weights on (Lag1, Lag2) defining the single linear discriminant direction
# (one column, since there are two classes).
lda.scalings_

array([[-0.64201904],
       [-0.51352928]])

In [30]:
# Predict the class of each 2005 test day and compare with the truth.
lda_pred = lda.predict(X_test)
confusion_table(lda_pred , D_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,35,35
Up,76,106


In [31]:
# Posterior class probabilities for the test days; column 1 corresponds to
# 'Up' (second entry of lda.classes_).
lda_prob = lda.predict_proba(X_test)

# Sanity check: thresholding P(Up) at 0.5 reproduces lda.predict().
(np.where(lda_prob[:, 1] >= 0.5, 'Up', 'Down') == lda_pred).all()

np.True_

In [32]:
# Sanity check: picking the class with the largest posterior probability in
# each row reproduces lda.predict().
np.all(lda.classes_[np.argmax(lda_prob, axis=1)] == lda_pred)

np.True_

In [34]:
# Quadratic discriminant analysis on the same training predictors and labels
# used for LDA above. store_covariance=True keeps the covariance estimates
# so they can be inspected after fitting.
qda = QDA(store_covariance=True)
qda.fit(X_train, D_train)

0,1,2
,priors,
,reg_param,0.0
,store_covariance,True
,tol,0.0001


In [35]:
# Per-class predictor means and class priors; the values shown match the
# LDA fit above.
qda.means_ , qda.priors_

(array([[ 0.04279022,  0.03389409],
        [-0.03954635, -0.03132544]]),
 array([0.49198397, 0.50801603]))

In [37]:
# Covariance matrix estimated for the first class (index 0). Presumably this
# is 'Down', assuming qda.classes_ is ordered like lda.classes_ (TODO confirm).
qda.covariance_[0]

array([[ 1.50662277, -0.03924806],
       [-0.03924806,  1.53559498]])

In [39]:
# Predict the class of each 2005 test day with QDA and compare with the truth.
qda_pred = qda.predict(X_test)
confusion_table(qda_pred , D_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,30,20
Up,81,121


In [41]:
# Test-set accuracy of the QDA predictions.
np.mean(qda_pred == D_test)

np.float64(0.5992063492063492)