In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide runtime and display configuration.
# NOTE: this silences every warning category for the rest of the session,
# including numerical and deprecation warnings from pandas/statsmodels.
warnings.filterwarnings('ignore')
# Keep printed frames compact: at most 15 rows and 10 columns.
pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", 10)
# Render floats with four decimals in DataFrame output.
pd.options.display.float_format = '{:.4f}'.format

In [2]:
# Load both datasets and append the squared spot price S2 = S**2 to each.
df1 = pd.read_csv("data1.csv").assign(S2=lambda frame: frame["S"] ** 2)
df2 = pd.read_csv("data2.csv").assign(S2=lambda frame: frame["S"] ** 2)
df1.head(3)

Unnamed: 0,S,K,r,q,sigma,T,Call,Put,Forward,S2
0,20,80,0.0,0.05,0.1,0.0,0.0,60.0,20.0,400
1,20,80,0.0,0.05,0.1,0.1,0.0,60.0998,19.9002,400
2,20,80,0.0,0.05,0.1,0.2,0.0,60.199,19.801,400


In [3]:
def print_range(df):
    """Print the minimum and maximum of the S, K, T, r and sigma columns.

    One line is printed per column, in that order, formatted as
    ``<column>: <min> <max>``. Nothing is returned.
    """
    for column in ("S", "K", "T", "r", "sigma"):
        values = df[column]
        print(f"{column}:", values.min(), values.max())

# **1. 試驗一 是否要加入Forward**

a. 利用data1.csv，完成以下兩個模型，觀察所有係數的p值、t統計量，並計算Forward的VIF。範例使用Call，**作業請利用Put完成。**

b. 你認為哪個模型較佳? 為什麼?

$$
\begin{aligned}
\text{model 1}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r \\
\text{model 2}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r + \beta_{6} \, \text{Forward}
\end{aligned}
$$


In [4]:
# Experiment 1 - model 1: OLS of Call on S, K, T, sigma, r plus an intercept.
test1 = df1[:]
print_range(test1)

model1 = sm.OLS(
    endog=test1["Call"],
    exog=sm.add_constant(test1[["S", "K", "T", "sigma", "r"]]),
).fit()
print(model1.summary())

S: 20 180
K: 80 120
T: 0.0 2.0
r: 0.0 0.1
sigma: 0.1 0.5
                            OLS Regression Results                            
Dep. Variable:                   Call   R-squared:                       0.848
Model:                            OLS   Adj. R-squared:                  0.848
Method:                 Least Squares   F-statistic:                 8.765e+05
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:23   Log-Likelihood:            -2.9442e+06
No. Observations:              785862   AIC:                         5.888e+06
Df Residuals:                  785856   BIC:                         5.889e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [5]:
# Experiment 1 - model 2: same regression as model 1 with Forward added
# as a sixth regressor.
model2 = sm.OLS(
    endog=test1["Call"],
    exog=sm.add_constant(test1[["S", "K", "T", "sigma", "r", "Forward"]]),
).fit()
summary = model2.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                   Call   R-squared:                       0.851
Model:                            OLS   Adj. R-squared:                  0.851
Method:                 Least Squares   F-statistic:                 7.462e+05
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:24   Log-Likelihood:            -2.9371e+06
No. Observations:              785862   AIC:                         5.874e+06
Df Residuals:                  785855   BIC:                         5.874e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.7869      0.102    105.405      0.0

$$ VIF = \frac{1}{1-R^2}$$

In [6]:
# Variance inflation factor of Forward, VIF = 1 / (1 - R^2), from the
# auxiliary regression:
#   response:   Forward
#   regressors: S, K, T, sigma, r (plus an intercept)
modelvif = sm.OLS(
    endog=test1["Forward"],
    exog=sm.add_constant(test1[["S", "K", "T", "sigma", "r"]]),
).fit()
summary = modelvif.summary()
print("* Forward VIF:", 1 / (1 - modelvif.rsquared))
print(summary)

* Forward VIF: 289.1046565315515
                            OLS Regression Results                            
Dep. Variable:                Forward   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 4.528e+07
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:24   Log-Likelihood:            -1.9282e+06
No. Observations:              785862   AIC:                         3.857e+06
Df Residuals:                  785856   BIC:                         3.857e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.12

# **2. 試驗二 是否要加入S^2**

* 2.1 S 範圍窄
* 2.2 S 範圍廣


## 2.1 S範圍窄

a. 利用data2.csv，該資料集S介於100~102，完成以下兩個模型，觀察所有係數的p值、t統計量，並計算S^2的VIF。範例使用Call，**作業請利用Put完成。**

b. 觀察S的beta係數，在加入S2前後有什麼變化，數值是否合理? 為什麼?

c. 你認為哪個模型較佳? 為什麼?


$$
\begin{aligned}
\text{model 1}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r \\
\text{model 2}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r + \beta_{6} S^2
\end{aligned}
$$


In [7]:
# Experiment 2.1 (narrow S range) - model 1: OLS of Call on S, K, T, sigma, r
# plus an intercept, fitted on the data2 sample.
test2 = df2[:]
print(test2.shape)
print_range(test2)
model1 = sm.OLS(
    endog=test2["Call"],
    exog=sm.add_constant(test2[["S", "K", "T", "sigma", "r"]]),
).fit()
summary = model1.summary()
print(summary)

(261954, 10)
S: 100.0 102.0
K: 80 120
T: 0.0 2.0
r: 0.0 0.1
sigma: 0.1 0.5
                            OLS Regression Results                            
Dep. Variable:                   Call   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.903
Method:                 Least Squares   F-statistic:                 4.877e+05
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:24   Log-Likelihood:            -6.1035e+05
No. Observations:              261954   AIC:                         1.221e+06
Df Residuals:                  261948   BIC:                         1.221e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------

In [8]:
# Experiment 2.1 - model 2: same regression as model 1 with the squared
# spot price S2 added as a sixth regressor.
model2 = sm.OLS(
    endog=test2["Call"],
    exog=sm.add_constant(test2[["S", "K", "T", "sigma", "r", "S2"]]),
).fit()
summary = model2.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                   Call   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.903
Method:                 Least Squares   F-statistic:                 4.064e+05
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:24   Log-Likelihood:            -6.1035e+05
No. Observations:              261954   AIC:                         1.221e+06
Df Residuals:                  261947   BIC:                         1.221e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         48.8164    140.311      0.348      0.7

In [9]:
# Variance inflation factor of S2, VIF = 1 / (1 - R^2), from the auxiliary
# regression:
#   response:   S2
#   regressors: S, K, T, sigma, r (plus an intercept)
modelvif = sm.OLS(
    endog=test2["S2"],
    exog=sm.add_constant(test2[["S", "K", "T", "sigma", "r"]]),
).fit()
summary = modelvif.summary()
print("VIF:", 1 / (1 - modelvif.rsquared))
print(summary)

VIF: 130783.05128155186
                            OLS Regression Results                            
Dep. Variable:                     S2   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 6.852e+09
Date:                Thu, 29 Feb 2024   Prob (F-statistic):               0.00
Time:                        03:26:24   Log-Likelihood:                -99128.
No. Observations:              261954   AIC:                         1.983e+05
Df Residuals:                  261948   BIC:                         1.983e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -1.02e+04      0

## 2.2 S範圍廣

a. 利用 data1.csv，再做一次，Y使用Put完成以下兩個模型，觀察所有係數的p值、t統計量，並計算S^2的VIF。

b. 你認為哪個模型較佳? 為什麼?



$$
\begin{aligned}
\text{model 1}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r \\
\text{model 2}: \quad C, P &= \beta_{0} + \beta_{1} S + \beta_{2} K + \beta_{3} T + \beta_{4} \sigma + \beta_{5} r + \beta_{6} S^2
\end{aligned}
$$


In [10]:
# Experiment 2 - model 1 (wide S range): point test2 at the data1 sample and
# report its shape and variable ranges.
# NOTE(review): no model is fitted in this cell; only the sample is prepared.
test2 = df1[:]
print(test2.shape)
print_range(test2)

(785862, 10)
S: 20 180
K: 80 120
T: 0.0 2.0
r: 0.0 0.1
sigma: 0.1 0.5


In [11]:
# Experiment 2 - model 2: add S2 (this cell contains no code)


In [12]:
# Compute the VIF (this cell contains no code)


# **3. 試驗三 TXO sigma選擇**

利用**TXO Put**資料，篩選樣本固定K=11000、T>= 0.1，完成以下七種模型(三種波動率組合放進去X)，計算全部X的VIF，綜合考量VIF、共線性、R2，你認為哪個模型較佳?

<img src="試驗三.png">

## Put

In [14]:
# Load the TXO put quotes, parse the Date column, and keep only the rows
# with strike K == 11000 and T >= 0.1.
TXO = pd.read_csv("TXOPut.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)
filt = TXO.query("K==11000 & T >= 0.1")
print(filt.shape)
filt.head(2)

(997, 11)


Unnamed: 0,Put,S,HV,IV,T,...,K,Type,Date,TVIX,S2
69,18.0,12100.48,0.1079,0.167,0.1342,...,11000.0,P,2020-01-02,0.1432,146421616.2304
70,42.0,12100.48,0.1079,0.1637,0.211,...,11000.0,P,2020-01-02,0.1432,146421616.2304
