# Assignment 1

##### Created by Qihang Ma -- 2023.01.25

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew, t, ttest_1samp
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import minimize 
import warnings
warnings.filterwarnings("ignore")

## Problem 1 - Test whether the skewness and kurtosis functions are biased

STEPS:
1. Sample 100, 1,000, 10,000 standardized random normal values.
2. Calculate the skewness, kurtosis
3. Sample the skewness, kurtosis by repeating steps 1 and 2 1000 times.
4. Calculate the mean kurtosis and standard deviation.
5. Run a one-sample t-test of the sampled values against 0. If the p-value is lower than the threshold (typically 5%), reject the hypothesis that the skewness/kurtosis function is unbiased.

*Show that the skewness and kurtosis functions are biased when the sample is small and (approximately) unbiased when the sample is large.

In [2]:
def TestSkewness(sample_num):
    """Check by simulation whether the sample skewness of normal data averages 0.

    Draws 1000 independent standard-normal samples of size ``sample_num``,
    computes ``scipy.stats.skew`` (default arguments) on each, prints the
    mean and standard deviation of those 1000 values, then prints the
    p-value of a one-sample t-test of them against 0 together with the
    decision at the 5% level.

    Parameters
    ----------
    sample_num : int
        Number of observations in each simulated sample.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    n_trials = 1000
    significance = 0.05

    # One skewness estimate per simulated sample.
    estimates = np.array(
        [skew(np.random.normal(0, 1, sample_num)) for _ in range(n_trials)]
    )

    print("The mean Skewness is {}.\nThe standard deviation is {}." .format(np.mean(estimates), np.std(estimates)))

    # H0: the expected value of the skewness estimate is 0.
    _, p_val = ttest_1samp(estimates, popmean=0)

    # Only a p-value strictly below the level rejects; anything else
    # (including NaN) falls through to "fail to reject".
    message = (
        "For skewness, the calculated p-value is {}.\nReject the null hypothesis.\n"
        if p_val < significance
        else "For skewness, the calculated p-value is {}.\nFail to reject the null hypothesis.\n"
    )
    print(message.format(p_val))
    return

def TestKurtosis(sample_num):
    """Check by simulation whether the sample kurtosis of normal data averages 0.

    Draws 1000 independent standard-normal samples of size ``sample_num``,
    computes ``scipy.stats.kurtosis`` (default arguments) on each, prints the
    mean and standard deviation of those 1000 values, then prints the
    p-value of a one-sample t-test of them against 0 together with the
    decision at the 5% level.

    Parameters
    ----------
    sample_num : int
        Number of observations in each simulated sample.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    n_trials = 1000
    significance = 0.05

    # One kurtosis estimate per simulated sample.
    estimates = np.array(
        [kurtosis(np.random.normal(0, 1, sample_num)) for _ in range(n_trials)]
    )

    print("The mean kurtosis is {}.\nThe standard deviation is {}." .format(np.mean(estimates), np.std(estimates)))

    # H0: the expected value of the kurtosis estimate is 0.
    _, p_val = ttest_1samp(estimates, popmean=0)

    # Only a p-value strictly below the level rejects; anything else
    # (including NaN) falls through to "fail to reject".
    message = (
        "For kurts, the calculated p-value is {}.\nReject the null hypothesis.\n"
        if p_val < significance
        else "For kurts, the calculated p-value is {}.\nFail to reject the null hypothesis.\n"
    )
    print(message.format(p_val))
    return

In [3]:
# Run both bias checks once for every sample size under study.
sample = [100, 1000, 10000]
for sample_num in sample:
    print("For {} samples: " .format(sample_num))
    TestSkewness(sample_num)
    TestKurtosis(sample_num)

For 100 samples: 


NameError: name 'kurts' is not defined

## Problem 2 - OLS, MLE, Comparison

Fit the data in problem2.csv using OLS and calculate the error vector. Look at its distribution. How well does it fit the assumption of normally distributed errors?

Fit the data using MLE given the assumption of normality. Then fit the MLE using the assumption of a T distribution of the errors. Which is the best fit?

What are the fitted parameters of each and how do they compare? What does this tell us about the breaking of the normality assumption in regards to expected values in this case? 

In [None]:
# Load the regression data and append an all-ones column, which serves as
# the intercept term of the design matrix.
data = pd.read_csv('problem2.csv').assign(constant=1)

# Response, single regressor, and design matrix [constant, x].
y = data['y']
x = data['x']
X = data[['constant', 'x']]

### Fit with OLS

In [None]:
# Ordinary least squares of y on the design matrix X (constant + x).
ols_model = sm.OLS(endog=y, exog=X).fit()
# Left as the last expression so the notebook displays the summary table.
ols_model.summary()

In [None]:
# Residual (error) vector of the OLS fit.
error = ols_model.resid

# Histogram of the residuals, normalised so it is comparable to a density.
plt.hist(error, bins = 20, density = True)

# Overlay a normal density for visual comparison.  The bare name `norm` is
# not imported in this notebook, so go through the imported `scipy.stats`
# module instead (the original `norm.pdf` raised NameError).
# NOTE(review): the overlay is the standard normal N(0, 1); if the intent is
# to compare against a normal with the residuals' own spread, pass
# `np.std(error)` as the scale -- confirm.
x_axis = np.linspace(-6, 6, 100)
plt.plot(x_axis, stats.norm.pdf(x_axis, 0, 1))

print("The Skewness of error is {}.\nThe Kurtosis of error is {}." .format(skew(error),kurtosis(error)))

### Fit with MLE given the assumption of normality 

In [None]:
# Objective for the normal-error maximum-likelihood fit.
def MLE_Norm(params, x, y):
    """Negative log-likelihood of ``y`` under a linear model with normal errors.

    Parameters
    ----------
    params : sequence
        ``params[0]`` is the intercept, ``params[1]`` the slope and
        ``params[2]`` the standard deviation of the normal errors.
    x : array-like
        Regressor values.
    y : array-like
        Observed responses.

    Returns
    -------
    float
        ``-sum(log N(y | intercept + slope * x, sigma))``; negated so that a
        minimiser maximises the likelihood.
    """
    intercept, slope, sigma = params[0], params[1], params[2]
    fitted = intercept + slope * x
    log_density = stats.norm.logpdf(y, loc=fitted, scale=sigma)
    return -np.sum(log_density)

In [None]:
# Minimise the normal negative log-likelihood starting from
# (intercept, slope, sigma) = (1, 1, 1); x and y are forwarded to MLE_Norm.
results_norm = minimize(fun=MLE_Norm, x0=(1, 1, 1), args=(x, y))
# Left as the last expression so the notebook displays the optimiser result.
results_norm

### Fit with MLE given the assumption of t-distribution 

In [None]:
# Objective for the t-distributed-error maximum-likelihood fit.
def MLE_T(params, x, y):
    """Negative log-likelihood of the residuals under a scaled Student-t law.

    Parameters
    ----------
    params : sequence
        ``params[0]`` is the intercept, ``params[1]`` the slope,
        ``params[2]`` the degrees of freedom and ``params[3]`` the scale of
        the t distribution applied to the residuals.
    x : array-like
        Regressor values.
    y : array-like
        Observed responses.

    Returns
    -------
    float
        ``-sum(log t(y - (intercept + slope * x); df, scale))``; negated so
        that a minimiser maximises the likelihood.
    """
    intercept, slope = params[0], params[1]
    dof, spread = params[2], params[3]
    residuals = y - (intercept + slope * x)
    log_density = stats.t.logpdf(residuals, dof, scale=spread)
    return -np.sum(log_density)

In [None]:
# Minimise the t negative log-likelihood starting from an all-ones guess for
# (intercept, slope, degrees of freedom, scale); x and y go to MLE_T.
results_t = minimize(fun=MLE_T, x0=(1, 1, 1, 1), args=(x, y))
# Left as the last expression so the notebook displays the optimiser result.
results_t

### Goodness of fit

In [None]:
def R_square(x, y, intercept, beta):
    """Coefficient of determination of the line ``intercept + beta * x``.

    Computes ``R^2 = 1 - SS_res / SS_tot`` with
    ``SS_res = sum((y - y_hat) ** 2)`` and ``SS_tot = sum((y - mean(y)) ** 2)``.

    Parameters
    ----------
    x : array-like
        Regressor values.
    y : array-like
        Observed responses.
    intercept : float
        Intercept of the fitted line.
    beta : float
        Slope of the fitted line.

    Returns
    -------
    float
        The R-squared value; it is negative when the line fits worse than
        the constant ``mean(y)``.
    """
    y_predicted = intercept + beta * x
    residuals = y - y_predicted

    ss_tot = np.sum((y - np.mean(y)) ** 2)
    # The residual sum of squares uses the raw residuals.  Centring them on
    # their own mean would silently discount any systematic offset of the
    # fitted line and overstate R^2 for fits whose residuals do not average
    # to zero.
    ss_res = np.sum(residuals ** 2)

    return 1 - (ss_res / ss_tot)

In [None]:
# R-square of each MLE fit; the first two optimiser parameters are the
# intercept and the slope, in that order.
r_square_Norm = R_square(x, y, *results_norm.x[:2])
r_square_T = R_square(x, y, *results_t.x[:2])

print("The R-square for model fitted with MLE given the assumption of Normal Distribution is {}." .format(r_square_Norm))
print("The R-square for model fitted with MLE given the assumption of T Distribution is {}." .format(r_square_T))

### Information Criteria

In [None]:
def cal_info_criteria(x, k, loglik):
    """Return the ``(AIC, BIC)`` pair for a fitted model.

    Parameters
    ----------
    x : sized
        The sample; only ``len(x)`` (the number of observations) is used.
    k : int
        Number of parameters counted in the penalty term.
    loglik : float
        Added to the penalty with a factor of two.  NOTE: despite the name,
        this must be the *negative* log-likelihood for the results to be the
        usual AIC/BIC; the callers in this notebook pass the minimised
        objective ``results.fun``, which is a negative log-likelihood.

    Returns
    -------
    tuple
        ``(2 * k + 2 * loglik, k * log(len(x)) + 2 * loglik)``.
    """
    n_obs = len(x)
    fit_term = 2 * loglik

    aic = 2 * k + fit_term
    bic = k * np.log(n_obs) + fit_term
    return aic, bic

In [None]:
# The penalty term must count every parameter the optimiser estimated, not
# just intercept and slope: the normal fit has 3 (intercept, slope, sigma)
# and the t fit has 4 (intercept, slope, degrees of freedom, scale).  Using
# the same k for both would hide the t model's extra flexibility.
k_norm = len(results_norm.x)
k_t = len(results_t.x)

# `.fun` is the minimised negative log-likelihood of each fit.
AIC_N, BIC_N = cal_info_criteria(x, k_norm, results_norm.fun)
AIC_T, BIC_T = cal_info_criteria(x, k_t, results_t.fun)
print("AIC for Normal distribution is: {}. BIC: {}".format(AIC_N, BIC_N))
print("AIC for T distribution is: {}. BIC: {}".format(AIC_T, BIC_T))

## Problem 3 - AR, MA Simulation

Simulate AR(1) through AR(3) and MA(1) through MA(3) processes. Compare their ACF and PACF graphs. How do the graphs help us to identify the type and order of each process?

In [None]:
# Fix the global NumPy seed so the six simulated series are reproducible;
# the series are generated in the order AR(1..3), MA(1..3).
np.random.seed(0)
nsample = 3000

# Each coefficient vector includes the zero-lag term (the leading 1).  A pure
# AR process uses the trivial MA vector [1], and vice versa.
# NOTE(review): statsmodels documents that `ar` holds the lag-polynomial
# coefficients, i.e. AR parameters enter with the opposite sign, so
# ar=[1, 0.6] would correspond to y_t = -0.6 * y_{t-1} + e_t -- confirm this
# sign is what is intended.

# AR(1)
ar1 = np.array([1, 0.6])
ma1 = np.array([1])
ar1_data = sm.tsa.arma_generate_sample(ar1, ma1, nsample)

# AR(2)
ar2 = np.array([1, 0.6, 0.3])
ma2 = np.array([1])
ar2_data = sm.tsa.arma_generate_sample(ar2, ma2, nsample)

# AR(3)
ar3 = np.array([1, 0.6, 0.3, 0.2])
ma3 = np.array([1])
ar3_data = sm.tsa.arma_generate_sample(ar3, ma3, nsample)

# MA(1)
ar4 = np.array([1])
ma4 = np.array([1, 0.6])
ma1_data = sm.tsa.arma_generate_sample(ar4, ma4, nsample)

# MA(2)
ar5 = np.array([1])
ma5 = np.array([1, 0.6, 0.3])
ma2_data = sm.tsa.arma_generate_sample(ar5, ma5, nsample)

# MA(3)
ar6 = np.array([1])
ma6 = np.array([1, 0.6, 0.3, 0.2])
ma3_data = sm.tsa.arma_generate_sample(ar6, ma6, nsample)

# Plot the raw series, ACF and PACF for each process: one row per process,
# columns are (series, ACF, PACF).
processes = [
    ("AR(1)", ar1_data),
    ("AR(2)", ar2_data),
    ("AR(3)", ar3_data),
    ("MA(1)", ma1_data),
    ("MA(2)", ma2_data),
    ("MA(3)", ma3_data),
]

fig, axs = plt.subplots(len(processes), 3, figsize=(30,45))
# The loop variable is deliberately not called `data`, so the Problem 2
# DataFrame of that name is not overwritten when this cell runs.
for row, (label, series) in enumerate(processes):
    axs[row, 0].plot(series)
    # zero=False drops lag 0, whose correlation is always 1.
    sm.graphics.tsa.plot_acf(series, lags=10, zero=False, ax=axs[row, 1])
    sm.graphics.tsa.plot_pacf(series, lags=10, zero=False, ax=axs[row, 2])
    # Title each panel with the process type/order instead of a bare index.
    axs[row, 0].set_title(f'Simulated data for {label}')
    axs[row, 1].set_title(f'ACF for {label}')
    axs[row, 2].set_title(f'PACF for {label}')
plt.show()
