In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.random import normal
from numpy.linalg import norm
from scipy.stats import beta
from scipy.special import digamma
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, root_mean_squared_error, r2_score
from sklearn.linear_model import SGDRegressor

from RMSProp import SGD_RMSProp
from BetaRegression import *

# Проверка SGD RMSProp на данных в модели бета-регрессии 

In [2]:
df = pd.read_csv("./Data/observations.csv", sep=";").drop("Cover_class", axis = 1)
df.head()

Unnamed: 0,Site_id,Visit_id,Species_id,Cover
0,4,6345,2338,2
1,4,16199,2338,3
2,4,28382,2338,3
3,4,40703,2338,2
4,4,49283,2338,1


In [3]:
X, y = df.drop("Cover", axis=1), df["Cover"] / 100
nrow, ncol = X.shape

### Стандартизация данных

In [4]:
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

### Бета-регрессия:

In [None]:
%%time

np.random.seed(42)

# beta_start = normal(0, 1, ncol) ** 2
# phi_start = normal(0, 1, 1) ** 2
mean_y_train = y_train.mean()
phi_y_train = (mean_y_train * (1 - mean_y_train)) / y_train.var() - 1

beta_start = normal(0, 1, 4)
phi_start = phi_y_train
start_point = np.append(beta_start, phi_start)

X_train_ones = np.hstack([X_train_s, np.ones((y_train.size, 1))])
X_test_ones = np.hstack([X_test_s, np.ones((y_test.size, 1))])

beta_res = SGD_RMSProp(
    start=start_point,
    X=X_train_ones,
    y=y_train.to_numpy(),
    L_grad=beta_illh_grad,
    L=beta_inv_log_likelihood,
    batch_size=50,
    use_epoch = True,
    learning_rate=0.01,
    decay_rate=0.9,
    max_iter=10000,
    tol=1e-4,
    link_inverse=logit_inverse,
    link_deriv=logit_deriv,
)

CPU times: user 305 ms, sys: 920 μs, total: 306 ms
Wall time: 312 ms


In [6]:
coef = beta_res["point"][:-1]
phi = beta_res["point"][-1]
L_model = -beta_res["L_value"]

print(f"""
coef: {coef}
phi: {phi}
gradient norm: {norm(beta_res["grad_value"])}
loss function value: {beta_res["L_value"]}
iteratons: {beta_res["iterations"]}
""")


coef: [ 0.00424009 -0.00172009 -0.24513978 -1.69793853]
phi: 3.2336158881603354
gradient norm: 4.813875940313717
loss function value: -4694.91699669568
iteratons: 4488



### Коэффициенты и минус логарифм функции правдоподобия для них, полученные функцией `betareg` в R
Параметры `betareg` были взяты по умолчанию.

In [None]:
R_betareg_params = np.array([0.0006665, 0.0074393, -0.2369492, -1.7240728, 3.3285])
good_beta_means_pred = logit_inverse(X_test_ones, R_betareg_params[:-1])
print(f"""coef: {R_betareg_params[:-1]}
phi: {R_betareg_params[-1]}
loss function value: {beta_inv_log_likelihood(R_betareg_params, X_train_ones, y_train, link_inverse=logit_inverse)}
""")

coef: [ 6.6650000e-04  7.4393000e-03 -2.3694920e-01 -1.7240728e+00]
phi: 3.3285
loss function value: -4693.180611309426



### Проверка качества модели
1.  **Псевдо $R^2$**:
    * Cox and Snell: $$ R^2_{CS} = 1 - \exp\left( \frac{2}{n} (L_0 - L_M) \right).$$
    * McFadden: $$R^2_{MF} = 1 - \frac{L_M}{L_0}.$$

$L_0$ - логарифм функции правдоподобия модели с константными предсказаниями ($y_i = \overline{y}_\text{train}$ для всех $i$, $\varphi$ оценивается по тренировочной выборке).\
$L_M$ - логарифм функции правдоподобия полученной модели.

2.  **RMSE**:
$$
    \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - y_i^*)^2},
$$
$y_i$ - значение признака, $y_i^*$ - его предсказание по модели.

In [None]:
# Предсказания средних на тестовой выборке
beta_means_pred = logit_inverse(X_test_ones, coef)
beta_var_pred = beta_means_pred * (1 - beta_means_pred) / (1 + phi)


In [None]:
L0 = np.sum(
    np.log(gamma(phi_y_train))
    - np.log(gamma(mean_y_train * phi_y_train))
    - np.log(gamma(phi_y_train * (1 - mean_y_train)))
    + (mean_y_train * phi_y_train - 1) * np.log(y_test)
    + (phi_y_train * (1 - mean_y_train * phi_y_train) - 1) * np.log(1 - y_test)
)

L_beta = beta_log_likelihood(parameters=beta_res["point"], X=X_test_ones, y=y_test, link_inverse=logit_inverse)
pseudo_r2_mf = 1 - L_beta / L0
pseudo_r2_cs = 1 - np.exp(2 / y_test.size * (L0 - L_beta))

print(f'Pseudo R^2 (MF) = {pseudo_r2_mf}\nPseudo R^2 (CS) = {pseudo_r2_cs}')

Pseudo R^2 (MF) = -0.16608770005929951
Pseudo R^2 (CS) = 0.2584963953251541


In [16]:
print(f"""Beta prediction RMSE = {root_mean_squared_error(y_test, beta_means_pred)}
R betareg prediction RMSE = {root_mean_squared_error(y_test, good_beta_means_pred)}
Mean prediction RMSE = {root_mean_squared_error(y_test, np.repeat(y_train.mean(), y_test.size))}
""")

Beta prediction RMSE = 0.20554915737626148
R betareg prediction RMSE = 0.20556753591460283
Mean prediction RMSE = 0.21217523641176173



In [64]:
# Остатки Пирсона модели на тестовой выборке (учитывают гетероскедастичность)
beta_resids = beta_means_pred - y_test
beta_resids_pearson = beta_resids / np.sqrt(beta_var_pred)

print(f"""Mean of residuals: {beta_resids_pearson.mean()}
Variance of residuals: {beta_resids_pearson.var()}
Quantiles of residuals: {np.quantile(beta_resids_pearson, [0, 0.25, 0.5, 0.75, 1])}
""")

Mean of residuals: 0.11247243652425895
Variance of residuals: 1.2592182722710008
Quantiles of residuals: [-4.23899052 -0.02116896  0.57716173  0.83889686  0.90158623]



In [39]:
# mu = beta_means_pred
# a, b = mu * phi, (1 - mu) * phi
# fig, ax = plt.subplots(1, 1)
# x = np.linspace(beta.ppf(0, a, b),
#                 beta.ppf(1, a, b), 100)
# ax.plot(x, beta.pdf(x, a, b), 'r-', lw=0.1, alpha=0.6);

### Линейная регрессия

In [65]:
lin_model = SGDRegressor(alpha=0, learning_rate="adaptive", eta0=0.01, max_iter=6000, tol=1e-4, random_state=42)

lin_model.fit(X_train_s, y_train)

In [69]:
lin_pred = lin_model.predict(X_test_s)
print(f"RMSE for linear model: {root_mean_squared_error(y_test, lin_pred)}")

RMSE for linear model: 0.2018980189709539


In [70]:
x_new = np.array([2, -1, 4, 1]).reshape((1, 4))
print(f"""Linear model prediction: {lin_model.predict(x_new[:,:-1])}
Beta regression model prediction: {logit_inverse(x_new, coef)}
""")

Linear model prediction: [-0.11559332]
Beta regression model prediction: [0.06487018]

