In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn import linear_model

In [None]:
# Fix the global NumPy RNG seed so every random draw below is reproducible
# when the notebook is re-run from a fresh kernel.
np.random.seed(23)

### Heteroskedasticnost
Heteroskedasticnost je pojava kod koje varijansa ciljne promenljive nije konstantna, vec se menja sa vrednoscu atributa (na primer, raste sa porastom vrednosti atributa).

<img src='heteroscedasticity.jpg'>

Osnovna pretpostavka u modelovanju linearnom regresijom je da je varijansa ciljne promenljive konstantna tj. da je skup podataka sa kojim se radi **homoskedastican**:

<img src='homoscedasticity.png'>

In [None]:
# Number of samples in every synthetic dataset generated in this notebook.
N = 200

In [None]:
# Attribute values: N draws from a normal distribution with mean 0 and std 9.
x = np.random.normal(loc=0, scale=9, size=N)
# Heteroskedastic noise: `scale` is the STANDARD DEVIATION of each draw, so the
# std grows quadratically with x and the variance is (1 + 0.5*x**2)**2.
noise = np.random.normal(loc=0, scale=1 + 0.5 * x**2, size=N)

Ciljna promenljiva: $y=3-2x$ uz dodatak suma

In [None]:
# Target: the true linear relation y = 3 - 2x corrupted by the noise above.
y = 3 - 2*x + noise

In [None]:
# Scatter plot of the sample; since the noise scale grows with x**2, the
# spread of y is expected to widen away from x = 0.
plt.scatter(x, y)

In [None]:
# Ordinary least squares baseline. scikit-learn expects a 2-D feature matrix,
# so the 1-D attribute vector is turned into a single column.
model = linear_model.LinearRegression()
model.fit(x[:, np.newaxis], y)

In [None]:
# Fitted intercept and slope of the baseline model (one attribute, so the
# slope is the only entry of coef_).
b0, b1 = model.intercept_, model.coef_[0]

In [None]:
# Display the estimates; the data were generated with intercept 3 and slope -2.
b0, b1

In [None]:
# Baseline model predictions on the training attribute values.
y_model = model.predict(x.reshape(-1, 1))

In [None]:
# Compare the noisy sample, the true line y = 3 - 2x and the baseline fit.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, 'o', color='orange')
ax.plot(x, 3 - 2*x, color='green', label='Stvarni podaci')
# y_model is the fitted line b0 + b1*x evaluated at the sample points.
ax.plot(x, y_model, color='blue', label='Predikcija')
ax.legend(loc='best')
plt.show()

### Reziduali

In [None]:
# Residuals: observed target minus the baseline model's prediction.
residuals = y-y_model

In [None]:
# Residuals against the attribute: a spread that changes with x indicates
# heteroskedasticity.
fig, ax = plt.subplots()
ax.scatter(x, residuals)
ax.set_title('Grafik reziduala')
plt.show()

### Linearni model sa tezinama

**Instancama** kod kojih je varijansa mala dajemo vece tezine kako bi model bio sigurniji, a instancama kod kojih je varijansa velika dajemo manje tezine.

Umesto obicne sume kvadrata reziduala minimizuje se ponderisana suma kvadrata reziduala: $$L(\beta, \omega) = \sum_{i=1}^{N}\omega_i(y_i-x_i\beta)^2$$

In [None]:
# Second linear regression estimator; it is fitted with per-sample weights below.
improved_model = linear_model.LinearRegression()

Tezina svake instance je reciprocna vrednost varijanse u toj tacki: $\omega_i = \frac{1}{\sigma_i^2}$.

In [None]:
# Inverse-variance weights w_i = 1 / sigma_i^2.
# The noise was drawn with np.random.normal(..., scale=1 + 0.5*x**2), and
# `scale` is the standard deviation, so sigma_i = 1 + 0.5*x_i**2 and the
# variance is its square. Using 1/sigma_i instead would under-penalise the
# high-variance points.
weights = 1 / (1 + 0.5 * x**2) ** 2

In [None]:
# Problem: the weights have to be determined.

In [None]:
# Weighted least squares: the per-sample weights are passed through the
# `sample_weight` keyword argument of fit().
improved_model.fit(x[:, np.newaxis], y, sample_weight=weights)

In [None]:
# Intercept and slope estimated by the weighted model.
b0_improved, b1_improved = improved_model.intercept_, improved_model.coef_[0]

In [None]:
# Display the weighted estimates; the data were generated with intercept 3 and slope -2.
b0_improved, b1_improved

In [None]:
# Weighted model predictions on the training attribute values.
y_improved_model = improved_model.predict(x.reshape(-1, 1))

In [None]:
# Overlay the sample, the true line y = 3 - 2x, the baseline fit and the
# weighted fit on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(x, y, 'o', color='orange')
ax.plot(x, 3 - 2*x, color='green', label='Stvarni podaci')
ax.plot(x, y_model, color='blue', label='Model')
ax.plot(x, y_improved_model, color='red', label='Improved model')

ax.legend(loc='best')
plt.show()

### Polinomi i interakcije

Ciljna promenljiva: $y$ koja je oblika $$y = \beta_0 + \beta_1X + \beta_2X^2 + \ldots + \beta_mX^m$$ 

In [None]:
# New synthetic dataset (overwrites x and y from the previous section):
# N evenly spaced points on [-3, 3] with target y = 2 + x^2 plus
# standard normal noise.
x = np.linspace(-3, 3, num=N)
y = x**2 + 2 + np.random.normal(loc=0, scale=1, size=N)
plt.scatter(x, y)
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

Pravimo zavisnost do drugog stepena: $$y = \beta_0 + \beta_1x + \beta_2x^2$$

In [None]:
# Degree-2 polynomial feature generator. include_bias=False omits the constant
# column because LinearRegression fits its own intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Display the transformer's configuration.
poly

In [None]:
# Fit the transformer on x reshaped to a single-column 2-D array.
poly.fit(x.reshape(-1, 1))

In [None]:
# Expand x into the polynomial feature columns [x, x^2].
poly_features = poly.transform(x.reshape(-1, 1))

In [None]:
# Display the generated feature matrix.
poly_features

In [None]:
# Linear regression that is fitted on the polynomial features below.
poly_model = linear_model.LinearRegression()

In [None]:
# The model stays linear in its coefficients; the non-linearity comes from the features.
poly_model.fit(poly_features, y)

In [None]:
# Polynomial model predictions on the training points.
y_predicted = poly_model.predict(poly_features)

In [None]:
# Sample (default colour) against the fitted quadratic curve (red).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y)
ax.scatter(x, y_predicted, c="red")
plt.show()

In [None]:
# Problem: choose the polynomial degree of the relationship (beware of overfitting).

In [None]:
# Two attributes: the same evenly spaced grid on [-3, 3], each scaled
# element-wise by its own multiplicative noise drawn from N(1, 0.1).
x1 = np.linspace(-3, 3, num=N) * np.random.normal(loc=1, scale=0.1, size=N)
x2 = np.linspace(-3, 3, num=N) * np.random.normal(loc=1, scale=0.1, size=N)

In [None]:
# Build interaction features between the attributes, giving the new set [x1, x2, x1*x2].

In [None]:
# Feature matrix with one row per sample and the columns (x1, x2).
X = np.column_stack((x1, x2))

In [None]:
# Sanity check: one row per sample, two attribute columns.
X.shape

In [None]:
# Rebind `poly` to an interaction-only transformer: it keeps the original
# attributes and adds their product, without pure powers or a constant column.
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
poly.fit_transform(X)

In [None]:
# Three columns are expected: x1, x2 and the interaction x1*x2.
poly.fit_transform(X).shape

### Napomena: sve je analogno za slucaj linearne regresije sa proizvoljnim brojem atributa!