In [None]:
import warnings
# Silence every warning for the rest of the notebook to keep cell output clean.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below, so API changes will go unnoticed until they become errors.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics

In [None]:
# Load the salary data from sheet 'Sheet2' of the workbook and preview the first rows.
df = pd.read_excel('salary.xlsx', sheet_name='Sheet2')
df.head()

In [None]:
# Column names, non-null counts and dtypes in a single summary.
df.info()

In [None]:
# (rows, columns)
df.shape

In [None]:
# dtype of each column.
df.dtypes

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# How many rows there are for each gender.
df['Gender'].value_counts()

In [None]:
# Mean and median of every numeric column, per gender.
# The aggregation is restricted to numeric columns: in pandas >= 2.0,
# 'mean'/'median' raise a TypeError when applied to text columns, whereas
# older versions silently dropped them (so the displayed table is unchanged).
numeric_cols = df.select_dtypes(include='number').columns
df.groupby('Gender')[numeric_cols].agg(['mean', 'median'])

In [None]:
sns.boxplot(y='Age', x='Gender', data=df)

In [None]:
sns.boxplot(y='YearsExperience', x='Gender', data=df)

In [None]:
sns.pairplot(df, x_vars=['Age', 'YearsExperience'], y_vars=['Salary'], hue='Gender')

In [None]:
# Pairwise Pearson correlation of the numeric columns only.
# DataFrame.corr() raises a ValueError on text columns in pandas >= 2.0
# (numeric_only now defaults to False), so select the numeric columns first;
# this form works on both old and new pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Text columns are excluded first because DataFrame.corr() raises a
# ValueError on them in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, lw=1)

In [None]:
sns.boxplot(y='Salary', x='Classification', data=df)

In [None]:
sns.boxplot(y='Salary', x='Job', data=df)

### Dummy variables

In [None]:
X = df[['Age', 'YearsExperience', 'Gender', 'Classification', 'Job']]

In [None]:
X.shape

In [None]:
y = df['Salary']

In [None]:
# One-hot encode the categorical predictors. drop_first=True drops the first
# level of each categorical column, which becomes the reference category
# (this avoids perfectly collinear dummy columns).
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 produces bool columns by
# default, and a frame mixing numeric and bool columns is converted to an
# object array, which statsmodels' OLS rejects later in the notebook.
X = pd.get_dummies(data=X, drop_first=True, dtype=int)
X.head()

In [None]:
# Levels of Classification and how many rows fall in each one.
df['Classification'].value_counts()

In [None]:
# Shape of the predictor matrix after dummy encoding.
X.shape

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=101
)
# Shapes of the four resulting pieces, one per line.
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

In [None]:
# Preview only the first rows of the training predictors instead of dumping
# the whole frame into the cell output.
X_train.head()

In [None]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

In [None]:
model.coef_, model.intercept_

In [None]:
for feature, coef in zip(X.columns, model.coef_.ravel()):
    print (feature, coef)

### Interpretacija:
Koeficijent koji je vezan za *Gender* je oznacen kao "Gender_Male", jer smo napravili dummy promenljivu koja je 1 ukoliko je Gender "Male", a u suprotnom 0. Za vrednost "Female" se ne pravi posebna promenljiva (`drop_first=True`), jer je ona jednoznacno odredjena vrednoscu "Gender_Male"; "Female" je referentna kategorija i njen efekat je sadrzan u slobodnom clanu. <br>
**Interpretacija ovog koeficijenta je:** <br> Ako imamo dve instance za koje su vrednosti svih ostalih prediktora iste, pri cemu je prva zenskog, a druga muskog pola, tada ce predvidjena vrednost ciljne promenljive Salary za drugu instancu biti za ~655 manja (koeficijent uz "Gender_Male" je -655). U sustini, ovime pravimo dve paralelne regresione prave, jednu za "Female" i jednu za "Male", koje se razlikuju samo u slobodnom clanu.

In [None]:
# Predict salaries for the held-out test rows with the fitted model.
predictions = model.predict(X_test)
predictions

In [None]:
# Actual test salaries (x) against predicted salaries (y), with a fitted
# regression line drawn by seaborn.
sns.regplot(x=y_test, y=predictions)

In [None]:
# Actual vs. predicted salary, one labelled series per gender.
# Plotting all points in a single scatter call with label="Male" tagged every
# point as "Male" in the legend; drawing each group separately gives each
# colour its own, correct legend entry.
male_mask = X_test['Gender_Male'].astype(bool).to_numpy()
actual = y_test.to_numpy()
fig, ax = plt.subplots()
# Rows with Gender_Male == 0 belong to the dropped reference level, which the
# notebook's interpretation section identifies as "Female".
for mask, gender in ((male_mask, 'Male'), (~male_mask, 'Female')):
    ax.scatter(actual[mask], predictions[mask], label=gender)
ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary')
ax.legend(loc="upper left")

In [None]:
# Actual test salaries (x) against predicted salaries (y) with a fitted line.
# NOTE(review): this cell repeats the regplot drawn right after the
# predictions were computed — candidate for removal.
sns.regplot(x=y_test, y=predictions)

### Statisticke mere znacajnosti

[OLS](https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html)

In [None]:
import statsmodels.api as sm

In [None]:
# statsmodels' OLS does not add an intercept on its own, so a constant column
# is prepended to the training predictors before fitting.
design_matrix = sm.add_constant(X_train)
ls = sm.OLS(y_train, design_matrix).fit()
# Full regression report: coefficients, t-tests, F-test, (adjusted) R-squared.
print(ls.summary())

### P-value


*Definition:* A p-value is a statistical measurement used to validate a hypothesis against observed data. A p-value measures the probability of obtaining results at least as extreme as the observed results, assuming that the null hypothesis is true. The lower the p-value, the greater the statistical significance of the observed difference. If the p-value is below your threshold of significance (typically p < 0.05), then you can reject the null hypothesis, but this does not necessarily mean that your alternative hypothesis is true.

<img src='p-value.jpg'>

### T-test
Testiramo hipotezu $$H_0: b_1 = 0$$ protiv alternative $$H_1: b_1 \neq 0$$ tj. testiramo da li postoji nekakva linearna veza izmedju x i y.

### F-test
Testiramo hipotezu $$H_0: b_1 = b_2 = \dots = b_n = 0$$ protiv $$H_1: \text{bar jedan od njih} \neq 0$$ tj. testiramo ukupnu znacajnost svih prediktora.

### Adjusted R-squared
We use **adjusted R-squared** to compare the goodness-of-fit for regression models that contain different numbers of independent variables.

Let’s say you are comparing a model with five independent variables to a model with one variable and the five variable model has a higher R-squared. Is the model with five variables actually a better model, or does it just have more variables? To determine this, just compare the adjusted R-squared values!

### Zakljucak iz primera:
* Koeficijent determinacije nam kaze da smo modelom objasnili 97% varijanse ciljne promenljive (na trening skupu, na kome je OLS model i fitovan)
* p-vrednost F-testa (mera `Prob (F-statistic)`) je jako mala pa nam govori da odbacimo nultu hipotezu i da postoje neki znacajni prediktori u nasim podacima
* posmatrajuci p-vrednost T-testa (`P>|t|`) za svaki prediktor vidimo da je *YearsExperience* statisticki ubedljivo najznacajniji
* *Gender* nam nije statisticki znacajan prediktor; kada bi bio, ili kada bismo ga ipak zadrzali u modelu, imali bismo dva paralelna linearna modela, po jedan za svaki pol, koje znamo da interpretiramo