<a href="https://colab.research.google.com/github/UEPP40/PUM/blob/kaperob/linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
salary_df = pd.read_csv(r'/content/Salary Data (2).csv')

In [3]:
salary_df.shape

(375, 6)

In [4]:
for i in salary_df.columns:
  print(salary_df[i].value_counts())
  print('*'*50)

33.0    24
29.0    23
35.0    22
31.0    21
44.0    21
36.0    20
34.0    17
45.0    17
38.0    15
30.0    15
47.0    15
40.0    13
28.0    13
43.0    12
37.0    12
41.0    12
32.0    12
39.0    12
42.0    11
46.0    10
27.0     9
48.0     9
50.0     8
49.0     8
26.0     7
51.0     5
25.0     4
52.0     3
24.0     1
23.0     1
53.0     1
Name: Age, dtype: int64
**************************************************
Male      194
Female    179
Name: Gender, dtype: int64
**************************************************
Bachelor's    224
Master's       98
PhD            51
Name: Education Level, dtype: int64
**************************************************
Director of Marketing              12
Director of Operations             11
Senior Business Analyst            10
Senior Marketing Analyst            9
Senior Marketing Manager            9
                                   ..
Business Development Manager        1
Customer Service Representative     1
IT Manager                       

In [5]:
salary_df.isna().any()

Age                    True
Gender                 True
Education Level        True
Job Title              True
Years of Experience    True
Salary                 True
dtype: bool

In [6]:
salary_df.fillna(method='pad', inplace=True)

In [7]:
salary_df[['Age','Years of Experience']] = salary_df[['Age','Years of Experience']].apply(lambda x: x.astype(int))

In [8]:
age_mean = salary_df[(salary_df['Age'] <= 100) & (salary_df['Age'] >= 1)]['Age'].mean()

In [9]:
salary_df['Age'] = salary_df['Age'].fillna(age_mean)
salary_df.loc[salary_df['Age'] >= 100, 'Age'] = age_mean
salary_df.loc[salary_df['Age'] < 1, 'Age'] = age_mean

In [10]:
le = LabelEncoder()

In [11]:
columns_to_code = ['Gender','Education Level','Job Title']

# dzięki apply możemy za jednym wywołaniem kodu wykonać daną operację na każdym elemencie z listy
salary_df[columns_to_code] = salary_df[columns_to_code].apply(LabelEncoder().fit_transform)

In [12]:
x_train, x_test, y_train, y_test = train_test_split( salary_df[['Age','Gender','Education Level','Job Title','Years of Experience']], salary_df['Salary'], test_size=0.33, random_state=42)

In [13]:
reg = LinearRegression().fit(x_train, y_train)
y_przewidywania =reg.predict(x_test)

In [14]:
mean_absolute_error(y_test,y_przewidywania)

10655.892049878197

In [15]:
median_absolute_error(y_test,y_przewidywania)

7107.919661770196

In [16]:
huber = HuberRegressor().fit(x_train,y_train)
huber.score(x_train,y_train)
huber.coef_
huber.intercept_
y_przwidywana_huber = huber.predict(x_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [17]:

y_prawdziwe = y_test
print ("regresja liniowa:",  mean_absolute_error(y_prawdziwe,y_przewidywania))
print("regresja Hubera",mean_absolute_error(y_prawdziwe,y_przwidywana_huber))


regresja liniowa: 10655.892049878197
regresja Hubera 10281.018356691475


In [18]:

y_prawdziwe = y_test
print ("regresja liniowa:",  median_absolute_error(y_prawdziwe,y_przewidywania))
print("regresja Hubera",median_absolute_error(y_prawdziwe,y_przwidywana_huber))

regresja liniowa: 7107.919661770196
regresja Hubera 6598.790656351895


In [19]:
y_prawdziwe = y_test
print("regresja liniowa", r2_score(y_prawdziwe,y_przewidywania))
print("regresja Hubra",r2_score(y_prawdziwe,y_przwidywana_huber))

regresja liniowa 0.8924678720454159
regresja Hubra 0.8982653114109581


In [21]:
# calculating VIF for each feature
vif_data = pd.DataFrame()
vif_data["feature"] = salary_df.columns
vif_data["VIF"] = [variance_inflation_factor(salary_df.values, i) for i in range(len(salary_df.columns))]
vif_data

Unnamed: 0,feature,VIF
0,Age,21.337334
1,Gender,2.184715
2,Education Level,2.962684
3,Job Title,4.894133
4,Years of Experience,23.587218
5,Salary,52.033087
