In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Salary_Data.csv")

In [3]:
df.head()

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [4]:
df.isnull().sum()

YearsExperience    0
Age                0
Salary             0
dtype: int64

## Spliting data

In [5]:
x = df.drop(['Salary'],axis=1)
y = df['Salary']

In [6]:
x.head()

Unnamed: 0,YearsExperience,Age
0,1.1,21.0
1,1.3,21.5
2,1.5,21.7
3,2.0,22.0
4,2.2,22.2


In [7]:
y.head()

0    39343
1    46205
2    37731
3    43525
4    39891
Name: Salary, dtype: int64

## Applying OLS and checking multi collinearity using OLS (Ordinary Least squares)

In [8]:
import statsmodels.api as sm

In [9]:
x = sm.add_constant(x)
# x['constant']  = 1.0

In [10]:
x.head()

Unnamed: 0,const,YearsExperience,Age
0,1.0,1.1,21.0
1,1.0,1.3,21.5
2,1.0,1.5,21.7
3,1.0,2.0,22.0
4,1.0,2.2,22.2


In [11]:
model = sm.OLS(y,x).fit()

In [12]:
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Thu, 16 Feb 2023",Prob (F-statistic):,1.35e-19
Time:,00:40:54,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


**r2 square value is 0.960 that means model has fitted very well<br/>**
**P value of Age is more than 0.05 and year of experience also has p value (i.e not 0) , years of expirence and age has some kind of colleration(if it does has any correlation then p value will be 0)<br/>**
**std err of age and years of expirence is huge indicating multicollinearity<br/>**

In [13]:
df.corr()

Unnamed: 0,YearsExperience,Age,Salary
YearsExperience,1.0,0.987258,0.978242
Age,0.987258,1.0,0.97453
Salary,0.978242,0.97453,1.0


check independent variables i.e years of expirence and age , have a high correlation so remove the feature which has high p value ,This will not have much effect on the accuracy of model as the correlation is about 98%.

In [14]:
x = df.drop(['Salary','Age'],axis=1)
y = df['Salary']

In [15]:
x.head()

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2


In [16]:
y.head()

0    39343
1    46205
2    37731
3    43525
4    39891
Name: Salary, dtype: int64

## Apply OLS

In [17]:
import statsmodels.api as sm

In [18]:
x =sm.add_constant(x)

In [19]:
model = sm.OLS(y,x).fit()

In [20]:
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.955
Method:,Least Squares,F-statistic:,622.5
Date:,"Thu, 16 Feb 2023",Prob (F-statistic):,1.14e-20
Time:,00:40:54,Log-Likelihood:,-301.44
No. Observations:,30,AIC:,606.9
Df Residuals:,28,BIC:,609.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.579e+04,2273.053,11.347,0.000,2.11e+04,3.04e+04
YearsExperience,9449.9623,378.755,24.950,0.000,8674.119,1.02e+04

0,1,2,3
Omnibus:,2.14,Durbin-Watson:,1.648
Prob(Omnibus):,0.343,Jarque-Bera (JB):,1.569
Skew:,0.363,Prob(JB):,0.456
Kurtosis:,2.147,Cond. No.,13.2


**now everything is fine , no multicollinearity problem**

## checking multi collinearity using Variance inflation factor (VIF)

**Vif is metrics that says how other variable are explaining your one variable,If vif value is greater than 5 then the other variables are explaining the target independent variable very well , so we remove because all other variable are explaining this variable<br/><br/>**
**salary is target variable so remove and check on all other independent variable<br/>**
**Any Variable whose vif value is greater than 5 we remove it<br/><br/>**
**example x1 is target feature and x1,x2,x3,x4 are independent variable then it says variance explained by x1 is captured very well by other variable hence we can remove it we do not need it <br/>**

In [21]:
for i in df.columns:
    if i == 'Salary':
        continue
    x= df.drop([i],axis=1)
    y= df[i]
    model = sm.OLS(y,x).fit()
    rsq = model.rsquared
    vif = round(1 / (1-rsq),2)
    print("R Square when model assume {} as target feature is {}".format(i,rsq))
    print("Variance Inflation factor when model assume {} as target feature is {}".format(i,vif))
    print("\n\n")
    

R Square when model assume YearsExperience as target feature is 0.985419108928445
Variance Inflation factor when model assume YearsExperience as target feature is 68.58



R Square when model assume Age as target feature is 0.9851791736560138
Variance Inflation factor when model assume Age as target feature is 67.47





**hence vif is high in both age and years of experience so remove any one of them.

## Or you can 'variance_inflation_factor' to check vif

In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [23]:
def calculate_vif(df):
    vif = pd.DataFrame()
    vif['feature'] = df.columns
    vif['VIF Values'] = [variance_inflation_factor(df.values,i) for i in range(df.shape[1]) ]
    
    return vif

In [24]:
features = df.iloc[:,:-1]
print(features.head())
calculate_vif(features)

   YearsExperience   Age
0              1.1  21.0
1              1.3  21.5
2              1.5  21.7
3              2.0  22.0
4              2.2  22.2


Unnamed: 0,feature,VIF Values
0,YearsExperience,11.24047
1,Age,11.24047


**Both features have high vif values so remove any one of them<br/><br/>**
    **which one to remove ?<br/> this can answerd by correlation the variable which has high correlation with target feature keep that and remove the other one**

In [25]:
df.corr()

Unnamed: 0,YearsExperience,Age,Salary
YearsExperience,1.0,0.987258,0.978242
Age,0.987258,1.0,0.97453
Salary,0.978242,0.97453,1.0


**Comparing correlation of 'Age' and 'yearsofexpirience' with target variable , 'yearsofexpirience' has corelation with target so keep that and remove 'age'**

In [26]:
x= df.drop(['Salary','Age'],axis=1)
y= df['Salary']
model = sm.OLS(y,x).fit()
rsq = model.rsquared
print("R Square when model assume {} as target feature and {} as independent feature is {}".format(i,'YearsExperience',rsq))

R Square when model assume Salary as target feature and YearsExperience as independent feature is 0.9730791806609133


## checking multi collinearity using Tolerence

**Can Also check multi collinearity with tolerence , if tolerence value must be greater than 0.2 if not then there is multi collinearity exists**

In [27]:
for i in df.columns:
    if i == 'Salary':
        continue
    x= df.drop([i],axis=1)
    y= df[i]
    model = sm.OLS(y,x).fit()
    rsq = model.rsquared
    vif = round((1-rsq),2)
    print("R Square when model assume {} as target feature is {}".format(i,rsq))
    print("Variance Inflation factor when model assume {} as target feature is {}".format(i,vif))
    print("\n\n")

R Square when model assume YearsExperience as target feature is 0.985419108928445
Variance Inflation factor when model assume YearsExperience as target feature is 0.01



R Square when model assume Age as target feature is 0.9851791736560138
Variance Inflation factor when model assume Age as target feature is 0.01





**Multi collinearity exists , can be handled using correlation like how we done for above**