In [2]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats


In [3]:
print("""Answer to question #1:""")
data = {
    'Dependent': [35, 50, 65, 70, 80],
    'education': [12, 16, 18, 20, 21],
    'experience': [5, 10, 12, 15, 18],
    'Age': [25, 30, 32, 35, 40]
}

df = pd.DataFrame(data)
print(df)

Y=df["Dependent"]
X=df[["education", "experience", "Age"]]
X=sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print(model.summary())
print("""-------------------------------------------------------------------------------------------------""")
print("""-------------------------------------------------------------------------------------------------""")
print(f"\nInterpretation:")
print("""In the abstract sense, the coefficient of x1 that is, the coefficient on education indicates that for every additional unit of education, the dependent variable y increases by approximately 15.8333 on avg""")

Answer to question #1:
   Dependent  education  experience  Age
0         35         12           5   25
1         50         16          10   30
2         65         18          12   32
3         70         20          15   35
4         80         21          18   40
                            OLS Regression Results                            
Dep. Variable:              Dependent   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                     99.67
Date:                Thu, 06 Nov 2025   Prob (F-statistic):             0.0735
Time:                        18:52:14   Log-Likelihood:                -6.6389
No. Observations:                   5   AIC:                             21.28
Df Residuals:                       1   BIC:                             19.72
Df Model:                           3                                         
Covariance Type:    

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [4]:
print("""Answer to question #2:""")
data = {
    'Dependent': [35, 50, 65, 70, 80],
    'education': [12, 16, 18, 20, 21],
    'experience': [5, 10, 12, 15, 18],
}

df = pd.DataFrame(data)
print(df)

Y=df["Dependent"]
X=df[["education", "experience"]]
X=sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print(model.summary())
print("""-------------------------------------------------------------------------------------------------""")
print("""-------------------------------------------------------------------------------------------------""")
print(f"\nInterpretation:")
print("""When age is removed we get a coefficient of 2.976 on education when it was previously 15.83, this leads to believe that age and education were are positively correlated, and age was previously accounting for some of the variation in the dependent variable that education now partially absorbs. This illustrates the omitted variable bias, by showing that when you leave relevant vraiables such as age it can change the effects of other variables in your model such as with age here.""")

Answer to question #2:
   Dependent  education  experience
0         35         12           5
1         50         16          10
2         65         18          12
3         70         20          15
4         80         21          18
                            OLS Regression Results                            
Dep. Variable:              Dependent   R-squared:                       0.980
Model:                            OLS   Adj. R-squared:                  0.960
Method:                 Least Squares   F-statistic:                     49.34
Date:                Thu, 06 Nov 2025   Prob (F-statistic):             0.0199
Time:                        18:52:14   Log-Likelihood:                -11.101
No. Observations:                   5   AIC:                             28.20
Df Residuals:                       2   BIC:                             27.03
Df Model:                           2                                         
Covariance Type:            nonrobust             

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [5]:
print("""Answer to question #3:""")
data = {
    'Dependent': [35, 50, 65, 70, 80],
    'education': [12, 16, 18, 20, 21],
    'experience': [5, 10, 12, 15, 18],
    'Age': [25, 30, 32, 35, 40]
}

df = pd.DataFrame(data)
print(df)
Y=df["Dependent"]
X=df[["education", "experience", "Age"]]
X=sm.add_constant(X)
model = sm.OLS(Y, X).fit()
print("""-------------------------------------------------------------------------------------------------""")
print("""-------------------------------------------------------------------------------------------------""")

corr_matrix = df[['education', 'experience', 'Age']].corr()
print(corr_matrix)

print("""-------------------------------------------------------------------------------------------------""")
print("""-------------------------------------------------------------------------------------------------""")
X = sm.add_constant(df[['education', 'experience', 'Age']])
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i)
              for i in range(X.shape[1])]
print(vif)
print("""-------------------------------------------------------------------------------------------------""")
print("""-------------------------------------------------------------------------------------------------""")
print(f"\nInterpretation:")
print("""The correlation matrix and VIF values reveal severe multicollinearity among Education, Experience, and Age (all VIFs far exceed 10). This means these variables contain highly overlapping information.
As a result, individual coefficient estimates are imprecise their standard errors are inflated, and their magnitudes can vary widely when one regressor is removed. This explains why the coefficient on Education changed drastically when Age was excluded. This can also be seen from the correlation matrix where each variable closely follows the other.""")



Answer to question #3:
   Dependent  education  experience  Age
0         35         12           5   25
1         50         16          10   30
2         65         18          12   32
3         70         20          15   35
4         80         21          18   40
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
            education  experience       Age
education    1.000000    0.988212  0.964229
experience   0.988212    1.000000  0.993065
Age          0.964229    0.993065  1.000000
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
     Variable           VIF
0       const  26125.000000
1   education    452.266667
2  experience   2298.916667
3         Age    766.850000
-----------------------

In [6]:
print("""Answer to question #4:""")
t_value = 2.5 / 0.8
p = 2 * (1-stats.t.cdf(abs(t_value), df=1))
alpha = 0.05
print(f"t_value = {t_value:.3f}, p-value = {p_value:.5f}")

if p < alpha:
    print("Reject H0: Education significantly affects y")
else:
    print("Fail to Reject H0: We dont have enough information to say that Education significantly affects y")

Answer to question #4:


NameError: name 'p_value' is not defined

In [None]:
print("""Answer to question #5:""")
b = 2.5
se = 0.8
df = 1
alpha = 0.05
t_crit = stats.t.ppf(1 - alpha/2, df)
lower = b - t_crit * se
upper = b + t_crit * se


print(f"95% Confidence Interval: ({lower:.3f}, {upper:.3f})")
print(f"\nInterpretation:")
print(f"Holding Experience and Age constant, we are 95% confident the true effect of one more unit of Education lies between: ({lower:.3f}, {upper:.3f}) Because 0 is inside the interval, Education is not statistically significant at 5%.")

Answer to question #5:
95% Confidence Interval: (-7.665, 12.665)

Interpretation:
Holding Experience and Age constant, we are 95% confident the true effect of one more unit of Education lies between: (-7.665, 12.665) Because 0 is inside the interval, Education is not statistically significant at 5%..
