In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import summary_table

In [2]:
# Load the wage data set (expects 'wage.csv' in the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='wage.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,year,age,sex,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,231655,2006,18,1. Male,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,86582,2004,24,1. Male,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,161300,2003,45,1. Male,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,155159,2003,43,1. Male,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,11443,2005,50,1. Male,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154


In [3]:
# Print a concise summary of the frame: column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3000 non-null   int64  
 1   year        3000 non-null   int64  
 2   age         3000 non-null   int64  
 3   sex         3000 non-null   object 
 4   maritl      3000 non-null   object 
 5   race        3000 non-null   object 
 6   education   3000 non-null   object 
 7   region      3000 non-null   object 
 8   jobclass    3000 non-null   object 
 9   health      3000 non-null   object 
 10  health_ins  3000 non-null   object 
 11  logwage     3000 non-null   float64
 12  wage        3000 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 304.8+ KB


In [4]:
# Count the missing entries in each column; a zero means the column is complete.
df.isna().sum()

Unnamed: 0    0
year          0
age           0
sex           0
maritl        0
race          0
education     0
region        0
jobclass      0
health        0
health_ins    0
logwage       0
wage          0
dtype: int64

In [5]:
# Boolean mask over the columns: True where the column dtype is `object`.
object_col = (df.dtypes == object)
# Labels of the object-typed columns (the mask selects its own True entries).
object_col[object_col].index

Index(['sex', 'maritl', 'race', 'education', 'region', 'jobclass', 'health',
       'health_ins'],
      dtype='object')

In [6]:
# Preview only the object-typed columns, selected with the boolean column mask.
df.loc[:, object_col].head()

Unnamed: 0,sex,maritl,race,education,region,jobclass,health,health_ins
0,1. Male,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No
1,1. Male,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No
2,1. Male,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes
3,1. Male,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes
4,1. Male,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes


In [7]:
# Summarise the object-typed columns: count, number of unique levels,
# most frequent level and its frequency.
#
# Observations from the summary:
#   * jobclass, health and health_ins each have 2 unique levels.
#   * sex and region each have a single level, so they cannot explain any
#     variation in wage on this data set.
#   * maritl and education have 5 levels each; race has 4.
# These columns have to be converted to numeric variables (e.g. dummy
# variables) if we want to use them as predictors later on.
df.loc[:, object_col].describe()

Unnamed: 0,sex,maritl,race,education,region,jobclass,health,health_ins
count,3000,3000,3000,3000,3000,3000,3000,3000
unique,1,5,4,5,1,2,2,2
top,1. Male,2. Married,1. White,2. HS Grad,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes
freq,3000,2074,2480,971,3000,1544,2142,2083


### Polynomial Regression and Step Functions

In [11]:
# Build the formula for a 4th-order polynomial in age:
#   wage ~ 1 + age + age^2 + age^3 + age^4
# Raw powers of age are used, which are strongly collinear; this is why the
# fitted summary reports a very large condition number.
poly_order = 4
poly = 'wage ~ 1 +' + '+'.join('I(age**{})'.format(i) for i in range(1, poly_order + 1))

print(poly)

# Define the OLS model on just the two columns the formula refers to
model_poly = smf.ols(poly, data=df[['age', 'wage']])

# Fit the model
result_poly = model_poly.fit()

# Summary of the fitted model (last expression, so it is rendered as the cell output)
result_poly.summary()

wage ~ 1 +I(age**1)+I(age**2)+I(age**3)+I(age**4)


0,1,2,3
Dep. Variable:,wage,R-squared:,0.086
Model:,OLS,Adj. R-squared:,0.085
Method:,Least Squares,F-statistic:,70.69
Date:,"Wed, 14 Oct 2020",Prob (F-statistic):,2.77e-57
Time:,06:22:44,Log-Likelihood:,-15315.0
No. Observations:,3000,AIC:,30640.0
Df Residuals:,2995,BIC:,30670.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-184.1542,60.040,-3.067,0.002,-301.879,-66.430
I(age ** 1),21.2455,5.887,3.609,0.000,9.703,32.788
I(age ** 2),-0.5639,0.206,-2.736,0.006,-0.968,-0.160
I(age ** 3),0.0068,0.003,2.221,0.026,0.001,0.013
I(age ** 4),-3.204e-05,1.64e-05,-1.952,0.051,-6.42e-05,1.45e-07

0,1,2,3
Omnibus:,1097.594,Durbin-Watson:,1.96
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4965.521
Skew:,1.722,Prob(JB):,0.0
Kurtosis:,8.279,Cond. No.,567000000.0


In [13]:
# summary_table gives a printable table, an array with one row per observation,
# and the column names describing that array.
table, data, col_names = summary_table(result_poly, alpha=0.05)

# Print the column names so the positional indices used below can be checked
print(col_names)

# Array of in-sample fitted values
fitted_val = result_poly.fittedvalues.values

# Column 2 ('Predicted\nValue'): predicted wage for every observation
predicted_1 = data[:, 2]

# Alternative: predict on an evenly spaced age grid through the fitted results
# object (not the unfitted model), passing the new ages as a data frame:
#age_range = np.linspace(df['age'].min(), df['age'].max(), 1000)
#predicted_2 = result_poly.predict(pd.DataFrame({'age': age_range}))

# Column 4 ('Mean ci\n95% low'): lower bound of the 95% confidence interval of the mean
low_ci = data[:, 4]

# Column 5 ('Mean ci\n95% upp'): upper bound of the 95% confidence interval of the mean
high_ci = data[:, 5]

# Data frame of the predicted values and their 95% mean confidence band
df_r = pd.DataFrame({'Predicted': predicted_1, 'low_ci': low_ci, 'high_ci': high_ci})

['Obs', 'Dep Var\nPopulation', 'Predicted\nValue', 'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp', 'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual', 'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]


In [14]:
# Attach the age of each observation (aligned on the row index) and order the
# rows by age so the fitted curve can be drawn as a line from left to right.
# A new frame is bound to df_r rather than mutating the existing one in place.
df_r = df_r.assign(age=df['age']).sort_values(by='age')

In [15]:
%matplotlib notebook
# Draw on an explicit figure/axes pair so that re-running this cell starts a
# fresh figure instead of adding artists to whichever pyplot figure is active.
fig, ax = plt.subplots()

# Raw observations
ax.scatter(df['age'], df['wage'], alpha=0.5)

# Fitted curve with its 95% mean confidence band (df_r must be sorted by age)
ax.plot(df_r['age'], df_r['high_ci'], color='r', linestyle='--', label='95% CI')
ax.plot(df_r['age'], df_r['Predicted'], color='r', label='Prediction')
ax.plot(df_r['age'], df_r['low_ci'], color='r', linestyle='--')

ax.legend()
ax.set_title('Wage vs. age: 4th-order polynomial fit')
ax.set_xlabel('Age')
ax.set_ylabel('Wage')
fig.tight_layout()
fig.savefig('4th_order_poly_regression_wage_vs_age.png', dpi=100)

<IPython.core.display.Javascript object>

### ANOVA

In [18]:
"""In performing a polynomial regression we must decide on the degree of
the polynomial to use. One way to do this is by using hypothesis tests. We
now fit models ranging from linear to a degree-5 polynomial and seek to
determine the simplest model which is sufficient to explain the relationship"""
poly_result = []
for poly_order in [1,2,3,4,5]:
    poly_formula = 'wage ~ 1 +' + '+'.join(['I(age**{})'.format(i) for i in np.arange(1, poly_order + 1)])
    # Define the model and fit it
    poly_result.append(smf.ols(poly_formula, data=df[['age', 'wage']]).fit())

In [20]:
# Using ANOVA to compare models: the fitted polynomial models in poly_result
# (degree 1 to 5, in increasing order) are unpacked as positional arguments.
# NOTE(review): per the statsmodels docs each row of the returned table is
# compared with the model in the row above it - confirm for the installed version.
sm.stats.anova_lm(*poly_result)

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,2998.0,5022216.0,0.0,,,
1,2997.0,4793430.0,1.0,228786.010128,143.593107,2.36385e-32
2,2996.0,4777674.0,1.0,15755.693664,9.888756,0.001679202
3,2995.0,4771604.0,1.0,6070.152124,3.809813,0.0510462
4,2994.0,4770322.0,1.0,1282.563017,0.804976,0.369682


In [21]:
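# Adding the 4th-order term gives a p-value of about 0.051, which is borderline
# at the 5% level, while adding the 5th-order term gives a p-value of 0.37,
# which is clearly not significant. A 5th-order polynomial is therefore
# unnecessary, and we go with the 4th-order model.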

### Logistic Regression Model

In [50]:
"""Next we consider the task of predicting whether an individual earns more
than $250,000 per year."""

# Convert wage column into a binary column, wage > 250k is 1, else 0.
df['High_wage'] = df.wage.apply(lambda x: 1.0 if x > 250 else 0.0)

# Create 4th order polynomial formula
poly_order = 4
formula = 'High_wage ~ 1 +' + '+'.join(['I(age**{})'.format(i) for i in np.arange(1, poly_order + 1)])

# Create Logistic regression model
LR_model = smf.logit(formula, data=df)

# Fit the model
LR_result = LR_model.fit()

Optimization terminated successfully.
         Current function value: 0.116870
         Iterations 12
