In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

In [None]:
# Read the 1985 Current Population Survey wage data directly from GitHub
# into a DataFrame, then show the column / dtype / non-null overview.
wages = pd.read_csv(
    'https://raw.githubusercontent.com/danielbauer1979/MSDIA_PredictiveModelingAndMachineLearning/refs/heads/main/GB886_II_9_Wages_1985_Current_Population_Survey.csv'
)
wages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Yrs_Ed      534 non-null    int64  
 1   Sthrn_Rgn   534 non-null    int64  
 2   Sex         534 non-null    int64  
 3   Yrs_Exprnc  534 non-null    int64  
 4   Union       534 non-null    int64  
 5   Wage        534 non-null    float64
 6   Age         534 non-null    int64  
 7   Race        534 non-null    object 
 8   Occup       534 non-null    object 
 9   Sect        534 non-null    object 
 10  Marr        534 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 46.0+ KB


In [None]:
# Partition the column names by dtype:
#   int64 / float64 -> numerical, object -> categorical
numerical = wages.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical = wages.select_dtypes(include=['object']).columns.tolist()

# Sub-frames holding each group of columns
wages_numerical = wages[numerical]
wages_categorical = wages[categorical]

# Report which columns landed in which group
print("Numerical columns:", numerical)
print("Categorical Columns:", categorical)

Numerical columns: ['Yrs_Ed', 'Sthrn_Rgn', 'Sex', 'Yrs_Exprnc', 'Union', 'Wage', 'Age', 'Marr']
Categorical Columns: ['Race', 'Occup', 'Sect']


In [None]:
# One-hot encode the categorical columns; drop_first=True leaves out the
# first level of each variable so it serves as the reference category.
dummies = pd.get_dummies(data=wages_categorical, drop_first=True)
print(dummies)

     Race_O  Race_W  Occup_Mngmnt  Occup_Other  Occup_Prof  Occup_Sales  \
0     False   False         False         True       False        False   
1     False    True         False         True       False        False   
2     False    True         False         True       False        False   
3     False    True         False         True       False        False   
4     False    True         False         True       False        False   
..      ...     ...           ...          ...         ...          ...   
529   False    True         False        False        True        False   
530    True   False         False        False        True        False   
531    True   False         False        False        True        False   
532   False    True         False        False        True        False   
533   False    True         False        False        True        False   

     Occup_Service  Sect_Manf  Sect_Other  
0            False       True       False  
1          

In [None]:
# Place the numerical columns and the dummy columns side by side
# (column-wise concatenation, rows aligned on the shared index).
wages_cleaned = pd.concat(
    [wages_numerical, dummies],
    axis="columns",
)
print("wages_cleaned data:", wages_cleaned.head())

wages_cleaned data:    Yrs_Ed  Sthrn_Rgn  Sex  Yrs_Exprnc  Union  Wage  Age  Marr  Race_O  Race_W  \
0       8          0    1          21      0  5.10   35     1   False   False   
1       9          0    1          42      0  4.95   57     1   False    True   
2      12          0    0           1      0  6.67   19     0   False    True   
3      12          0    0           4      0  4.00   22     0   False    True   
4      12          0    0          17      0  7.50   35     1   False    True   

   Occup_Mngmnt  Occup_Other  Occup_Prof  Occup_Sales  Occup_Service  \
0         False         True       False        False          False   
1         False         True       False        False          False   
2         False         True       False        False          False   
3         False         True       False        False          False   
4         False         True       False        False          False   

   Sect_Manf  Sect_Other  
0       True       False  
1     

In [None]:
# Outcome: the Wage column. Features: every remaining column.
y = wages_cleaned.loc[:, 'Wage']
x = wages_cleaned.drop(columns='Wage')

In [None]:
# Prepend an intercept column to the feature matrix
X = sm.add_constant(x)

# Ordinary least squares of Wage on the features; the design matrix is
# cast to float so the boolean dummy columns are treated as 0/1.
model_sm = sm.OLS(endog=y, exog=X.astype(float)).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                   Wage   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     15.66
Date:                Sun, 12 Oct 2025   Prob (F-statistic):           3.02e-35
Time:                        18:58:20   Log-Likelihood:                -1525.8
No. Observations:                 534   AIC:                             3086.
Df Residuals:                     517   BIC:                             3158.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.2814      6.741     -0.042

In [None]:
# In-sample evaluation of the fitted OLS model: R-squared, RMSE, MAE, MAPE.
#
# Predict on the same float-cast design matrix the model was fitted on, so
# the residual arithmetic below runs on plain floats rather than on a mix of
# integer and boolean columns.
y_pred = model_sm.predict(X.astype(float))
residuals = y - y_pred

r_squared = model_sm.rsquared
print(f"R-squared:{r_squared:.4f}")

# The error metrics are computed directly with numpy, so this cell needs
# nothing beyond the imports at the top of the notebook.
# RMSE: square root of the mean squared residual.
rmse = np.sqrt(np.mean(residuals ** 2))
print(f"RMSE:${rmse:.2f}")

# MAE: mean of the absolute residuals.
mae = np.mean(np.abs(residuals))
print(f"MAE:${mae:.2f}")

# MAPE: mean absolute residual relative to the observed value, in percent.
# NOTE(review): assumes y contains no zeros (division by y) - confirm.
mape = np.mean(np.abs(residuals / y)) * 100
print(f"MAPE:{mape:.2f}%")

R-squared:0.3265
RMSE:$4.21
MAE:$2.99
MAPE:40.69%


Model Discussion:
The regression model achieved an R-squared of 0.3265, meaning only about 33% of the variance in wages is explained by the model. The RMSE of $4.21 and MAE of $2.99 give the typical size of the prediction error in dollars. The MAPE of 40.69% means predictions are off by about 40% of the actual wage on average, so the model has a lot of error.

Use Cases: This model would be useful for initial policy analysis regarding union membership, discriminatory wage patterns between the sexes, and career choice.

Problematic Conclusions:
We cannot determine causation from the data. The low R-squared value means roughly 67% of the variation in wages is left unexplained by the model. The model is also missing feature variables such as location, company size, length of tenure, etc. The dummy variables may oversimplify the categories they encode (race, occupation, and sector).