In [15]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Read in the data
data = pd.read_csv('hourly_wages_data.csv')

In [3]:
data.head()

Unnamed: 0,wage_per_hour,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,5.1,0,8,21,35,1,1,0,1,0
1,4.95,0,9,42,57,1,1,0,1,0
2,6.67,0,12,1,19,0,0,0,1,0
3,4.0,0,12,4,22,0,0,0,0,0
4,7.5,0,12,17,35,0,1,0,0,0


In [4]:
data.shape

(534, 10)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   wage_per_hour   534 non-null    float64
 1   union           534 non-null    int64  
 2   education_yrs   534 non-null    int64  
 3   experience_yrs  534 non-null    int64  
 4   age             534 non-null    int64  
 5   female          534 non-null    int64  
 6   marr            534 non-null    int64  
 7   south           534 non-null    int64  
 8   manufacturing   534 non-null    int64  
 9   construction    534 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 41.8 KB


In [6]:
data.isnull().sum()

wage_per_hour     0
union             0
education_yrs     0
experience_yrs    0
age               0
female            0
marr              0
south             0
manufacturing     0
construction      0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,wage_per_hour,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
count,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0
mean,9.024064,0.179775,13.018727,17.822097,36.833333,0.458801,0.655431,0.292135,0.185393,0.044944
std,5.139097,0.38436,2.615373,12.37971,11.726573,0.498767,0.475673,0.45517,0.388981,0.207375
min,1.0,0.0,2.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0
25%,5.25,0.0,12.0,8.0,28.0,0.0,0.0,0.0,0.0,0.0
50%,7.78,0.0,12.0,15.0,35.0,0.0,1.0,0.0,0.0,0.0
75%,11.25,0.0,15.0,26.0,44.0,1.0,1.0,1.0,0.0,0.0
max,44.5,1.0,18.0,55.0,64.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Split the data into features (X) and target (y)
X = data.drop(columns=['wage_per_hour'])
y = data['wage_per_hour']

In [9]:
X.head()

Unnamed: 0,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,0,8,21,35,1,1,0,1,0
1,0,9,42,57,1,1,0,1,0
2,0,12,1,19,0,0,0,1,0
3,0,12,4,22,0,0,0,0,0
4,0,12,17,35,0,1,0,0,0


In [10]:
y.head()

0    5.10
1    4.95
2    6.67
3    4.00
4    7.50
Name: wage_per_hour, dtype: float64

In [19]:
# Create the model and train it using the entire dataset
model = Sequential()
model.add(Dense(200, activation='relu', input_shape=(X.shape[1],)))
model.add(Dense(200, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(1))

In [20]:
#compile model using mse as a measure of model performance
model.compile(optimizer='adam', loss='mean_squared_error')

In [21]:
#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)
#train model
model.fit(X, y, validation_split=0.2, epochs=30, callbacks=[early_stopping_monitor])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


<keras.callbacks.History at 0x1be430f2310>

In [22]:
# Calculate MAE and RMSE
mae = mean_absolute_error(y, model.predict(X))
rmse = mean_squared_error(y, model.predict(X), squared=False)

# Print the evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 3.314038962799958
Root Mean Squared Error (RMSE): 4.623328766521983


In [23]:
# Get user input for prediction
user_input = {}
for column in X.columns:
    user_input[column] = float(input(f"Enter value for {column}: "))

# Create a DataFrame from the user input
user_df = pd.DataFrame(user_input, index=[0])

# Make predictions using the trained model
predictions = model.predict(user_df)

# Print the predicted wage per hour
print("Predicted wage per hour:", predictions[0][0])

Enter value for union: 1
Enter value for education_yrs: 10
Enter value for experience_yrs: 1
Enter value for age: 40
Enter value for female: 0
Enter value for marr: 1
Enter value for south: 0
Enter value for manufacturing: 1
Enter value for construction: 0
Predicted wage per hour: 10.435108
