In [42]:
import pandas as pd
from sklearn import model_selection, pipeline, linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

In [2]:
# Location of the source workbook, relative to the notebook's working directory.
RAW_DATA_PATH = '../../data/raw_data.xlsx'

# Read the workbook into a DataFrame; later cells derive everything from this frame.
raw_data = pd.read_excel(RAW_DATA_PATH)

In [3]:
# Keep only the rows whose JOB_TITLE contains the literal text 'Technical Specialist'.
# - na=False: a missing JOB_TITLE counts as "no match", so the mask is strictly
#   boolean (a NaN inside the mask makes boolean indexing raise).
# - regex=False: the text is matched literally rather than compiled as a pattern.
is_technical_specialist = raw_data['JOB_TITLE'].str.contains(
    'Technical Specialist', na=False, regex=False
)
data = raw_data[is_technical_specialist]

In [5]:
# Preview the first rows of the filtered frame.
# NOTE(review): the rendered preview includes the NAME column — confirm the data
# is safe to share, or clear this output before committing the notebook.
data.head()

Unnamed: 0,REGISTRATION,BRANCH,AREA,NAME,GENDER,ETHNICITY,JOB_TITLE,SALARY,CONTRACT_TYPE,EDUCATION,PWD,CONTRACT_TIME,AGE_RANGE,CONTRACT_REGIME
9,7161,Branch_4,Area_91,Sean Evans,M,WHITE,Technical Specialist Jr.,5312,Indefinite-Term,High School,0,5 - 10 Contract Years,35 - 40 Years,Monthly Contract
10,2734,Branch_4,Area_91,Diane Morales,M,BLACK,Technical Specialist Jr.,5214,Indefinite-Term,High School,0,> 10 Contract Years,55 - 60 Years,Monthly Contract
11,3227,Branch_4,Area_91,Julia Taylor,M,HISPANIC,Technical Specialist Mid.,11870,Indefinite-Term,Bachelor's Degree,0,> 10 Contract Years,> 60 Years,Monthly Contract
39,10861,Branch_4,Area_90,Ronald Roberts,F,ASIAN,Technical Specialist Jr.,5747,Indefinite-Term,High School,0,< 1 Contract Year,25 - 30 Years,Monthly Contract
40,7475,Branch_4,Area_90,Donna Flores,M,PREFER NOT TO SAY,Technical Specialist Sr.,18139,Indefinite-Term,Bachelor's Degree,0,3 - 5 Contract Years,55 - 60 Years,Monthly Contract


In [6]:
# Predictor columns fed to the model; SALARY is the regression target.
feature_columns = [
    'GENDER',
    'ETHNICITY',
    'JOB_TITLE',
    'CONTRACT_TYPE',
    'EDUCATION',
    'PWD',
    'CONTRACT_TIME',
    'AGE_RANGE',
    'CONTRACT_REGIME',
]
X = data[feature_columns]
y = data['SALARY']

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# (rows, columns) of the training features.
X_train.shape

(421, 9)

In [14]:
# (rows, columns) of the held-out test features.
X_test.shape

(106, 9)

In [32]:
# Column groups consumed by the preprocessing step: the first list is one-hot
# encoded, the second is forwarded to the regressor unchanged.
categorical = [
    'GENDER',
    'ETHNICITY',
    'JOB_TITLE',
    'CONTRACT_TYPE',
    'EDUCATION',
    'CONTRACT_TIME',
    'AGE_RANGE',
    'CONTRACT_REGIME',
]
numerical = ['PWD']

In [35]:
# NOTE(review): `onehot` is not referenced by any cell visible here — the
# ColumnTransformer constructs its own OneHotEncoder(handle_unknown='ignore').
# Confirm whether this default-configured encoder is still needed.
onehot = OneHotEncoder()
# Ordinary least-squares estimator used as the final pipeline step.
regressor = linear_model.LinearRegression()

In [34]:
# Each transformer is a (name, transformer, columns) triple.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros at transform time instead of raising.
categorical_step = ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical)
# 'passthrough' forwards the listed columns without modification.
numerical_step = ('numerical', 'passthrough', numerical)

preprocessing = ColumnTransformer(transformers=[categorical_step, numerical_step])

In [36]:
# Chain preprocessing and the estimator so a single fit/predict call applies
# the column encoding before the regression.
model = pipeline.Pipeline([
    ('preprocessor', preprocessing),
    ('algorithm', regressor),
])

In [37]:
# Fit the preprocessing step and the regressor on the training split; as the
# last expression in the cell, the fitted pipeline's repr is displayed.
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['GENDER', 'ETHNICITY',
                                                   'JOB_TITLE', 'CONTRACT_TYPE',
                                                   'EDUCATION', 'CONTRACT_TIME',
                                                   'AGE_RANGE',
                                                   'CONTRACT_REGIME']),
                                                 ('numerical', 'passthrough',
                                                  ['PWD'])])),
                ('algorithm', LinearRegression())])

In [43]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_train_predict = model.predict(X_train)

In [45]:
# Fit quality measured on the training split (in-sample), so these figures
# describe how well the model fits the data it saw, not how it generalizes.
train_r2 = r2_score(y_train, y_train_predict)
# RMSE is the square root of the mean squared error, in the target's units.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predict))
train_mae = mean_absolute_error(y_train, y_train_predict)

print("R²:", train_r2)
print("RMSE:", train_rmse)
print("MAE:", train_mae)

R²: 0.8850849116386232
RMSE: 1806.197709061829
MAE: 1268.6959785051477
