In [10]:
import pandas as pd
# load data
train_data = pd.read_csv('./usjobs_train.csv')
test_data = pd.read_csv('./usjobs_test.csv')


In [11]:
# note that for train data, the mean_salary values contribute to an additional column compared to test data, we take the mean_salary column out of train_data as train_target

train_target = train_data['Mean_Salary']
train_data = train_data.drop(columns=['ID','Mean_Salary'])
print(train_target)
test_target = pd.read_csv('./usjobs_sample_submission.csv')['Mean_Salary']
print(test_target)
# now you notice that no actual salary for the test data is given, so that we only need to train and evaluate model performance on train data
test_data = test_data.drop(columns=['ID'])

0        115000.000
1        185000.000
2         84500.000
3        111625.000
4        102690.400
            ...    
33243     47206.495
33244     79741.000
33245    119908.000
33246    115000.000
33247    155000.000
Name: Mean_Salary, Length: 33248, dtype: float64
0        105000
1        105000
2        105000
3        105000
4        105000
          ...  
22161    105000
22162    105000
22163    105000
22164    105000
22165    105000
Name: Mean_Salary, Length: 22166, dtype: int64


In [12]:
# check if any null value in train_data
print(train_data.isnull().sum())

# turns out there are many null values in multiple columns. We should fill the columns of strings with 'None' and columns of numbers with 0, otherwise we have to drop a lot of them
# but first, there are columns of information that are probably not useful for prediction, let's check, by counting the number of unique values in each column
print(train_data.nunique()/train_data.count())
# a ratio higher than 0.25 is potentially not a good indicator

# drop the columns with a ratio higher than 0.25
NG_columns = train_data.columns[(train_data.nunique()/train_data.count() > 0.25)]#.index
train_data = train_data.drop(columns=NG_columns)
test_data = test_data.drop(columns=NG_columns)

Job                     0
Jobs_Group              0
Profile             21107
Remote              19319
Company                 9
Location               13
City                 3824
State                3112
Frecuency_Salary        0
Skills                  0
Sector               7214
Sector_Group         7214
Revenue             18318
Employee            12799
Company_Score        8762
Reviews              8762
Director            20785
Director_Score      21924
URL                 16033
dtype: int64
Job                 0.518136
Jobs_Group          0.000421
Profile             0.000247
Remote              0.000144
Company             0.421042
Location            0.377373
City                0.100292
State               0.001792
Frecuency_Salary    0.000150
Skills              0.324982
Sector              0.005301
Sector_Group        0.001076
Revenue             0.000603
Employee            0.000440
Company_Score       0.001552
Reviews             0.056359
Director            0.209661


In [13]:
# now fill in the null values. For string type columns, fill in with 'None', for numerical type columns, fill in with 0
# first check whether the columns are numerical or string type
print(train_data.dtypes)
string_columns = train_data.columns[train_data.dtypes == 'object']
numerical_columns = train_data.columns[train_data.dtypes != 'object']
train_data[string_columns] = train_data[string_columns].fillna('None')
train_data[numerical_columns] = train_data[numerical_columns].fillna(0)


test_data[string_columns] = test_data[string_columns].fillna('None')
test_data[numerical_columns] = test_data[numerical_columns].fillna(0)



Jobs_Group           object
Profile              object
Remote               object
City                 object
State                object
Frecuency_Salary     object
Sector               object
Sector_Group         object
Revenue              object
Employee             object
Company_Score       float64
Reviews             float64
Director             object
Director_Score      float64
dtype: object


In [14]:
# encoding categorical data to numerical

from sklearn.preprocessing import LabelEncoder
encoded_train_data = train_data.copy()
encoded_test_data = test_data.copy()
labelencoder = LabelEncoder()
mapping = {}
print(encoded_train_data.dtypes)
for column in string_columns:
    encoded_train_data[column] = labelencoder.fit_transform(encoded_train_data[column])
    mapping[column] = dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_)))
    # for values in encoded_test_data[column], if the value is not in the mapping, then we set it to be 'None'
    encoded_test_data[column] = encoded_test_data[column].apply(lambda x: x if x in mapping[column] else 'None')
    encoded_test_data[column] = labelencoder.transform(encoded_test_data[column])

print(mapping)
print(encoded_train_data.head())
print(encoded_train_data.dtypes)

Jobs_Group           object
Profile              object
Remote               object
City                 object
State                object
Frecuency_Salary     object
Sector               object
Sector_Group         object
Revenue              object
Employee             object
Company_Score       float64
Reviews             float64
Director             object
Director_Score      float64
dtype: object
{'Jobs_Group': {'Analyst': 0, 'Business Analyst': 1, 'Business Intelligence': 2, 'CFO': 3, 'Controller': 4, 'Data Analyst': 5, 'Data Engineer': 6, 'Data Scientist': 7, 'Finance': 8, 'Financial Analyst': 9, 'ML/AI Engineer': 10, 'Operations Analyst': 11, 'Others': 12, 'Statistician/Mathemathics': 13}, 'Profile': {'Junior': 0, 'Lead': 1, 'None': 2, 'Senior': 3}, 'Remote': {'Hybrid': 0, 'None': 1, 'Remote': 2}, 'City': {'Abbotsford': 0, 'Abbott Park': 1, 'Aberdeen': 2, 'Aberdeen Proving Ground': 3, 'Abernathy': 4, 'Abilene': 5, 'Acworth': 6, 'Ada': 7, 'Ada County': 8, 'Adairsville': 9, 'Ada

In [7]:
# establish accuracy baseline
train_label_mean = train_target.mean()
mean_absolute_error_baseline = abs(train_target - train_label_mean).mean()
print(mean_absolute_error_baseline)

33020.98790570812


In [15]:
# try all basic machine learning models to fit encoded_train_data to train_target
# from sklearn import linear_model, svm

from sklearn.metrics import mean_absolute_error

# for model in [linear_model.LinearRegression(),
#               linear_model.Ridge(),
#               linear_model.Lasso(),
#               linear_model.BayesianRidge(),
#               linear_model.ElasticNet(),
#               linear_model.LassoLars(),
#               linear_model.ARDRegression(),
#               linear_model.PassiveAggressiveRegressor(),
#               linear_model.TheilSenRegressor(),
#               linear_model.HuberRegressor(),
#               linear_model.RANSACRegressor(),
#               svm.SVR()]:
#     model.fit(encoded_train_data, train_target)
#     predictions = model.predict(encoded_train_data)
#     print(model)
#     print(mean_absolute_error(train_target, predictions))

# obviously these basic models are not good enough, let's try some ensemble methods

In [16]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, VotingRegressor, StackingRegressor, IsolationForest

estimators = [
    # ('RandomForest', RandomForestRegressor(n_estimators=100)),
    # ('GradientBoosting', GradientBoostingRegressor(n_estimators=100)),
    ('ExtraTrees', ExtraTreesRegressor(n_estimators=100)),
    # ('AdaBoost', AdaBoostRegressor(n_estimators=100)),
    # ('Bagging', BaggingRegressor(n_estimators=100)),
    # ('IsolationForest', IsolationForest(n_estimators=100))
]

for name, model in estimators:
    model.fit(encoded_train_data, train_target)
    predictions = model.predict(encoded_train_data)
    print(name)
    print(mean_absolute_error(train_target, predictions))

# final conclusion: ExtraTreesRegressor might be the best model, wth mean absolute error in train data of 2669

ExtraTrees
2669.084444929978


In [17]:
test_result = pd.read_csv('./usjobs_sample_submission.csv')
# preprocess test data and make predictions as y_pred
test_result['Mean_Salary'] = model.predict(encoded_test_data)
test_result.to_csv('./test_result.csv', index=False)
