In [0]:
%run ../feature_engineering

In [0]:
from pycaret.regression import *

Out[28]: {'EdLevel': {'Primary/elementary school': 1.0,
  'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 2.0,
  'Associate degree (A.A., A.S., etc.)': 3.0,
  'Some college/university study without earning a degree': 4.0,
  'Something else, Professional degree (JD, MD, etc.)': 5.0,
  'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 6.0,
  'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 7.0,
  'Other doctoral degree (Ph.D., Ed.D., etc.)': 8.0},
 'Age1stCode': {'Younger than 5 years': 1.0,
  '5 - 10 years': 2.0,
  '11 - 17 years': 3.0,
  '18 - 24 years': 4.0,
  '25 - 34 years': 5.0,
  '35 - 44 years': 6.0,
  '45 - 54 years': 7.0,
  '55 - 64 years': 8.0,
  'Older than 64 years': 9.0},
 'OrgSize': {'Just me - I am a freelancer, sole proprietor, etc.': 1.0,
  '2 to 9 employees': 2.0,
  '10 to 19 employees': 3.0,
  '20 to 99 employees': 4.0,
  '100 to 499 employees': 5.0,
  'I don’t know': 6.0,
  '500 to 999 employees': 7.0,
  '1,000 to 4,999 employees

In [0]:
# Notebook-wide configuration values.
SEED: int = 42  # intended random seed for the experiment
TRAIN_SIZE: float = 0.8  # fraction of rows passed to setup() as train_size

<IPython.core.display.Javascript object>

In [0]:
# Load the working DataFrame. _set_df is not defined in this notebook;
# presumably it comes from the %run of ../feature_engineering — TODO confirm.
df = _set_df()

In [0]:
# Render df in the cell output for a visual sanity check.
display(df)

In [0]:
# Convert df to a pandas DataFrame; pdf is what setup() receives further down.
# NOTE(review): toPandas() suggests df is a Spark DataFrame — confirm.
pdf = df.toPandas()

In [0]:
# Render the pandas copy to check the conversion result.
display(pdf)

In [0]:
# Per-column count of null values across the ordinal columns.
pdf[ORDINAL_COLUMNS].isna().sum()

In [0]:
# Replace nulls in the ordinal columns with the explicit "missing" label,
# which is also the first entry of every ordered_* category list below.
ordinal_filled = pdf[ORDINAL_COLUMNS].fillna("missing")
pdf[ORDINAL_COLUMNS] = ordinal_filled

In [0]:
# Distinct EdLevel values present in the data, shown as the cell output.
ed_level = pdf["EdLevel"].unique()
ed_level

In [0]:
# Rank order for EdLevel, passed to PyCaret through ordinal_features.
# "missing" matches the fill value used for nulls in ORDINAL_COLUMNS and
# takes the lowest rank.
ordered_ed_level = ["missing"]
ordered_ed_level += [
    "Primary/elementary school",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
    "Associate degree (A.A., A.S., etc.)",
    "Some college/university study without earning a degree",
    "Something else",
    "Professional degree (JD, MD, etc.)",
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    "Other doctoral degree (Ph.D., Ed.D., etc.)",
]

In [0]:
# Distinct Age1stCode values present in the data, shown as the cell output.
age_fst_code = pdf["Age1stCode"].unique()
age_fst_code

In [0]:
# Rank order for Age1stCode, passed to PyCaret through ordinal_features.
# The "missing" placeholder ranks below every real age bracket.
ordered_age_fst_code = ["missing"] + [
    "Younger than 5 years",
    "5 - 10 years",
    "11 - 17 years",
    "18 - 24 years",
    "25 - 34 years",
    "35 - 44 years",
    "45 - 54 years",
    "55 - 64 years",
    "Older than 64 years",
]

In [0]:
# Distinct OrgSize values present in the data, shown as the cell output.
org_size = pdf["OrgSize"].unique()
org_size

In [0]:
# Rank order for OrgSize, passed to PyCaret through ordinal_features.
# "I don’t know" sits between the 100-499 and 500-999 brackets.
ordered_org_size = [
    "missing",
    *(
        "Just me - I am a freelancer, sole proprietor, etc.",
        "2 to 9 employees",
        "10 to 19 employees",
        "20 to 99 employees",
        "100 to 499 employees",
        "I don’t know",
        "500 to 999 employees",
        "1,000 to 4,999 employees",
        "5,000 to 9,999 employees",
        "10,000 or more employees",
    ),
]

In [0]:
# Distinct Age values present in the data, shown as the cell output.
age = pdf["Age"].unique()
age

In [0]:
# Rank order for Age, passed to PyCaret through ordinal_features.
# The real age brackets are bookended by the two non-bracket answers:
# "missing" ranks lowest and "Prefer not to say" ranks highest.
_age_brackets = [
    "Under 18 years old",
    "18-24 years old",
    "25-34 years old",
    "35-44 years old",
    "45-54 years old",
    "55-64 years old",
    "65 years or older",
]
ordered_age = ["missing", *_age_brackets, "Prefer not to say"]

In [0]:
# Distinct YearsCode_bin values present in the data, shown as the cell output.
years_code_bin = pdf["YearsCode_bin"].unique()
years_code_bin

In [0]:
# Rank order for the years-of-coding bins; used for both YearsCode_bin and
# YearsCodePro_bin in ordinal_features. Labels are "<low> - <high>" built
# from consecutive bin edges, with "missing" ranked lowest.
_year_edges = [0, 3, 5, 10, 20, 30, 40, 50, 60]
ordered_years_code_bin = ["missing"] + [
    f"{low} - {high}" for low, high in zip(_year_edges, _year_edges[1:])
]

In [0]:
# Column name -> category list in rank order, in the shape expected by
# setup(ordinal_features=...). Both years-of-coding columns share one list.
ordinal_features = dict(
    EdLevel=ordered_ed_level,
    Age1stCode=ordered_age_fst_code,
    OrgSize=ordered_org_size,
    Age=ordered_age,
    YearsCode_bin=ordered_years_code_bin,
    YearsCodePro_bin=ordered_years_code_bin,
)

In [0]:
# Initialise the PyCaret regression experiment on the pandas frame.
# session_id pins PyCaret's random state to SEED, so the train/test split and
# the cross-validation scores quoted in the comparison cells below can be
# reproduced after a kernel restart; when it is omitted PyCaret draws a
# fresh pseudo-random seed on every run.
# TARGET_COL is not defined in this notebook; presumably it is provided by
# the %run of ../feature_engineering — TODO confirm.
exp_setup = setup(
    data=pdf,
    target=TARGET_COL,
    train_size=TRAIN_SIZE,
    ordinal_features=ordinal_features,
    session_id=SEED,
    use_gpu=True,
    silent=True,
)

In [0]:
# Compare the plain and regularised linear baselines.
# Recorded result: lasso is the best.
linear_ids = ["lr", "lasso", "ridge"]
best_model = compare_models(include=linear_ids)

In [0]:
# Compare elastic net, LARS, lasso-LARS and Bayesian ridge.
# Recorded result: llar (Lasso Least Angle Regression) is the best;
# second is 'br' (Bayesian Ridge) - RMSE 29656, R2 0.6205.
regularised_ids = ["en", "lar", "llar", "br"]
best_model = compare_models(include=regularised_ids)

In [0]:
#best_model = compare_models(include=['omp', 'br', 'ard'])

In [0]:
# best_model = compare_models(include=['br', 'svm', 'knn'])

In [0]:
# best_model = compare_models(include=['br'])

In [0]:
# best_model = compare_models(include=['svm'])

In [0]:
# Compare the tree-ensemble regressors.
# Recorded result: 'gbr' is the best - RMSE 29846, R2 0.6156;
# second is 'rf' - RMSE 30070, R2 0.6098.
ensemble_ids = ["rf", "ada", "gbr"]
best_model = compare_models(include=ensemble_ids)

In [0]:
# best_model = compare_models(include=['rf'])  # RMSE 30070 R2 0.6098

In [0]:
# Evaluate LightGBM on its own.
# Recorded result: lightgbm - RMSE 28768, R2 0.6428.
lightgbm_ids = ["lightgbm"]
best_model = compare_models(include=lightgbm_ids)

In [0]:
# Evaluate XGBoost on its own.
# Recorded result: an empty result in xgboost.
# NOTE(review): this rebinds best_model, replacing whatever the previous
# comparison cell stored. The empty result may be a fit error that
# compare_models ignored (possibly related to use_gpu=True in setup) —
# passing errors="raise" should surface it; confirm against the installed
# PyCaret version.
xgboost_ids = ["xgboost"]
best_model = compare_models(include=xgboost_ids)