In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from utils import ScalerDf, split_data
from pycaret.classification import *

In [2]:
# Load the raw salaries dataset; the path is relative to the notebook's working directory.
raw_salaries_csv = './../data/raw/ds_salaries.csv'
salary_data = pd.read_csv(raw_salaries_csv)

In [3]:
# Ordered, human-readable names for the integer codes 0..6 stored in `salary_range`.
# The column itself holds the integer codes because `labels=False` is used below.
salary_labels = ['low', 'low-mid', 'mid', 'mid-high', 'high', 'very-high', 'Top']

# Derive the quantile grid from the label list so the number of bins is defined in
# exactly one place (previously the 8 quantiles and the 7 labels were hard-coded separately).
n_salary_bins = len(salary_labels)
quantiles = [i / n_salary_bins for i in range(n_salary_bins + 1)]

# Convert the continuous salary variable into equal-frequency (quantile) bins.
# `pd.qcut` computes the quantile edges and bins in one step, with the lowest value
# included in the first bin; it raises ValueError if two quantile edges coincide.
# NOTE(review): the bins are built from `salary`, not `salary_in_usd`. If `salary` mixes
# several currencies the resulting classes are not comparable — confirm this is intended.
salary_range_codes, bin_edges = pd.qcut(
    salary_data['salary'], q=quantiles, labels=False, retbins=True
)
bin_edges = bin_edges.tolist()  # keep `bin_edges` a plain list, as before
salary_data['salary_range'] = salary_range_codes

In [4]:
# Separate features from the target. Both raw salary columns are removed together with
# the binned target, so no salary information remains in X.
target_column = 'salary_range'
X = salary_data.drop(columns=['salary', 'salary_in_usd', target_column])
y = salary_data[target_column]

# Three-way split through the project helper `split_data`; the return order is the one
# unpacked below (train, validation, test for X, then the same for y).
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X, y, test_size=0.2, val_size=0.25, random_state=42
)

# Report the partition sizes in the order: train, test, validation.
for partition in (X_train, X_test, X_val):
    print(partition.shape)

(2253, 9)
(751, 9)
(751, 9)


In [5]:
# Identify numeric and categorical variables: columns with an int/float dtype are
# treated as numeric, every other column as categorical.
numeric_dtypes = ['int', 'float']
numeric_columns = list(X.select_dtypes(include=numeric_dtypes).columns)
categoric_columns = list(X.select_dtypes(exclude=numeric_dtypes).columns)

In [6]:
# Individual preprocessing steps, all restricted to the categorical columns except the scaler.
# NOTE(review): per the feature_engine docs, RareLabelEncoder groups categories whose
# frequency is below `tol`, and OrdinalEncoder with encoding_method='ordered' orders
# categories using the target (so `fit` needs y) — confirm against the installed version.
rare_label_step = RareLabelEncoder(variables=categoric_columns, n_categories=1, tol=0.1)
ordinal_step = OrdinalEncoder(variables=categoric_columns, encoding_method='ordered')
# ScalerDf is a project helper from `utils`; presumably a scaler that returns DataFrames — TODO confirm.
scaling_step = ScalerDf(method='standard')

# Steps run in the order listed: rare-label grouping, ordinal encoding, scaling.
preprocess_pipeline = [
    ('rare_label_encoder', rare_label_step),
    ('Ordinal_encoder', ordinal_step),
    ('Scaler', scaling_step),
]
salary_preprocess = Pipeline(steps=preprocess_pipeline)

In [7]:
# Cast the training target to pandas' nullable integer dtype ('Int64').
# NOTE(review): only y_train is cast; y_val and y_test keep their original dtype —
# confirm the mismatch is intended.
y_train = y_train.astype(pd.Int64Dtype())

In [8]:
# Fit the preprocessing pipeline on the training features and target.
# A later cell appends the target to X_train as column 'y'. Dropping that column here
# (when present) keeps this cell safe to re-run out of order: the pipeline is never
# fitted on a frame that contains the target. On a fresh run 'y' does not exist yet
# and errors='ignore' makes the drop a no-op.
salary_preprocess.fit(X_train.drop(columns='y', errors='ignore'), y_train)

In [9]:
# Build the frames passed to PyCaret: the features plus the target in a column named 'y'.
# `assign` returns new DataFrames instead of writing a column into the objects returned
# by `split_data`; if those are slices of `X`, in-place column assignment would trigger
# pandas' SettingWithCopyWarning. Values are aligned on the index, exactly as with
# `frame['y'] = series`, and re-running the cell simply overwrites 'y'.
X_train = X_train.assign(y=y_train)
X_val = X_val.assign(y=y_val)

In [10]:
# Initialise the PyCaret classification experiment.
# X_train (with its 'y' column) is the training data and X_val is supplied as the test
# set; PyCaret's own preprocessing is switched off and the project pipeline is passed
# as `custom_pipeline` instead. session_id fixes the experiment's random seed.
setup_options = dict(
    data=X_train,
    target='y',
    session_id=0,
    preprocess=False,
    test_data=X_val,
    index=False,
    verbose=False,
    custom_pipeline=salary_preprocess,
)
setup(**setup_options)

<pycaret.classification.oop.ClassificationExperiment at 0x7f9cf7ea7100>

In [11]:
# Train and cross-validate PyCaret's candidate models; the results table is sorted by
# AUC and the top-ranked estimator is kept as `best`.
ranking_metric = 'AUC'
best = compare_models(sort=ranking_metric)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.2956,0.6973,0.2956,0.2837,0.2796,0.178,0.181,0.284
lightgbm,Light Gradient Boosting Machine,0.2947,0.6892,0.2947,0.3097,0.2866,0.177,0.1801,0.204
rf,Random Forest Classifier,0.2934,0.6833,0.2934,0.3058,0.2871,0.1753,0.1775,0.115
et,Extra Trees Classifier,0.2889,0.6822,0.2889,0.2947,0.2779,0.1702,0.1729,0.227
lr,Logistic Regression,0.2899,0.6731,0.2899,0.2563,0.2526,0.171,0.1768,0.772
lda,Linear Discriminant Analysis,0.2934,0.6668,0.2934,0.2552,0.2515,0.1752,0.1819,0.122
ada,Ada Boost Classifier,0.285,0.6601,0.285,0.2724,0.26,0.1651,0.1716,0.104
dt,Decision Tree Classifier,0.2832,0.656,0.2832,0.2962,0.2753,0.1634,0.166,0.076
qda,Quadratic Discriminant Analysis,0.2361,0.6297,0.2361,0.1746,0.1653,0.1079,0.1279,0.088
knn,K Neighbors Classifier,0.2756,0.629,0.2756,0.2715,0.2624,0.1548,0.1569,0.571


In [66]:
# Hyper-parameter tuning of the selected model using PyCaret's default search settings.
# NOTE(review): `best_tuned` is not used by the later cells in this notebook —
# `finalize_model` receives the untuned `best`. Confirm which model should be finalized.
best_tuned = tune_model(estimator=best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2832,0.7041,0.2832,0.3043,0.2599,0.1631,0.1713
1,0.2832,0.6976,0.2832,0.3043,0.274,0.1632,0.1676
2,0.3142,0.6964,0.3142,0.3144,0.2954,0.1995,0.2033
3,0.2978,0.6812,0.2978,0.3033,0.2844,0.1802,0.1827
4,0.2667,0.6471,0.2667,0.2684,0.2624,0.1441,0.1451
5,0.3156,0.7116,0.3156,0.2757,0.2876,0.2012,0.2046
6,0.3156,0.6863,0.3156,0.3507,0.287,0.201,0.2061
7,0.3111,0.7003,0.3111,0.3115,0.3033,0.1957,0.1985
8,0.32,0.7209,0.32,0.3554,0.3153,0.2065,0.2091
9,0.3244,0.702,0.3244,0.3543,0.3079,0.2117,0.2172


In [16]:
# Produce the final fitted model from the estimator selected by `compare_models`.
# NOTE(review): this finalizes the untuned `best`, not `best_tuned` — confirm intended.
# NOTE(review): per the PyCaret docs, finalize_model refits on the whole dataset including
# the hold-out passed as `test_data` — verify before treating X_val as unseen data.
final_model = finalize_model(estimator=best)

In [21]:
# Predict on the training features; the target column 'y' that was added to X_train
# for PyCaret is removed first so the model only sees the original features.
train_features = X_train.drop(columns=['y'])
pred = final_model.predict(train_features)

In [24]:
from sklearn.metrics import classification_report, roc_auc_score

In [35]:
# Per-class precision / recall / F1 for the predictions made on X_train.
# NOTE(review): this is an in-sample evaluation (the model was fitted on this data),
# so the scores are optimistic; X_test / y_test from the split are never evaluated in
# this notebook — consider reporting on them instead.
train_report = classification_report(y_train, pred)
print(train_report)

              precision    recall  f1-score   support

         0.0       0.63      0.74      0.68       319
         1.0       0.33      0.37      0.35       313
         2.0       0.35      0.28      0.31       321
         3.0       0.26      0.17      0.21       327
         4.0       0.35      0.11      0.17       324
         5.0       0.28      0.62      0.38       329
         6.0       0.31      0.20      0.24       320

    accuracy                           0.36      2253
   macro avg       0.36      0.36      0.33      2253
weighted avg       0.36      0.36      0.33      2253



In [32]:
# Display the training target with a fresh 0..n-1 index (the original row index is discarded).
y_train.reset_index(drop=True)

0       0
1       2
2       1
3       3
4       5
       ..
2248    1
2249    4
2250    2
2251    6
2252    5
Name: salary_range, Length: 2253, dtype: Int64