**Начать стоит с того, что датасет очень объемный в части фич для каждого работяги. Итак, нам нужно будет сначала понять, что представляет собой каждая фича, и как она влияет на уровень зарплаты**

In [None]:
import pandas as pd


train = pd.read_csv("engineers_salary_prediction/train.csv")
test = pd.read_csv("engineers_salary_prediction/test.csv")

print(train.info())
print(train.describe())
print(train.head())
print(train.isnull().sum())

<Data Dictionary>

- obs: Observation number of the data
- job_title: Anonymized job title
- job_posted_date: Year and month when the job was posted
- salary_category: The salary category (this is the target variable)
- job_state: State where the job is located
- feature_1 ~ feature_12: Various independent variables relating to job information
- job_desc_1 ~ job_desc_300: Vectorized representation of the job description

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


sns.countplot(data = train, x = 'salary_category')
plt.title("Distr of salary category as training feature")
plt.show()

В целом класс сбаланирован, но интересен момент, что инженеров с высоким уровнем дохода больше(хотя сначала могло казаться, что по стандартному нормальному распределению средний уровень зарплаты должен превалировать над другими).

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(data = train, x = 'job_title', hue = 'salary_category')
plt.xticks(rotation = 90)
plt.title('Job title distr by salary category')
plt.show()

не понял

In [None]:
numeric_features = [f'feature_{i}' for i in range(1, 12)]

for feature in numeric_features:
    plt.figure(figsize = (10, 5))
    sns.histplot(train[feature], kde = True, bins = 30)
    plt.title(f'Distr of {feature}')
    plt.show()

Стандартизировать значения не имеет смысла - распределение фичей не нормальное

In [None]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
train['salary_category_encoded'] = le.fit_transform(train['salary_category'])

train['job_title_encoded'] = le.fit_transform(train['job_title'])
train['job_state_encoded'] = le.fit_transform(train['job_state'])
train['job_posted_date'] = pd.to_datetime(train['job_posted_date'])
train['year'] = train['job_posted_date'].dt.year
train['month'] = train['job_posted_date'].dt.month

job_desc_columns = [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]
numeric_features = [f'feature_{i}' for i in range (1, 13)] + job_desc_columns + ['job_title_encoded', 'job_state_encoded', 'year', 'month', 'salary_category_encoded']
numeric_data = train[numeric_features]

non_numeric_columns = numeric_data.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_columns)

for col in non_numeric_columns:
    le = LabelEncoder()
    numeric_data[col] = le.fit_transform(train[col])

missing_features = [col for col in numeric_features if col not in train.columns]
if missing_features:
    print(f"Missing features: {missing_features}")
else:
    print("All features presented in the dataset")

# zero_check_features = [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]
# rows_with_zeros = (numeric_data[zero_check_features] == 0 ).all(axis = 1)
# print(f"ABCD: {rows_with_zeros.sum()}")
# train_filtered = numeric_data[~rows_with_zeros]

#print(numeric_data.dtypes)
#print(numeric_data[zero_check_features].dtypes)
# print(f"Original: {len(numeric_data)}")
# print(f"Filtered: {len(train_filtered)}")

corr_matrix = numeric_data.corr()

correlation_with_target = corr_matrix['salary_category_encoded'].sort_values(ascending = False)

print("Correlation eith salary category")
print(correlation_with_target)

plt.figure(figsize = (20, 20))
sns.heatmap(corr_matrix, annot = False, cmap = 'coolwarm', fmt = '.2f')
plt.title("Correlation matrix")
plt.show()

По тепловой карте замечаю, что фичи скоррелированы в необычном формате - карта выглядит симметричной и сегментированной на области(некая "сетка"). Скорее всего это связано с тем, что датасет и фичи в частности были сгенерированы по какому-то распределению(незвестному нам, очевидно).

**В идеях не использовать фичи, скоррелированные с salary_category более чем на |0.5|**

In [None]:
correlation_threshold = 0.5

high_corr_features = correlation_with_target[(correlation_with_target.abs() > correlation_threshold) & (correlation_with_target.index != 'salary_category_encoded')].index

print("Features with high correlation:")
print(high_corr_features)
numeric_data_filtered = numeric_data.drop(columns = high_corr_features)

# all_zero_rows = (numeric_data_filtered == 0).all(axis = 1)

# print(all_zero_rows.sum())

# numeric_data_filtered = numeric_data_filtered[~all_zero_rows]

**Таких фич не оказалось**

In [27]:
from sklearn.model_selection import train_test_split


y = numeric_data_filtered['salary_category_encoded'] #target for predict

X = numeric_data_filtered.drop(columns = ['salary_category_encoded']) #delete target from dataset

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score


# indices = np.arange(X_train.shape[0])
# np.random.shuffle(indices)

# X_train_shuffled = X_train.iloc[indices] if isinstance(X_train, pd.DataFrame) else X_train[indices]
# y_train_shuffled = y_train.iloc[indices] if isinstance(y_train, pd.Series) else y_train[indices]

# train_pool = Pool(X_train_shuffled, label=y_train_shuffled)
# val_pool = Pool(X_val, label=y_val)

model = CatBoostClassifier(
    iterations=700,
    learning_rate=0.2,
    random_state=42,
    verbose=100,
    eval_metric='Accuracy',
    l2_leaf_reg=3,
    random_strength=2,
    subsample=0.8,
    colsample_bylevel=0.8,
    bootstrap_type='Bernoulli',
    #early_stopping_rounds=50
)

model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val)
)

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

model.save_model('catboost_model.cbm')

loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_model.cbm')

predictions = loaded_model.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy (loaded model): {accuracy:.4f}")

Лучший результат == 0.8047

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


model = XGBClassifier(n_estimators=1500, learning_rate=0.1, random_state=42)

# param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [1000, 2000, 3000]}
# grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=3, scoring='accuracy')
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
model.save_model("xgboost.json")
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}")

{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000}

In [None]:
from lightgbm import LGBMClassifier


model = LGBMClassifier(n_estimators = 5000, learning_rate = 0.1, random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
#model.save_model('lightgbm_model.cbm')
print(f"Validation aacuracy: {accuracy_score(y_val, y_pred):.4f}")

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 5, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy

study = optuna.create_study(direction="maximize")

study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)

In [None]:
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier(n_estimators=420, max_depth = 36, min_samples_split=17, min_samples_leaf=1, bootstrap = False)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}")

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


estimators = [('xgb', XGBClassifier()), ('lgbm', LGBMClassifier())]
model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print(f"Validation accuracy: {accuracy_score(y_val, y_pred):.4f}")

In [None]:
# for col in numeric_features:
#     train[col] = pd.to_numeric(train[col], errors='coerce')
#     test[col] = pd.to_numeric(test[col], errors='coerce')

# train[numeric_features] = train[numeric_features].fillna(0)
# test[numeric_features] = test[numeric_features].fillna(0)

# train['feature_sum'] = train[numeric_features[:12]].sum(axis=1)
# train['feature_mean'] = train[numeric_features[:12]].mean(axis=1)
# train['feature_std'] = train[numeric_features[:12]].std(axis=1)

# test['feature_sum'] = test[numeric_features[:12]].sum(axis=1)
# test['feature_mean'] = test[numeric_features[:12]].mean(axis=1)
# test['feature_std'] = test[numeric_features[:12]].std(axis=1)

# train['feature_1_x_feature_2'] = train['feature_1'] * train['feature_2']
# train['feature_1_div_feature_2'] = train['feature_1'] / (train['feature_2'] + 1e-6)

# test['feature_1_x_feature_2'] = test['feature_1'] * test['feature_2']
# test['feature_1_div_feature_2'] = test['feature_1'] / (test['feature_2'] + 1e-6)

# new_features = ['feature_sum', 'feature_mean', 'feature_std', 'feature_1_x_feature_2', 'feature_1_div_feature_2']
# for col in new_features:
#     train[col] = pd.to_numeric(train[col], errors='coerce')
#     test[col] = pd.to_numeric(test[col], errors='coerce')

# test['job_title_encoded'] = le.transform(test['job_title'])
# test['job_state_encoded'] = le.transform(test['job_state'])

# test['job_posted_date'] = pd.to_datetime(test['job_posted_date'])
# test['year'] = test['job_posted_date'].dt.year
# test['month'] = test['job_posted_date'].dt.month

# missing_features = set(train.columns) - set(test.columns)
# for feature in missing_features:
#     test[feature] = 0

# if 'salary_category_encoded' in numeric_features:
#     numeric_features.remove('salary_category_encoded')

# numeric_features += new_features

# assert set(train.columns) == set(test.columns), "Train and test columns do not match!"

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


numeric_features = [f'feature_{i}' for i in range(1, 13)] + \
                   [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]

for col in numeric_features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

train[numeric_features] = train[numeric_features].fillna(0)
test[numeric_features] = test[numeric_features].fillna(0)

X_train = train[numeric_features].values.astype(np.float32)
X_test = test[numeric_features].values.astype(np.float32)
y_train = train['salary_category_encoded'].values.astype(np.int64)

class SalaryDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

train_dataset = SalaryDataset(X_train, y_train)
test_dataset = SalaryDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

class CNN1DModel(nn.Module):
    def __init__(self, input_length, num_classes):
        super(CNN1DModel, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2)

        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2)

        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(128 * (input_length // 4), 128)
        #self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)

        x = F.relu(self.conv1(x))
        x = self.pool1(x)

        x = F.relu(self.conv2(x))
        x = self.pool2(x)

        x = self.flatten(x)

        x = F.relu(self.fc1(x))
        #x = self.dropout(x)
        x = self.fc2(x)

        return x

input_length = X_train.shape[1]
num_classes = len(np.unique(y_train))
model = CNN1DModel(input_length, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.000515)

def train_model(model, train_loader, criterion, optimizer, num_epochs=250):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        all_labels = []
        all_preds = []
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
        epoch_accuracy = accuracy_score(all_labels, all_preds)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}")

train_model(model, train_loader, criterion, optimizer, num_epochs=230)

# def evaluate_model(model, test_loader, has_labels=True):
#     model.eval()
#     all_labels = []
#     all_preds = []

#     with torch.no_grad():
#         for batch in test_loader:
#             if has_labels:
#                 inputs, labels = batch
#             else:
#                 inputs = batch
#                 labels = None

#             outputs = model(inputs)

#             probs = torch.softmax(outputs, dim=1)
#             _, preds = torch.argmax(probs, 1)

#             if has_labels:
#                 all_labels.extend(labels.cpu().numpy())
#             all_preds.extend(preds.cpu().numpy())

#     if has_labels:
#         test_accuracy = accuracy_score(all_labels, all_preds)
#         print(f"Test Accuracy: {test_accuracy:.4f}")
#     else:
#         print("No labels provided for evaluation.")

# evaluate_model(model, test_loader, has_labels = False)

def predict(model, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_loader:
            outputs = model(inputs)
            pred_probab = nn.Softmax(dim=1)(outputs)
            preds = pred_probab.argmax(1)
            predictions.extend(preds.cpu().numpy())
    return predictions

predictions = predict(model, test_loader)

le = LabelEncoder()
le.fit(['Low', 'Medium', 'High'])

predictions = np.clip(predictions, 0, len(le.classes_) - 1)

predicted_salary_category = le.inverse_transform(predictions)

submission = pd.DataFrame({'obs': test['obs'], 'salary_category': predicted_salary_category})
submission.to_csv('submission.csv', index=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score


class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        #self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        #x = self.dropout(x)
        x = self.fc2(x)
        return x

class SalaryDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

train_dataset = SalaryDataset(X_train, y_train)
test_dataset = SalaryDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size=312, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=312, shuffle=False)

input_size = X_train.shape[1]
hidden_size = 128
num_classes = len(np.unique(y_train))

model = MLPModel(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00525)

def train_model(model, train_loader, criterion, optimizer, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        all_labels = []
        all_preds = []

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim = 1)
            _, preds = torch.max(probs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

        epoch_accuracy = accuracy_score(all_labels, all_preds)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}, Accuracy: {epoch_accuracy:.4f}")

train_model(model, train_loader, criterion, optimizer, num_epochs=100)

def predict(model, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_loader:
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim = 1)
            _, preds = torch.max(probs, 1)
            predictions.extend(preds.cpu().numpy())
    return predictions

predictions = predict(model, test_loader)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(['Low', 'Medium', 'High'])
predicted_salary_category = le.inverse_transform(predictions)

submission = pd.DataFrame({'obs': test['obs'], 'salary_category': predicted_salary_category})
submission.to_csv('submission.csv', index=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import optuna

class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class SalaryDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

X_train_original = X_train
y_train_original = y_train

X_train, X_val, y_train, y_val = train_test_split(
    X_train_original,
    y_train_original,
    test_size=0.2,
    random_state=42,
    stratify=y_train_original
)

input_size = X_train.shape[1]
num_classes = len(np.unique(y_train))

train_dataset = SalaryDataset(X_train, y_train)
val_dataset = SalaryDataset(X_val, y_val)

def objective(trial):
    hidden_size = trial.suggest_int("hidden_size", 64, 256)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-3)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 312])
    num_epochs = trial.suggest_int("num_epochs", 500, 2000)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = MLPModel(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(probs, 1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy

study = optuna.create_study(direction="maximize")

study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best accuracy:", study.best_value)

**Пока лучшая модель - catboost**

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier


test = pd.read_csv('D:/PAC-Competition/engineers_salary_prediction/test.csv')

le = LabelEncoder()
test['job_title_encoded'] = le.fit_transform(test['job_title'])
test['job_state_encoded'] = le.fit_transform(test['job_state'])

test['job_posted_date'] = pd.to_datetime(test['job_posted_date'])
test['year'] = test['job_posted_date'].dt.year
test['month'] = test['job_posted_date'].dt.month

job_desc_columns = [f'job_desc_{str(i).zfill(3)}' for i in range(1, 301)]
numeric_features = [f'feature_{i}' for i in range(1, 13)] + job_desc_columns + ['job_title_encoded', 'job_state_encoded', 'year', 'month']

X_test = test[numeric_features]

for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

from_file = CatBoostClassifier()
from_file.load_model("catboost_model.cbm")

# Проверка признаков
# model_feature_names = from_file.get_feature_names()
# test_feature_names = X_test.columns.tolist()

# if test_feature_names == model_feature_names:
#     print("Feature names match!")
# else:
#     print("Feature names mismatch")
#     print("Missing features in X_test:", set(model_feature_names) - set(test_feature_names))
#     print("Extra features in X_test:", set(test_feature_names) - set(model_feature_names))

#     # Исправление несоответствий
#     X_test = X_test.reindex(columns=model_feature_names, fill_value=0)

predictions = from_file.predict(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(['Low', 'Medium', 'High'])
predicted_labels = label_encoder.inverse_transform(predictions)

submission = pd.DataFrame({'obs': test['obs'], 'salary_category': predicted_labels})
submission.to_csv('submission.csv', index=False)