In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Load datasets
train = pd.read_csv("/kaggle/input/salary-prediction/train.csv")
test = pd.read_csv("/kaggle/input/salary-prediction/test.csv")
submission = pd.read_csv("/kaggle/input/salary-prediction/solution_format.csv")

In [7]:
# Drop obs from train, but keep in test for submission
train.dropna(subset=['salary_category'], inplace=True)

# Fill missing values
train['job_posted_date'].fillna('Unknown', inplace=True)
test['job_posted_date'].fillna('Unknown', inplace=True)

train['job_state'].fillna('Unknown', inplace=True)
test['job_state'].fillna('Unknown', inplace=True)

# Detect job_desc columns
job_desc_cols = [col for col in train.columns if col.startswith('job_desc_')]

# Add job_desc stats
def add_jobdesc_features(df):
    df['desc_sum'] = df[job_desc_cols].sum(axis=1)
    df['desc_mean'] = df[job_desc_cols].mean(axis=1)
    df['desc_std'] = df[job_desc_cols].std(axis=1)
    df['desc_nonzero'] = (df[job_desc_cols] != 0).sum(axis=1)
    return df

train = add_jobdesc_features(train)
test = add_jobdesc_features(test)

# PCA on job_descs
n_pca_components = min(30, len(job_desc_cols))
pca = PCA(n_components=n_pca_components, random_state=42)
pca.fit(pd.concat([train[job_desc_cols], test[job_desc_cols]], axis=0))

train_pca = pca.transform(train[job_desc_cols])
test_pca = pca.transform(test[job_desc_cols])

# Add PCA features
pca_cols = [f'pca_{i}' for i in range(n_pca_components)]
train[pca_cols] = train_pca
test[pca_cols] = test_pca

train.drop(columns=job_desc_cols, inplace=True)
test.drop(columns=job_desc_cols, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['job_posted_date'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['job_posted_date'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [8]:
# Combined categorical feature
train['title_state'] = train['job_title'].astype(str) + '_' + train['job_state'].astype(str)
test['title_state'] = test['job_title'].astype(str) + '_' + test['job_state'].astype(str)

# Frequency encoding
for col in ['job_title', 'job_state', 'title_state']:
    freq = train[col].value_counts().to_dict()
    train[f'{col}_freq'] = train[col].map(freq)
    test[f'{col}_freq'] = test[col].map(freq)

# Convert TRUE/FALSE to binary
for col in train.columns:
    if train[col].dtype == 'bool' or set(train[col].dropna().unique()) == {True, False}:
        train[col] = train[col].astype(int)
        test[col] = test[col].astype(int)

# Categorical encoding
cat_cols = ['feature_1', 'job_title', 'job_state', 'title_state']
for col in cat_cols:
    le = LabelEncoder()
    all_vals = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(all_vals)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Final data prep
drop_cols = ['obs', 'salary_category', 'job_posted_date']
features = [col for col in train.columns if col not in drop_cols]
X = train[features]
y = train['salary_category']
X_test = test[features]

# Encode target
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)


In [9]:
model = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.015,
    max_depth=14,
    num_leaves=110,
    lambda_l1=0.1,
    lambda_l2=5.0,
    class_weight='balanced',
    min_child_samples=30,
    subsample=0.85,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)


model.fit(X, y_encoded)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8864
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 51
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [10]:
# Predict
preds = model.predict(X_test)
pred_labels = le_target.inverse_transform(preds)

# Prepare submission
submission['salary_category'] = pred_labels
submission.to_csv("solution_format.csv", index=False)

