# 3.3 Salary prediction

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import os
os.chdir("/content/drive/My Drive/DIC")

In [3]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import os
from tqdm.auto import tqdm
import polars as pl
import re

csv_path = "job_summary.csv"
parquet_path = "job_summary.parquet"
chunk_size = 50_000

total_bytes = os.path.getsize(csv_path)

bytes_read = 0
writer = None

with open(csv_path, 'rb') as f:
    header = f.readline()
    bytes_read += len(header)

    for chunk in tqdm(
        pd.read_csv(csv_path, chunksize=chunk_size),
        desc="Converting CSV → Parquet",
        dynamic_ncols=True,
        unit="chunk",
        miniters=1
    ):

        bytes_read = f.tell()

        table = pa.Table.from_pandas(chunk, preserve_index=False)

        if writer is None:
            writer = pq.ParquetWriter(parquet_path, table.schema)

        writer.write_table(table)

        tqdm.write(f"Progress: {bytes_read/total_bytes:.2%}   ETA: calculating...")  # optional

if writer:
    writer.close()

print("\nDone! CSV converted to Parquet efficiently.")

Converting CSV → Parquet: 0chunk [00:00, ?chunk/s]

Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...
Progress: 0.00%   ETA: calculating...

Done! CSV c

In [4]:
import pyarrow.parquet as pq
import pandas as pd
import polars as pl
import pyarrow as pa
from tqdm.auto import tqdm

pf = pq.ParquetFile("job_summary.parquet")

rows_needed = 2
rows_collected = []

for batch in pf.iter_batches(batch_size=1024):
    df = batch.to_pandas()
    for _, row in df.iterrows():
        rows_collected.append(row)
        if len(rows_collected) == rows_needed:
            break
    if len(rows_collected) == rows_needed:
        break

out_df = pd.DataFrame(rows_collected)
print(out_df)


                                            job_link  \
0  https://www.linkedin.com/jobs/view/restaurant-...   
1  https://www.linkedin.com/jobs/view/med-surg-re...   

                                         job_summary  
0  Rock N Roll Sushi is hiring a Restaurant Manag...  
1  Schedule\n: PRN is required minimum 12 hours p...  


In [8]:
!pip install polars --quiet

In [10]:
def convert(num: str):
    num = num.lower().replace(",", "").replace("$", "").strip()
    if num.endswith("k"):
        return float(num[:-1]) * 1000
    return float(num)

In [11]:
salary_regex = r"(?:(?:USD|\$)\s*)?(\d[\d,]*k?)\s*(?:-|–|to)\s*(?:USD|\$)?\s*(\d[\d,]*k?)"

In [15]:

input_parquet = "job_summary.parquet"
output_parquet = "job_summary_salary.parquet"

pf = pq.ParquetFile(input_parquet)
num_row_groups = pf.num_row_groups

salary_regex = r"(?:(?:USD|\$)\s*)?(\d[\d,]*k?)\s*(?:-|–|to)\s*(?:USD|\$)?\s*(\d[\d,]*k?)"

def convert_salary(num: str):
    if num is None:
        return None
    num = num.lower().replace(",", "").replace("$", "").strip()
    if num.endswith("k"):
        return float(num[:-1]) * 1000
    return float(num)

writer = None

for rg in tqdm(range(num_row_groups), desc="Extracting salary", dynamic_ncols=True):

    pa_table = pf.read_row_group(rg)
    df = pl.from_arrow(pa_table)

    df = df.with_columns([
        pl.col("job_summary").str.extract(salary_regex, group_index=1).alias("low"),
        pl.col("job_summary").str.extract(salary_regex, group_index=2).alias("high"),
    ])

    df = df.drop_nulls(subset=["low", "high"])

    if df.height == 0:
        continue

    df = df.with_columns([
        pl.col("low").map_elements(convert_salary, return_dtype=pl.Float64).alias("low_f"),
        pl.col("high").map_elements(convert_salary, return_dtype=pl.Float64).alias("high_f"),
    ])

    df = df.drop_nulls(subset=["low_f", "high_f"])

    if df.height == 0:
        continue

    df = df.with_columns(
        ((pl.col("low_f") + pl.col("high_f")) / 2).alias("salary_estimated")
    )

    df = df.select(["job_link", "salary_estimated"])

    if df.height == 0:
        continue

    arrow_table = df.to_arrow()

    if writer is None:
        writer = pq.ParquetWriter(output_parquet, arrow_table.schema)

    writer.write_table(arrow_table)

if writer:
    writer.close()

print("Done! Salary parquet saved to:", output_parquet)

Extracting salary:   0%|          | 0/26 [00:00<?, ?it/s]

Done! Salary parquet saved to: job_summary_salary.parquet


In [17]:
df = pd.read_parquet("job_summary_salary.parquet")

In [18]:
df

Unnamed: 0,job_link,salary_estimated
0,https://www.linkedin.com/jobs/view/med-surg-re...,12.5
1,https://uk.linkedin.com/jobs/view/commercial-a...,35000.0
2,https://www.linkedin.com/jobs/view/restaurant-...,20.5
3,https://www.linkedin.com/jobs/view/experienced...,59.0
4,https://www.linkedin.com/jobs/view/lead-materi...,17.5
...,...,...
542731,https://www.linkedin.com/jobs/view/principal-p...,1775.5
542732,https://www.linkedin.com/jobs/view/private-dut...,12.5
542733,https://www.linkedin.com/jobs/view/business-ap...,155.0
542734,https://www.linkedin.com/jobs/view/flight-qual...,714.0


In [19]:
def normalize_to_yearly(amount):
    if amount < 50:            # hourly
        return amount * 2080
    elif amount < 500:         # daily
        return amount * 260
    elif amount < 2000:        # weekly
        return amount * 52
    elif amount < 10000:       # monthly
        return amount * 12
    else:                      # yearly
        return amount

In [21]:
df["salary_yearly"] = df["salary_estimated"].apply(normalize_to_yearly)

In [23]:
df = df[(df["salary_yearly"] >= 10000) & (df["salary_yearly"] <= 200000)]

if "salary_estimated" in df.columns:
    df = df.drop(columns=["salary_estimated"])

df = df[["job_link", "salary_yearly"]]

df.head()

Unnamed: 0,job_link,salary_yearly
0,https://www.linkedin.com/jobs/view/med-surg-re...,26000.0
1,https://uk.linkedin.com/jobs/view/commercial-a...,35000.0
2,https://www.linkedin.com/jobs/view/restaurant-...,42640.0
3,https://www.linkedin.com/jobs/view/experienced...,15340.0
4,https://www.linkedin.com/jobs/view/lead-materi...,36400.0


In [24]:
output_path = "job_summary_salary.parquet"
df.to_parquet(output_path, index=False)
print("Saved to:", output_path)

Saved to: job_summary_salary.parquet


Neural Net

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm.auto import tqdm
import pyarrow.parquet as pq
from torch.utils.data import IterableDataset, DataLoader
import random

In [9]:
salary_path = "job_summary_salary.parquet"

salary_df = pd.read_parquet(salary_path)
salary_dict = dict(zip(salary_df["job_link"], salary_df["salary_yearly"]))
VALID_LINKS = set(salary_dict.keys())

print("Loaded salary records:", len(VALID_LINKS))

Loaded salary records: 420290


In [10]:
def load_summary_embeddings(path, valid_links):
    pf = pq.ParquetFile(path)
    summary_map = {}

    for rg in tqdm(range(pf.num_row_groups), desc="Loading summary embeddings", dynamic_ncols=True):
        tbl = pf.read_row_group(rg)
        df = tbl.to_pandas()

        df = df[df["job_link"].isin(valid_links)]

        for j, e in zip(df["job_link"], df["element"]):
            summary_map[j] = np.array(e, dtype=np.float32)

    return summary_map


In [11]:
def load_title_embeddings(path, valid_links):
    pf = pq.ParquetFile(path)
    title_map = {}

    for rg in tqdm(range(pf.num_row_groups), desc="Loading title embeddings", dynamic_ncols=True):
        tbl = pf.read_row_group(rg)
        df = tbl.to_pandas()

        df = df[df["job_link"].isin(valid_links)]

        for j, e in zip(df["job_link"], df["title_embedding"]):
            title_map[j] = np.array(e, dtype=np.float32)

    return title_map


In [12]:
summary_path = "job_summary_embeddings_pca32.parquet"
title_path   = "job_title_embeddings_pca32.parquet"

summary_map = load_summary_embeddings(summary_path, VALID_LINKS)
title_map   = load_title_embeddings(title_path, VALID_LINKS)

print("Summary keys:", len(summary_map))
print("Title keys:", len(title_map))


Loading summary embeddings:   0%|          | 0/26 [00:00<?, ?it/s]

Loading title embeddings:   0%|          | 0/1317 [00:00<?, ?it/s]

Summary keys: 420290
Title keys: 420258


In [13]:
COMMON_LINKS = (
    set(summary_map.keys()) &
    set(title_map.keys()) &
    set(salary_dict.keys())
)

COMMON_LINKS = list(COMMON_LINKS)
print("Training rows:", len(COMMON_LINKS))


Training rows: 420258


In [37]:
def normalize_vec(x: np.ndarray) -> np.ndarray:
    x = x.astype(np.float32)
    m = x.mean()
    s = x.std()
    if s < 1e-6:
        return x - m
    return (x - m) / (s + 1e-6)

In [38]:
class SalaryDataset(IterableDataset):
    def __init__(self, summary_map, title_map, salary_dict, links, batch_size=4096):
        super().__init__()
        self.summary_map = summary_map
        self.title_map   = title_map
        self.salary_dict = salary_dict  # yearly salary
        self.links       = list(links)
        self.batch_size  = batch_size

    def __iter__(self):
        links = np.random.permutation(self.links)
        Xs, Xt, Ys = [], [], []

        for link in links:
            s_vec = normalize_vec(self.summary_map[link])
            t_vec = normalize_vec(self.title_map[link])
            salary = self.salary_dict[link]

            y_log = np.log1p(salary).astype(np.float32)

            Xs.append(s_vec)
            Xt.append(t_vec)
            Ys.append(y_log)

            if len(Xs) == self.batch_size:
                xb = np.concatenate([np.stack(Xs), np.stack(Xt)], axis=1).astype(np.float32)
                yb = np.array(Ys, dtype=np.float32).reshape(-1, 1)
                yield xb, yb
                Xs, Xt, Ys = [], [], []

        if len(Xs) > 0:
            xb = np.concatenate([np.stack(Xs), np.stack(Xt)], axis=1).astype(np.float32)
            yb = np.array(Ys, dtype=np.float32).reshape(-1, 1)
            yield xb, yb

In [39]:
train_dataset = SalaryDataset(summary_map, title_map, salary_dict, COMMON_LINKS, batch_size=4096)

In [41]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [43]:
class SalaryModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)

model = SalaryModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_fn = nn.SmoothL1Loss()
scaler = torch.cuda.amp.GradScaler()

EPOCHS_DONE = 0

  scaler = torch.cuda.amp.GradScaler()


In [44]:
def train_epochs(num_epochs, dataset):
    global EPOCHS_DONE

    loader = DataLoader(dataset, batch_size=None)

    for _ in range(num_epochs):
        model.train()
        pbar = tqdm(loader, desc=f"Epoch {EPOCHS_DONE+1}", dynamic_ncols=True)

        for xb, yb in pbar:
            xb = torch.tensor(xb, device=device)
            yb = torch.tensor(yb, device=device)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast(device_type="cuda"):
                pred_log = model(xb)
                pred_log = pred_log.clamp(7.0, 13.0)
                loss = loss_fn(pred_log, yb)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            pbar.set_postfix({"loss": float(loss)})

        EPOCHS_DONE += 1

In [45]:
train_epochs(5, train_dataset)

Epoch 1: 0it [00:00, ?it/s]

  xb = torch.tensor(xb, device=device)
  yb = torch.tensor(yb, device=device)


Epoch 2: 0it [00:00, ?it/s]

Epoch 3: 0it [00:00, ?it/s]

Epoch 4: 0it [00:00, ?it/s]

Epoch 5: 0it [00:00, ?it/s]

In [46]:
def eval_random_sample(N=5000):
    sample_links = random.sample(COMMON_LINKS, min(N, len(COMMON_LINKS)))

    X_eval, Y_true = [], []
    for link in sample_links:
        s_vec = normalize_vec(summary_map[link])
        t_vec = normalize_vec(title_map[link])
        salary = salary_dict[link]

        X_eval.append(np.concatenate([s_vec, t_vec]).astype(np.float32))
        Y_true.append(salary)

    X_eval = np.stack(X_eval)
    Y_true = np.array(Y_true, dtype=np.float64)

    model.eval()
    with torch.no_grad():
        xb = torch.tensor(X_eval, device=device)
        with torch.amp.autocast(device_type="cuda"):
            pred_log = model(xb)

    pred_log = pred_log.clamp(7.0, 13.0)
    pred_log_np = pred_log.cpu().numpy().reshape(-1)
    Y_pred = np.expm1(pred_log_np)

    mask = np.isfinite(Y_true) & np.isfinite(Y_pred)
    Y_true = Y_true[mask]
    Y_pred = Y_pred[mask]

    mae  = np.mean(np.abs(Y_true - Y_pred))
    rmse = np.sqrt(np.mean((Y_true - Y_pred)**2))
    ss_res = np.sum((Y_true - Y_pred)**2)
    ss_tot = np.sum((Y_true - np.mean(Y_true))**2)
    r2 = 1 - ss_res/ss_tot if ss_tot != 0 else 0
    mape = np.mean(np.abs((Y_true - Y_pred) / Y_true)) * 100

    return {
        "MAE": float(mae),
        "RMSE": float(rmse),
        "R2": float(r2),
        "MAPE (%)": float(mape),
    }

metrics = eval_random_sample(5000)
print(metrics)

{'MAE': 55916.4385, 'RMSE': 67870.09261020093, 'R2': -2.11302641492842, 'MAPE (%)': 96.78000243393225}


We tried a couple different models, and everywhere the model did not learn anything from the job summary and title. While the job summary itself can predict seniority, it seems to be prediciting worse than guessing average salary. Does this mean that title and summary (and seniority) have no role to play in salary (at least those jobs on linkedin)?