In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the Drive notebooks folder so the relative file names used in
# later cells resolve there.
os.chdir("/content/drive/My Drive/Colab Notebooks/")

In [None]:
import pandas as pd

In [None]:
# Load the source table; later cells use its "job_link" and "job_title" columns.
df = pd.read_parquet("jobs_transfer_temp.parquet")

In [None]:
!pip install sentence-transformers pandas tqdm



In [None]:
# Report the attached GPU, driver/CUDA versions and current memory usage.
!nvidia-smi

Mon Nov 24 06:22:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import csv
import torch
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

from sklearn.decomposition import PCA
import pyarrow as pa

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the sentence-embedding model directly onto the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Confirm which device the model was placed on.
print("Running on:", device)

Running on: cuda


In [None]:
# Keep only the key (job_link) and the text to embed (job_title).
df_titles = df[["job_link", "job_title"]]

In [None]:
# Display the selected columns as a quick sanity check.
df_titles

Unnamed: 0,job_link,job_title
0,https://ca.linkedin.com/jobs/view/team-lead-se...,Team Lead - Service Desk
1,https://www.linkedin.com/jobs/view/i-e-designe...,I&E Designer
2,https://www.linkedin.com/jobs/view/client-rela...,Client Relations Manager
3,https://www.linkedin.com/jobs/view/private-dut...,Private Duty Nurse (LPN) at Aveanna
4,https://www.linkedin.com/jobs/view/procurement...,Procurement Lead
...,...,...
1348336,https://www.linkedin.com/jobs/view/manager-tra...,MANAGER TRANSPLANT QUALITY/ PI
1348337,https://www.linkedin.com/jobs/view/sanitation-...,Sanitation - 3rd Shift Seasonal Part Time
1348338,https://www.linkedin.com/jobs/view/table-games...,Table Games Floor Supervisor
1348339,https://www.linkedin.com/jobs/view/pediatric-c...,Pediatric Cardiologist needed to join our grow...


In [None]:
# Destination Parquet file for the title embeddings written by the encoding loop.
output_path = "job_title_embeddings.parquet"

In [None]:
# Number of titles encoded (and written to Parquet) per iteration.
batch_size = 4096

# Materialise both columns as plain Python lists so they can be sliced in
# lock-step by the encoding loop.
# NOTE(review): astype(str) would turn a missing title into the literal
# string "nan" — confirm job_title has no nulls.
titles = df_titles["job_title"].astype(str).tolist()
links = df_titles["job_link"].tolist()

In [None]:
# Encode the titles in chunks of `batch_size` rows and stream each chunk to
# `output_path`, so the full embedding matrix never has to be held in memory.
writer = None

try:
    for start in tqdm(range(0, len(titles), batch_size), desc="Encoding batches"):
        batch_titles = titles[start:start+batch_size]
        batch_links = links[start:start+batch_size]

        with torch.no_grad():
            emb = model.encode(
                batch_titles,
                batch_size=batch_size,
                convert_to_tensor=True,
                device=device,
                show_progress_bar=False
            )

        # Convert to nested Python lists so Arrow stores one list per row.
        batch_embeddings = emb.cpu().numpy().tolist()
        # Drop the tensor and release cached GPU memory before the next chunk.
        del emb
        torch.cuda.empty_cache()

        batch_df = pd.DataFrame({
            "job_link": batch_links,
            "title_embedding": batch_embeddings
        })

        table = pa.Table.from_pandas(batch_df)

        # The writer is created lazily because its schema is taken from the
        # first encoded batch.
        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
finally:
    # Close the writer even when encoding or writing fails part-way, so the
    # file handle is released and the file is finalised.
    if writer is not None:
        writer.close()

print("Done! Embeddings saved to:", output_path)

Encoding batches: 100%|██████████| 330/330 [11:12<00:00,  2.04s/it]

Done! Embeddings saved to: job_title_embeddings.parquet





In [None]:
# Read the saved embeddings back for inspection.
# NOTE(review): this rebinds `df`, which previously held the source jobs table.
df = pd.read_parquet("job_title_embeddings.parquet")

In [None]:
# Dimensionality of the first stored embedding. The column holds numeric
# vectors (not comma-separated strings), so take the length directly.
len(df.iloc[0]["title_embedding"])

384

In [None]:


# Path of the embeddings file produced by the encoding loop.
file_path = "job_title_embeddings.parquet"

# Open the file through pyarrow's ParquetFile interface.
parquet_file = pq.ParquetFile(file_path)

# Take the row count from the file's metadata.
num_rows = parquet_file.metadata.num_rows
print("Number of rows in the Parquet file:", num_rows)

Number of rows in the Parquet file: 1348341


In [None]:
# Input (full embeddings) and output (PCA-reduced embeddings) files.
# Note: `output_path` and `batch_size` are rebound here; they held the
# encoding-stage values in earlier cells.
file_path = "job_title_embeddings.parquet"
output_path = "job_title_embeddings_pca32.parquet"

# Parameters
# sample_size: number of rows used to fit the PCA
# batch_size:  rows read from the Parquet file per iteration
# pca_dim:     number of PCA components kept
sample_size = 50000
batch_size = 1024 
pca_dim = 32

In [None]:
# Open the embeddings file for batched reading.
parquet_file = pq.ParquetFile(file_path)
# Accumulator for the embedding rows used to fit the PCA.
sample_embeddings = []

In [None]:
# Collect the first `sample_size` embeddings from the file as the PCA fit set.
# A cell-local accumulator keeps this cell re-runnable: it no longer depends on
# `sample_embeddings` still being the empty list created in another cell.
# NOTE(review): this takes the leading rows, not a random sample — confirm the
# file order is not correlated with title content.
sample_rows = []
for batch in parquet_file.iter_batches(batch_size=batch_size, columns=["title_embedding"]):
    sample_rows.extend(batch.to_pandas()["title_embedding"].values)
    if len(sample_rows) >= sample_size:
        break

if not sample_rows:
    raise ValueError("No embeddings were read from the Parquet file; cannot build a PCA sample.")

# Trim any overshoot from the last batch and stack into a 2-D array.
sample_embeddings = np.vstack(sample_rows[:sample_size])
del sample_rows
print("Sample shape for PCA fit:", sample_embeddings.shape)

Sample shape for PCA fit: (50000, 384)


In [None]:
# Build the PCA reducer and fit it on the sampled embeddings in one step
# (`fit` returns the fitted estimator itself).
pca = PCA(
    n_components=pca_dim,
    random_state=42,
).fit(sample_embeddings)
print("PCA fitted on sample.")

PCA fitted on sample.


In [None]:
# The fit sample is no longer needed once the PCA is fitted; drop the reference.
del sample_embeddings

In [None]:
from tqdm import tqdm

In [None]:
# Re-open the embeddings file and work out how many batches a full pass takes.
parquet_file = pq.ParquetFile(file_path)
num_rows = parquet_file.metadata.num_rows
# Ceiling division: a partially filled final batch still counts as a batch.
total_batches = -(-num_rows // batch_size)

In [None]:
# Stream the full embeddings through the fitted PCA batch by batch and write
# the reduced vectors to `output_path`.
writer = None
batches = parquet_file.iter_batches(batch_size=batch_size, columns=["job_link", "title_embedding"])

try:
    # `total_batches` gives tqdm the length of the pass so it can show progress.
    for batch in tqdm(batches, total=total_batches, desc="Transforming batches with PCA", unit="batch"):
        batch_df = batch.to_pandas()

        # Stack the per-row vectors into a 2-D array for the PCA transform.
        emb_array = np.vstack(batch_df["title_embedding"].values)
        emb_reduced = pca.transform(emb_array)

        # Replace the original vectors with the reduced ones, one list per row.
        batch_df["title_embedding"] = emb_reduced.tolist()

        table = pa.Table.from_pandas(batch_df)
        # The writer is created lazily because its schema is taken from the
        # first transformed batch.
        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)

        del batch_df, emb_array, emb_reduced
finally:
    # Close the writer even if a batch fails, so the file handle is released
    # and the file is finalised.
    if writer is not None:
        writer.close()

print("Done! PCA-reduced embeddings saved to:", output_path)

Transforming batches with PCA: 100%|██████████| 1317/1317 [01:13<00:00, 17.80batch/s]

Done! PCA-reduced embeddings saved to: job_title_embeddings_pca32.parquet





In [None]:


# Path of the PCA-reduced file.
# NOTE(review): `parquet_file` held a ParquetFile object in earlier cells and is
# a path string from here on.
parquet_file = "job_title_embeddings_pca32.parquet"

pf = pq.ParquetFile(parquet_file)

# `pf.schema` is the Parquet-level schema, whose names are the leaf columns; a
# list column therefore shows up as its inner "element" field. The Arrow schema
# reports the logical top-level column names instead.
columns = pf.schema_arrow.names
print("Columns in the Parquet file:", columns)

Columns in the Parquet file: ['job_link', 'element']
