In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
DATA_PATH = 'data/'
JOB_POSTINGS_PATH = DATA_PATH + 'gsearch_jobs.csv'
COMPANIES_PATH = DATA_PATH + 'companies.csv'
INDUSTRIES_PATH = DATA_PATH + 'industries.csv'
COMPANY_INDUSTRY_PATH = DATA_PATH + 'company_industries.csv'
EMPLOYEE_COUNTS_PATH = DATA_PATH + 'employee_counts.csv'
JOB_LEVEL_VERIFY_PATH = DATA_PATH + 'verify_job_level_extraction.csv'
PROCESSED_DATA_PATH = DATA_PATH + 'dataframe_after_preprocessing.csv'

### Baseline Logistic Regression

In [3]:
df_merged = pd.read_csv(PROCESSED_DATA_PATH)

In [4]:
df_merged.columns

Index(['title', 'company_name', 'location_0', 'location_1', 'location_2',
       'location_3', 'location_4', 'location_5', 'location_6', 'location_7',
       'via', 'description', 'extensions', 'job_id', 'thumbnail', 'posted_at',
       'schedule_type', 'work_from_home', 'salary', 'search_term', 'date_time',
       'search_location', 'commute_time', 'salary_pay', 'salary_rate',
       'salary_avg', 'salary_min', 'salary_max', 'salary_hourly',
       'salary_yearly', 'salary_standardized', 'description_tokens',
       'company_id', 'name', 'company_description', 'company_size', 'state',
       'country', 'city', 'zip_code', 'address', 'url', 'industry_0',
       'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5',
       'industry_6', 'Years_of_Experience', 'job_level', 'average_length',
       'num_skills', 'required_soft_skills'],
      dtype='object')

In [5]:
X = df_merged[['location_0', 'location_1', 'location_2','location_3', 'location_4', 'location_5', 
               'location_6', 'location_7','commute_time', 'company_size', 'industry_0',
               'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5',
               'industry_6', 'Years_of_Experience', 'average_length', 'num_skills']].fillna(0)
y = df_merged['salary_standardized']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=699)

import statsmodels.api as sm 
x2 = sm.add_constant(X_train)
model = sm.OLS(y_train, x2).fit()
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     salary_standardized   R-squared:                       0.092
Model:                             OLS   Adj. R-squared:                  0.091
Method:                  Least Squares   F-statistic:                     165.7
Date:                 Mon, 04 Nov 2024   Prob (F-statistic):               0.00
Time:                         10:59:22   Log-Likelihood:            -3.3617e+05
No. Observations:                29529   AIC:                         6.724e+05
Df Residuals:                    29510   BIC:                         6.725e+05
Df Model:                           18                                         
Covariance Type:             nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
location_0           2.0

In [6]:
y_pred = model.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, mean_squared_error

# Calculate MSE (for regression or if needed as a performance metric)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 475988673.88508666


In [8]:
# %pip install transformers

In [9]:
# %pip install torch

In [10]:
# build a BERT model for job descriptions
from transformers import BertTokenizer, BertModel
import torch
    
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
if torch.cuda.is_available():
    print ('cuda')
else:
    print('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize the job descriptions
encoded_inputs = tokenizer(df_merged['description'].tolist(), padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_inputs['input_ids'].to(device)
attention_mask = encoded_inputs['attention_mask'].to(device)
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 57.67 GiB. GPU 0 has a total capacity of 15.77 GiB of which 14.70 GiB is free. Including non-PyTorch memory, this process has 1.06 GiB memory in use. Of the allocated memory 726.73 MiB is allocated by PyTorch, and 53.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
bert_embeddings_df = pd.DataFrame(embeddings, index=df_merged.index)

X_combined = pd.concat([X, bert_embeddings_df], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.25, random_state=699)

x2 = sm.add_constant(X_train)
model = sm.OLS(y_train, x2).fit()
print(model.summary())