In [1]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split




In [2]:
dataset = pd.read_csv('glassdoor_jobs.csv')

In [3]:
dataset.head()
dataset.keys()

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors'],
      dtype='object')

In [4]:
filtered_dataset = dataset[['Job Title', 'Salary Estimate', 'Rating', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue','Location']]

filtered_dataset

Unnamed: 0,Job Title,Salary Estimate,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,Location
0,Data Scientist,$53K-$91K (Glassdoor est.),3.8,501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),"Albuquerque, NM"
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),3.4,10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),"Linthicum, MD"
2,Data Scientist,$80K-$90K (Glassdoor est.),4.8,501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),"Clearwater, FL"
3,Data Scientist,$56K-$97K (Glassdoor est.),3.8,1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Richland, WA"
4,Data Scientist,$86K-$143K (Glassdoor est.),2.9,51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"New York, NY"
...,...,...,...,...,...,...,...,...,...,...
951,Senior Data Engineer,$72K-$133K (Glassdoor est.),4.4,1001 to 5000 employees,2006,Company - Public,Internet,Information Technology,$100 to $500 million (USD),"Nashville, TN"
952,"Project Scientist - Auton Lab, Robotics Institute",$56K-$91K (Glassdoor est.),2.6,501 to 1000 employees,1984,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,"Pittsburgh, PA"
953,Data Science Manager,$95K-$160K (Glassdoor est.),3.2,1 to 50 employees,-1,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),"Allentown, PA"
954,Data Engineer,-1,4.8,201 to 500 employees,2015,Company - Private,IT Services,Information Technology,$25 to $50 million (USD),"Austin, TX"


In [5]:
import re

def parse_salary(s):
    # Step 1: remove the "(Glassdoor est.)" part
    s = re.sub(r'\(.*?\)', '', s)
    s = re.sub (r'[\$K]', '', s).strip()
    s = s.replace('–', '-').replace('—', '-')  
    # Step 2: split by '-'
    try:
        min_sal, max_sal = map(int, s.split('-'))
        avg_sal = (min_sal + max_sal) / 2
        return avg_sal
    except:
        return None  # return None if parsing fails

# Apply to the column
filtered_dataset['Average Salary'] = filtered_dataset['Salary Estimate'].apply(parse_salary)

# Drop rows with errors (None)
filtered_dataset = filtered_dataset.dropna(subset=['Average Salary'])
filtered_dataset.drop(columns = ['Salary Estimate'], inplace = True)
filtered_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Average Salary'] = filtered_dataset['Salary Estimate'].apply(parse_salary)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset.drop(columns = ['Salary Estimate'], inplace = True)


Unnamed: 0,Job Title,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,Location,Average Salary
0,Data Scientist,3.8,501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),"Albuquerque, NM",72.0
1,Healthcare Data Scientist,3.4,10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),"Linthicum, MD",87.5
2,Data Scientist,4.8,501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),"Clearwater, FL",85.0
3,Data Scientist,3.8,1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Richland, WA",76.5
4,Data Scientist,2.9,51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"New York, NY",114.5
...,...,...,...,...,...,...,...,...,...,...
950,"Sr Scientist, Immuno-Oncology - Oncology",3.9,10000+ employees,1830,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),"Cambridge, MA",84.5
951,Senior Data Engineer,4.4,1001 to 5000 employees,2006,Company - Public,Internet,Information Technology,$100 to $500 million (USD),"Nashville, TN",102.5
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,501 to 1000 employees,1984,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,"Pittsburgh, PA",73.5
953,Data Science Manager,3.2,1 to 50 employees,-1,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),"Allentown, PA",127.5


In [6]:
filtered_dataset['Company Age'] = 2025 - filtered_dataset['Founded'] 
filtered_dataset.drop(columns = ['Founded'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Company Age'] = 2025 - filtered_dataset['Founded']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset.drop(columns = ['Founded'], inplace = True)


In [7]:
filtered_dataset

Unnamed: 0,Job Title,Rating,Size,Type of ownership,Industry,Sector,Revenue,Location,Average Salary,Company Age
0,Data Scientist,3.8,501 to 1000 employees,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),"Albuquerque, NM",72.0,52
1,Healthcare Data Scientist,3.4,10000+ employees,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),"Linthicum, MD",87.5,41
2,Data Scientist,4.8,501 to 1000 employees,Company - Private,Security Services,Business Services,$100 to $500 million (USD),"Clearwater, FL",85.0,15
3,Data Scientist,3.8,1001 to 5000 employees,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Richland, WA",76.5,60
4,Data Scientist,2.9,51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"New York, NY",114.5,27
...,...,...,...,...,...,...,...,...,...,...
950,"Sr Scientist, Immuno-Oncology - Oncology",3.9,10000+ employees,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),"Cambridge, MA",84.5,195
951,Senior Data Engineer,4.4,1001 to 5000 employees,Company - Public,Internet,Information Technology,$100 to $500 million (USD),"Nashville, TN",102.5,19
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,501 to 1000 employees,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,"Pittsburgh, PA",73.5,41
953,Data Science Manager,3.2,1 to 50 employees,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),"Allentown, PA",127.5,2026


In [8]:
#Improve some input features by modifying it into different classes for better understanding
size_map = {'1 to 50 employees' : 1,
            '51 to 200 employees' : 2,
            '201 to 500 employees' : 3,
            '501 to 1000 employees' : 4,
            '1001 to 5000 employees' : 5,
            '5001 to 10000 employees' : 6,
            '10000+ employees' : 7,
           }
filtered_dataset['Size Level']= filtered_dataset['Size'].map(size_map)
filtered_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Size Level']= filtered_dataset['Size'].map(size_map)


Unnamed: 0,Job Title,Rating,Size,Type of ownership,Industry,Sector,Revenue,Location,Average Salary,Company Age,Size Level
0,Data Scientist,3.8,501 to 1000 employees,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),"Albuquerque, NM",72.0,52,4.0
1,Healthcare Data Scientist,3.4,10000+ employees,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),"Linthicum, MD",87.5,41,7.0
2,Data Scientist,4.8,501 to 1000 employees,Company - Private,Security Services,Business Services,$100 to $500 million (USD),"Clearwater, FL",85.0,15,4.0
3,Data Scientist,3.8,1001 to 5000 employees,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Richland, WA",76.5,60,5.0
4,Data Scientist,2.9,51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"New York, NY",114.5,27,2.0
...,...,...,...,...,...,...,...,...,...,...,...
950,"Sr Scientist, Immuno-Oncology - Oncology",3.9,10000+ employees,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),"Cambridge, MA",84.5,195,7.0
951,Senior Data Engineer,4.4,1001 to 5000 employees,Company - Public,Internet,Information Technology,$100 to $500 million (USD),"Nashville, TN",102.5,19,5.0
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,501 to 1000 employees,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,"Pittsburgh, PA",73.5,41,4.0
953,Data Science Manager,3.2,1 to 50 employees,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),"Allentown, PA",127.5,2026,1.0


In [9]:
#Map the revenue of the comapny to levels
revenue_map = {'Less than $1 million (USD)': 1,
                '$1 to $5 million (USD)': 2,
                '$5 to $10 million (USD)': 3,
                '$10 to $25 million (USD)': 4,
                '$25 to $50 million (USD)': 5,
                '$50 to $100 million (USD)': 6,
                '$100 to $500 million (USD)': 7,
                '$500 million to $1 billion (USD)': 8,
                '$1 to $2 billion (USD)': 9,
                '$2 to $5 billion (USD)': 10,
                '$5 to $10 billion (USD)': 11,
                '$10+ billion (USD)': 12,
              }
filtered_dataset['Revenue Level']= filtered_dataset['Revenue'].map(revenue_map)
filtered_dataset
                


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Revenue Level']= filtered_dataset['Revenue'].map(revenue_map)


Unnamed: 0,Job Title,Rating,Size,Type of ownership,Industry,Sector,Revenue,Location,Average Salary,Company Age,Size Level,Revenue Level
0,Data Scientist,3.8,501 to 1000 employees,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),"Albuquerque, NM",72.0,52,4.0,6.0
1,Healthcare Data Scientist,3.4,10000+ employees,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),"Linthicum, MD",87.5,41,7.0,10.0
2,Data Scientist,4.8,501 to 1000 employees,Company - Private,Security Services,Business Services,$100 to $500 million (USD),"Clearwater, FL",85.0,15,4.0,7.0
3,Data Scientist,3.8,1001 to 5000 employees,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Richland, WA",76.5,60,5.0,8.0
4,Data Scientist,2.9,51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"New York, NY",114.5,27,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
950,"Sr Scientist, Immuno-Oncology - Oncology",3.9,10000+ employees,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (USD),"Cambridge, MA",84.5,195,7.0,12.0
951,Senior Data Engineer,4.4,1001 to 5000 employees,Company - Public,Internet,Information Technology,$100 to $500 million (USD),"Nashville, TN",102.5,19,5.0,7.0
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,501 to 1000 employees,College / University,Colleges & Universities,Education,Unknown / Non-Applicable,"Pittsburgh, PA",73.5,41,4.0,
953,Data Science Manager,3.2,1 to 50 employees,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),"Allentown, PA",127.5,2026,1.0,3.0


In [10]:
print(filtered_dataset.isna().sum())

Job Title              0
Rating                 0
Size                   0
Type of ownership      0
Industry               0
Sector                 0
Revenue                0
Location               0
Average Salary         0
Company Age            0
Size Level             1
Revenue Level        186
dtype: int64


In [11]:
filtered_dataset['Size Level'] = filtered_dataset['Size Level'].fillna(filtered_dataset['Size Level'].median())
filtered_dataset['Revenue Level'] = filtered_dataset['Revenue Level'].fillna(filtered_dataset['Revenue Level'].median())
print(filtered_dataset.isna().sum())

Job Title            0
Rating               0
Size                 0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Location             0
Average Salary       0
Company Age          0
Size Level           0
Revenue Level        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Size Level'] = filtered_dataset['Size Level'].fillna(filtered_dataset['Size Level'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Revenue Level'] = filtered_dataset['Revenue Level'].fillna(filtered_dataset['Revenue Level'].median())


In [12]:
def extract_seniority(t):
    t = t.lower()
    if 'principal' in t or 'lead' in t or 'staff' in t:
        return 'Lead'
    if 'sr' in t or 'senior' in t:
        return 'Senior'
    if 'jr' in t or 'junior' in t:
        return 'Junior'
    return 'Mid'

filtered_dataset['Seniority'] = filtered_dataset['Job Title'].apply(extract_seniority)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['Seniority'] = filtered_dataset['Job Title'].apply(extract_seniority)


In [13]:
from sentence_transformers import SentenceTransformer

bert_model = SentenceTransformer('all-mpnet-base-v2')
job_embeddings = bert_model.encode(filtered_dataset['Job Title'].tolist(), normalize_embeddings=True)

# Convert to DataFrame
import pandas as pd
job_embeddings_df = pd.DataFrame(job_embeddings)

job_embeddings_df

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.003996,0.064525,-0.065325,-0.005707,-0.012976,0.032028,0.040819,-0.012118,-0.013949,0.060078,...,0.052042,-0.027856,-0.040561,0.041719,-0.027175,0.076662,0.020059,0.033966,0.042753,-0.039662
1,-0.002351,0.073236,-0.054983,-0.054750,0.006135,0.040137,0.039792,0.014446,0.027097,0.064827,...,0.038080,-0.023885,-0.032955,0.026706,-0.047630,0.067715,0.006165,0.005029,0.034864,-0.052554
2,-0.003995,0.064525,-0.065325,-0.005707,-0.012976,0.032028,0.040819,-0.012118,-0.013949,0.060078,...,0.052042,-0.027856,-0.040561,0.041719,-0.027175,0.076662,0.020059,0.033966,0.042753,-0.039662
3,-0.003995,0.064525,-0.065325,-0.005707,-0.012976,0.032028,0.040819,-0.012118,-0.013949,0.060078,...,0.052042,-0.027856,-0.040561,0.041719,-0.027175,0.076662,0.020059,0.033966,0.042753,-0.039662
4,-0.003995,0.064525,-0.065325,-0.005707,-0.012976,0.032028,0.040819,-0.012118,-0.013949,0.060078,...,0.052042,-0.027856,-0.040561,0.041719,-0.027175,0.076662,0.020059,0.033966,0.042753,-0.039662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0.070327,-0.039134,0.011586,-0.020596,-0.007332,-0.010105,0.036706,0.002519,0.031061,0.033038,...,0.018424,0.025882,-0.007724,0.024694,-0.056473,-0.003680,0.033717,-0.009442,-0.030619,-0.056414
700,0.004780,0.048072,-0.022828,-0.028246,0.008165,0.048306,0.032253,0.006446,0.008866,-0.013401,...,0.046132,-0.017591,-0.025912,0.013758,-0.035463,0.085362,-0.023315,0.019369,0.043586,-0.041784
701,-0.020478,-0.021007,-0.010254,0.012949,-0.030999,-0.043736,0.028405,-0.013561,-0.054537,-0.035538,...,-0.016584,-0.014846,0.050560,-0.031211,0.025405,0.031928,0.056019,0.011020,-0.008634,-0.050646
702,0.010261,0.043573,-0.047004,-0.002549,0.047585,0.051677,0.038930,-0.018724,-0.031455,0.078866,...,0.016203,-0.028077,-0.037826,0.016434,-0.040457,0.039742,-0.003149,0.013040,-0.037254,-0.015066


In [14]:
numerical_col = ['Rating', 'Company Age']
scaled_numerical_col = pd.concat([
    job_embeddings_df,
    filtered_dataset[['Size Level', 'Revenue Level']].reset_index(drop=True)
], axis=1)
categorical_col = ['Job Title', 'Type of ownership', 'Industry', 'Sector', 'Location']

In [15]:
# Select categorical + numerical columns from the main dataset
X_base = filtered_dataset[categorical_col + numerical_col].reset_index(drop=True)

# Concatenate with scaled_numerical_col (which includes BERT + ordinal cols)
X = pd.concat([X_base, scaled_numerical_col], axis=1)

# Target
y = filtered_dataset['Average Salary'].reset_index(drop=True)
X.columns = X.columns.astype(str)


In [16]:
from sklearn.preprocessing import StandardScaler
processed_dataset = ColumnTransformer(transformers = 
                                      [('cat', OneHotEncoder(handle_unknown = 'ignore'), 
                                       categorical_col), ('num', StandardScaler(), numerical_col)], remainder = 'passthrough')

In [17]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =0)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [19]:
model = Pipeline([('preprocessor', processed_dataset),
                ('regressor', Ridge())])

In [20]:
param_grid = {
    'regressor__alpha': [0.1, 0.5, 1, 5, 10, 20, 50]
}

In [21]:
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(model,param_grid, cv=5, scoring='r2')

In [22]:
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
y_pred = model.predict(X_test)

In [24]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print('R^2:' , r2)

R^2: 0.7545179069476222


In [25]:
from sklearn.model_selection import cross_val_score
import numpy as np
r2_scores= cross_val_score(model, X, y, cv=5, scoring='r2')
r2_scores,np.mean(r2_scores)


(array([0.5807942 , 0.51295594, 0.86306594, 0.86991553, 0.71418203]),
 np.float64(0.7081827258780617))

In [26]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

In [27]:
non_numeric_cols = X.select_dtypes(include=['object']).columns
non_numeric_cols = non_numeric_cols.drop(['Job Title'])
print("Non-numeric columns:", non_numeric_cols)

ohe_non_numeric = pd.get_dummies(filtered_dataset[non_numeric_cols].fillna('Unknown'), drop_first=True)
ohe_non_numeric 

Non-numeric columns: Index(['Type of ownership', 'Industry', 'Sector', 'Location'], dtype='object')


Unnamed: 0,Type of ownership_Company - Private,Type of ownership_Company - Public,Type of ownership_Government,Type of ownership_Hospital,Type of ownership_Nonprofit Organization,Type of ownership_Other Organization,Type of ownership_School / School District,Type of ownership_Subsidiary or Business Segment,Type of ownership_Unknown,Industry_Accounting,...,"Location_Vancouver, WA","Location_Waltham, MA","Location_Washington, DC","Location_West Palm Beach, FL","Location_Westlake, OH","Location_Winston-Salem, NC","Location_Winter Park, FL","Location_Woburn, MA","Location_Woodbridge, NJ","Location_Worcester, MA"
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
951,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
952,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
953,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
print(job_embeddings_df.shape, job_embeddings_df.dtypes.unique())
print(job_embeddings_df.head(2))
# All dtypes should be float32/float64 (not object)


(704, 768) [dtype('float32')]
        0         1         2         3         4         5         6    \
0 -0.003996  0.064525 -0.065325 -0.005707 -0.012976  0.032028  0.040819   
1 -0.002351  0.073236 -0.054983 -0.054750  0.006135  0.040137  0.039792   

        7         8         9    ...       758       759       760       761  \
0 -0.012118 -0.013949  0.060078  ...  0.052042 -0.027856 -0.040561  0.041719   
1  0.014446  0.027097  0.064827  ...  0.038080 -0.023885 -0.032955  0.026706   

        762       763       764       765       766       767  
0 -0.027175  0.076662  0.020059  0.033966  0.042753 -0.039662  
1 -0.047630  0.067715  0.006165  0.005029  0.034864 -0.052554  

[2 rows x 768 columns]


In [29]:
numerical_col = filtered_dataset[['Rating', 'Company Age']]

In [30]:
X_rf = pd.concat([scaled_numerical_col.reset_index(drop=True),
                  numerical_col.reset_index(drop=True),
                  ohe_non_numeric .reset_index(drop=True),
                 job_embeddings_df.reset_index(drop=True)], axis=1)
X_rf.columns = X_rf.columns.astype(str)
print("object dtypes:", X_rf.select_dtypes(include='object').columns.tolist())


object dtypes: []


In [31]:
#Train test split
y_rf = filtered_dataset['Average Salary']  #target col
X_train, X_test, y_train, y_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=0
)


In [32]:
rf = RandomForestRegressor(
    n_estimators=900,
    max_depth= 20,
    min_samples_leaf=3,
    max_features='sqrt',
    random_state=0
)
rf.fit(X_train, y_train)



In [33]:
y_pred = rf.predict(X_test)

In [34]:
r2 = r2_score(y_test, y_pred)
print('R^2:' , r2)

R^2: 0.617934744330742


In [35]:
from sklearn.model_selection import cross_val_score
import numpy as np
r2_scores= cross_val_score(rf, X_rf, y_rf, cv=5, scoring='r2')
r2_scores,np.mean(r2_scores)

(array([0.37161996, 0.4679328 , 0.74300415, 0.72139283, 0.65723613]),
 np.float64(0.5922371749158593))

In [36]:
from xgboost import XGBRegressor
xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

In [37]:
from sklearn.model_selection import train_test_split
import pandas as pd


X_train, X_test, y_train, y_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=0
)

In [38]:
xgb.fit(X_train.values, y_train)

In [39]:
y_pred = xgb.predict(X_test.values)

In [40]:
r2 = r2_score(y_test, y_pred)
print('R^2:' , r2)

R^2: 0.6694083389633534


In [41]:
from sklearn.model_selection import cross_val_score
import numpy as np
r2_scores= cross_val_score(rf, X_rf, y_rf, cv=5, scoring='r2')
r2_scores,np.mean(r2_scores)

(array([0.37161996, 0.4679328 , 0.74300415, 0.72139283, 0.65723613]),
 np.float64(0.5922371749158593))