In [None]:
!pip install xgboost lightgbm joblib




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
import joblib


In [None]:
df = pd.read_csv("salary_data_cleaned.csv")
df.head()


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,72.0,Tecolote Research\n,NM,0,47,1,0,0,0,1
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,87.5,University of Maryland Medical System\n,MD,0,36,1,0,0,0,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,85.0,KnowBe4\n,FL,1,10,1,0,1,0,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,76.5,PNNL\n,WA,1,55,1,0,0,0,0
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,114.5,Affinity Solutions\n,NY,1,22,1,0,0,0,1


In [None]:
target = "avg_salary"

features = [
    "Job Title",
    "Rating",
    "Company Name",
    "Location",
    "Size",
    "Type of ownership",
    "Industry",
    "Sector",
    "Revenue",
    "company_txt",
    "job_state",
    "age",
    "min_salary",
    "max_salary",
    "hourly",
    "employer_provided",
    "same_state",
    "python_yn",
    "R_yn",
    "spark",
    "aws",
    "excel",
]


In [None]:
df = df.dropna(subset=[target])

X = df[features]
y = df[target]


In [None]:
num_features = X.select_dtypes(include=['int64','float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

num_features, cat_features


(Index(['Rating', 'age', 'min_salary', 'max_salary', 'hourly',
        'employer_provided', 'same_state', 'python_yn', 'R_yn', 'spark', 'aws',
        'excel'],
       dtype='object'),
 Index(['Job Title', 'Company Name', 'Location', 'Size', 'Type of ownership',
        'Industry', 'Sector', 'Revenue', 'company_txt', 'job_state'],
       dtype='object'))

In [None]:
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_features),
    ("cat", categorical_pipeline, cat_features)
])


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=300, random_state=42)
xgb = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=42)

voting = VotingRegressor(
    estimators=[("rf", rf), ("gbr", gbr), ("xgb", xgb)]
)


In [None]:
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("voting", voting)
])


In [None]:
model_pipeline.fit(X_train, y_train)


In [None]:
from math import sqrt

mse = mean_squared_error(y_test, preds)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

rmse, mae, r2


(1.0632540388749032, 0.49797406650568943, 0.9993083503154373)

In [None]:
import joblib
joblib.dump(model_pipeline, "salary_prediction_model.pkl")
print("Model saved!")


Model saved!


In [None]:
from google.colab import files
files.download("salary_prediction_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>