In [None]:
# Importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('/content/drive/MyDrive/ST4035_DataScience/Group Project/v4_Latest_Data_Science_Salaries.csv')
data.head()

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,AI Scientist,Full-Time,Senior,Expert,60000,Euro,Germany,64781,Germany,Large,2023
1,Data Engineer,Full-Time,Mid,Intermediate,160000,United States Dollar,United States,160000,United States,Medium,2023
2,Data Engineer,Full-Time,Mid,Intermediate,140000,United States Dollar,United States,140000,United States,Medium,2023
3,Data Engineer,Full-Time,Mid,Intermediate,139152,United States Dollar,United States,139152,United States,Large,2023
4,Data Engineer,Full-Time,Mid,Intermediate,82452,United States Dollar,United States,82452,United States,Large,2023


There isn't a globally standardized range for categorizing salaries into specific ranges, as salary levels can vary significantly based on factors such as country, industry, job role, and cost of living. What might be considered a high salary in one region could be quite different in another due to differences in economic conditions and purchasing power.

However, some organizations and agencies do provide general guidelines or salary benchmarks based on global or regional data. For example, the World Bank might provide income categories based on countries' gross national income (GNI) per capita, but these categories are often used for economic classification rather than precise salary ranges.

If you're looking for a standardized approach, you might consider using percentiles or quartiles based on your specific dataset. This way, you can categorize salaries relative to the distribution within your data, which could offer more meaningful insights for your analysis. Alternatively, you could consult salary surveys or reports specific to your industry or region to get a sense of typical salary ranges for different roles and experience levels.


In [None]:
data.isnull().sum()


Job Title             0
Employment Type       0
Experience Level      0
Expertise Level       0
Salary                0
Salary Currency       0
Company Location      0
Salary in USD         0
Employee Residence    0
Company Size          0
Year                  0
dtype: int64

In [None]:
data.duplicated().sum()

0

In [None]:
df = data.copy()

In [None]:
# # Define bin edges and category labels
# bin_edges = [0, 80000, 120000, 160000, 200000, float('inf')]
# category_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

# # Create a new categorical variable based on Salary in USD
# salary_categories = pd.cut(df["Salary in USD"], bins=bin_edges, labels=category_labels, right=False)

# # Convert to an ordinal categorical data type
# salary_categories = pd.Categorical(salary_categories, categories=category_labels, ordered=True)

# # Add the ordinal categorical variable to the DataFrame
# df["Salary Category"] = salary_categories

# # Get frequency counts of each category
# category_counts = df["Salary Category"].value_counts()

# # Display the frequency counts
# print(category_counts)

# # Display the first few rows of the DataFrame with the new category column
# df.head()


In [None]:
# # Create a pie chart to visualize the distribution of ordinal salary categories
# plt.figure(figsize=(8, 8))
# plt.pie(category_counts, labels=category_counts.index, autopct="%1.1f%%", startangle=140, colors=sns.color_palette("Set3", len(category_counts)))
# plt.title("Distribution of Ordinal Salary Categories")
# plt.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.

# plt.show()

# # Display the first few rows of the DataFrame with the new category column
# df.head()

In [None]:
df['Employment Type'].value_counts()

Full-Time    3641
Contract       17
Part-Time      14
Freelance      11
Name: Employment Type, dtype: int64

In [None]:
df['Job Title'].value_counts()

Data Engineer                    771
Data Scientist                   697
Data Analyst                     501
Machine Learning Engineer        333
Analytics Engineer               154
                                ... 
Data DevOps Engineer               1
Data Engineer 2                    1
Analytics Engineering Manager      1
Sales Data Analyst                 1
Data Quality Engineer              1
Name: Job Title, Length: 116, dtype: int64

In [None]:
df['Expertise Level'].value_counts()

Expert          2318
Intermediate     887
Junior           316
Director         162
Name: Expertise Level, dtype: int64

In [None]:
df['Company Location'].value_counts()

United States         2791
United Kingdom         273
Canada                 135
Germany                 69
Spain                   52
                      ... 
Ecuador                  1
Andorra                  1
Korea, Republic of       1
Qatar                    1
Malta                    1
Name: Company Location, Length: 72, dtype: int64

In [None]:
df['Employee Residence'].value_counts()

United States     2746
United Kingdom     267
Canada             132
Germany             63
India               59
                  ... 
Cyprus               1
Kuwait               1
Georgia              1
South Africa         1
Malta                1
Name: Employee Residence, Length: 84, dtype: int64

In [None]:
df['Company Size'].value_counts()

Medium    3062
Large      467
Small      154
Name: Company Size, dtype: int64

In [None]:
df['Year'].value_counts()

2023    2376
2022    1019
2021     215
2020      73
Name: Year, dtype: int64

In [None]:
### Creating New Varaible "is_expatriate"

df['is_expatriate'] = df['Company Location'] != df['Employee Residence']
df['is_expatriate'] = df['is_expatriate'].map({True: 'Yes', False: 'No'})

In [None]:
df['is_expatriate'].value_counts()

No     3573
Yes     110
Name: is_expatriate, dtype: int64

In [None]:
### Lumping Low-frequency categroies

## Employee Residence

frequency_threshold = 50  # Adjust this threshold as needed

residence_counts = df['Employee Residence'].value_counts()

# Identify categories with low frequency
low_frequency_categories = residence_counts[residence_counts < frequency_threshold].index

# Create a new category name for low-frequency categories
new_category_name = 'Other'  # Adjust the new category name as desired

# Group low-frequency categories into the new category
df.loc[df['Employee Residence'].isin(low_frequency_categories), 'Employee Residence'] = new_category_name

# Display the first few rows of the updated DataFrame
print(df.head())

       Job Title Employment Type Experience Level Expertise Level  Salary  \
0   AI Scientist       Full-Time           Senior          Expert   60000   
1  Data Engineer       Full-Time              Mid    Intermediate  160000   
2  Data Engineer       Full-Time              Mid    Intermediate  140000   
3  Data Engineer       Full-Time              Mid    Intermediate  139152   
4  Data Engineer       Full-Time              Mid    Intermediate   82452   

        Salary Currency Company Location  Salary in USD Employee Residence  \
0                  Euro          Germany          64781            Germany   
1  United States Dollar    United States         160000      United States   
2  United States Dollar    United States         140000      United States   
3  United States Dollar    United States         139152      United States   
4  United States Dollar    United States          82452      United States   

  Company Size  Year is_expatriate  
0        Large  2023           

In [None]:
df['Employee Residence'].value_counts()

United States     2746
Other              361
United Kingdom     267
Canada             132
Germany             63
India               59
Spain               55
Name: Employee Residence, dtype: int64

In [None]:
## Company Location

frequency_threshold = 50  # Adjust this threshold as needed

comlocation_counts = df['Company Location'].value_counts()

# Identify categories with low frequency
low_frequency_categories = comlocation_counts[comlocation_counts < frequency_threshold].index

# Create a new category name for low-frequency categories
new_category_name = 'Other'  # Adjust the new category name as desired

# Group low-frequency categories into the new category
df.loc[df['Company Location'].isin(low_frequency_categories), 'Company Location'] = new_category_name

# Display the first few rows of the updated DataFrame
print(df.head())

       Job Title Employment Type Experience Level Expertise Level  Salary  \
0   AI Scientist       Full-Time           Senior          Expert   60000   
1  Data Engineer       Full-Time              Mid    Intermediate  160000   
2  Data Engineer       Full-Time              Mid    Intermediate  140000   
3  Data Engineer       Full-Time              Mid    Intermediate  139152   
4  Data Engineer       Full-Time              Mid    Intermediate   82452   

        Salary Currency Company Location  Salary in USD Employee Residence  \
0                  Euro          Germany          64781            Germany   
1  United States Dollar    United States         160000      United States   
2  United States Dollar    United States         140000      United States   
3  United States Dollar    United States         139152      United States   
4  United States Dollar    United States          82452      United States   

  Company Size  Year is_expatriate  
0        Large  2023           

In [None]:
df['Company Location'].value_counts()

United States     2791
Other              363
United Kingdom     273
Canada             135
Germany             69
Spain               52
Name: Company Location, dtype: int64

In [None]:
## Job Title

frequency_threshold = 50  # Adjust this threshold as needed

job_counts = df['Job Title'].value_counts()

# Identify categories with low frequency
low_frequency_categories = job_counts[job_counts < frequency_threshold].index

# Create a new category name for low-frequency categories
new_category_name = 'Other'  # Adjust the new category name as desired

# Group low-frequency categories into the new category
df.loc[df['Job Title'].isin(low_frequency_categories), 'Job Title'] = new_category_name

# Display the first few rows of the updated DataFrame
print(df.head())

       Job Title Employment Type Experience Level Expertise Level  Salary  \
0          Other       Full-Time           Senior          Expert   60000   
1  Data Engineer       Full-Time              Mid    Intermediate  160000   
2  Data Engineer       Full-Time              Mid    Intermediate  140000   
3  Data Engineer       Full-Time              Mid    Intermediate  139152   
4  Data Engineer       Full-Time              Mid    Intermediate   82452   

        Salary Currency Company Location  Salary in USD Employee Residence  \
0                  Euro          Germany          64781            Germany   
1  United States Dollar    United States         160000      United States   
2  United States Dollar    United States         140000      United States   
3  United States Dollar    United States         139152      United States   
4  United States Dollar    United States          82452      United States   

  Company Size  Year is_expatriate  
0        Large  2023           

In [None]:
df['Job Title'].value_counts()

Data Engineer                768
Other                        742
Data Scientist               693
Data Analyst                 496
Machine Learning Engineer    330
Analytics Engineer           154
Research Scientist           115
Data Architect                89
Research Engineer             69
ML Engineer                   65
Data Science Manager          61
Applied Scientist             59
Name: Job Title, dtype: int64

In [None]:
df.head(10)

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year,is_expatriate
0,Other,Full-Time,Senior,Expert,60000,Euro,Germany,64781,Germany,Large,2023,No
1,Data Engineer,Full-Time,Mid,Intermediate,160000,United States Dollar,United States,160000,United States,Medium,2023,No
2,Data Engineer,Full-Time,Mid,Intermediate,140000,United States Dollar,United States,140000,United States,Medium,2023,No
3,Data Engineer,Full-Time,Mid,Intermediate,139152,United States Dollar,United States,139152,United States,Large,2023,No
4,Data Engineer,Full-Time,Mid,Intermediate,82452,United States Dollar,United States,82452,United States,Large,2023,No
5,Other,Full-Time,Senior,Expert,204500,United States Dollar,United States,204500,United States,Medium,2023,No
6,Other,Full-Time,Senior,Expert,142200,United States Dollar,United States,142200,United States,Medium,2023,No
7,Other,Full-Time,Mid,Intermediate,85000,United States Dollar,United States,85000,United States,Medium,2023,No
8,Other,Full-Time,Mid,Intermediate,65000,United States Dollar,United States,65000,United States,Medium,2023,No
9,Data Analyst,Full-Time,Senior,Expert,139000,United States Dollar,United States,139000,United States,Medium,2023,No


In [None]:
df = df[df['Employment Type'] == 'Full-Time']
df = df.drop('Employment Type', axis=1)

In [None]:
## prepare data

X = df.drop(['Salary', 'Salary Currency','Salary in USD'], axis=1)
y = df['Salary in USD']

In [None]:
X

Unnamed: 0,Job Title,Experience Level,Expertise Level,Company Location,Employee Residence,Company Size,Year,is_expatriate
0,Other,Senior,Expert,Germany,Germany,Large,2023,No
1,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
2,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
3,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
4,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
...,...,...,...,...,...,...,...,...
3677,Other,Senior,Expert,United States,United States,Large,2021,No
3678,Data Scientist,Senior,Expert,United States,United States,Large,2020,No
3679,Other,Mid,Intermediate,United States,United States,Large,2021,No
3680,Data Scientist,Entry,Junior,United States,United States,Small,2020,No


In [None]:
y

0        64781
1       160000
2       140000
3       139152
4        82452
         ...  
3677    165000
3678    412000
3679    151000
3680    105000
3682     94665
Name: Salary in USD, Length: 3641, dtype: int64

In [None]:
X['Year'].value_counts()

2023    2363
2022    1005
2021     205
2020      68
Name: Year, dtype: int64

In [None]:
expertise_cat = ['Junior', 'Intermediate', 'Expert', 'Director']
X['Expertise Level'] = pd.Categorical(X['Expertise Level'], categories=expertise_cat, ordered=True)

In [None]:
experience_cat = ['Entry', 'Executive', 'Mid', 'Senior']
X['Experience Level'] = pd.Categorical(X['Experience Level'], categories=experience_cat, ordered=True)

In [None]:
company_cat = ['Small', 'Medium', 'Large']
X['Company Size'] = pd.Categorical(X['Company Size'], categories=company_cat, ordered=True)

In [None]:
year_cat = [2020, 2021, 2022, 2023]
X['Year'] = pd.Categorical(X['Year'], categories=year_cat, ordered=True)

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3641 entries, 0 to 3682
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Job Title           3641 non-null   object  
 1   Experience Level    3641 non-null   category
 2   Expertise Level     3641 non-null   category
 3   Company Location    3641 non-null   object  
 4   Employee Residence  3641 non-null   object  
 5   Company Size        3641 non-null   category
 6   Year                3641 non-null   category
 7   is_expatriate       3641 non-null   object  
dtypes: category(4), object(4)
memory usage: 157.2+ KB


In [None]:
X['Expertise Level'].value_counts()

Expert          2310
Intermediate     869
Junior           301
Director         161
Name: Expertise Level, dtype: int64

In [None]:
X

Unnamed: 0,Job Title,Experience Level,Expertise Level,Company Location,Employee Residence,Company Size,Year,is_expatriate
0,Other,Senior,Expert,Germany,Germany,Large,2023,No
1,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
2,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
3,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
4,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
...,...,...,...,...,...,...,...,...
3677,Other,Senior,Expert,United States,United States,Large,2021,No
3678,Data Scientist,Senior,Expert,United States,United States,Large,2020,No
3679,Other,Mid,Intermediate,United States,United States,Large,2021,No
3680,Data Scientist,Entry,Junior,United States,United States,Small,2020,No


In [None]:
X['Job Title'] = X['Job Title'].astype('object')

In [None]:
X['Company Location'] = X['Company Location'].astype('object')

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3641 entries, 0 to 3682
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Job Title           3641 non-null   object  
 1   Experience Level    3641 non-null   category
 2   Expertise Level     3641 non-null   category
 3   Company Location    3641 non-null   object  
 4   Employee Residence  3641 non-null   object  
 5   Company Size        3641 non-null   category
 6   Year                3641 non-null   category
 7   is_expatriate       3641 non-null   object  
dtypes: category(4), object(4)
memory usage: 157.2+ KB


In [None]:
X

Unnamed: 0,Job Title,Experience Level,Expertise Level,Company Location,Employee Residence,Company Size,Year,is_expatriate
0,Other,Senior,Expert,Germany,Germany,Large,2023,No
1,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
2,Data Engineer,Mid,Intermediate,United States,United States,Medium,2023,No
3,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
4,Data Engineer,Mid,Intermediate,United States,United States,Large,2023,No
...,...,...,...,...,...,...,...,...
3677,Other,Senior,Expert,United States,United States,Large,2021,No
3678,Data Scientist,Senior,Expert,United States,United States,Large,2020,No
3679,Other,Mid,Intermediate,United States,United States,Large,2021,No
3680,Data Scientist,Entry,Junior,United States,United States,Small,2020,No


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

column_trans = make_column_transformer(
    (OneHotEncoder(drop='first'), ['Job Title', 'Experience Level',
       'Expertise Level', 'Company Location','Company Location', 'Year']),
#     (OrdinalEncoder(categories=[exp_cat]), ['experience_level']),
    remainder='drop')

std_scale = StandardScaler(with_mean=False)

preprocess_pipe = make_pipeline(column_trans, std_scale)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_lin = column_trans.fit_transform(X)
X_train_lin = column_trans.fit_transform(X_train)
X_test_lin = column_trans.transform(X_test)

# models
lin_reg = LinearRegression()
ridge = Ridge(alpha=0.01, max_iter=10000)



In [None]:
# Value Scaling (Normalization)
X_scale = preprocess_pipe.fit_transform(X)
y_scale = std_scale.fit_transform(y.values.reshape(-1,1))

# X_train data pipeline (OneHotEncode + StdScale)
X_train_scale = preprocess_pipe.fit_transform(X_train)
X_test_scale = preprocess_pipe.transform(X_test)

# y_train data pipeline (OneHotEncode + StdScale)
y_train_scale = std_scale.fit_transform(y_train.values.reshape(-1,1))
y_test_scale = std_scale.transform(y_test.values.reshape(-1,1))

In [None]:
def evaluate(model, train_features, train_labels, test_features, test_labels, **kwargs):
    train_pred = model.predict(train_features)
    test_pred = model.predict(test_features)
    r2_train = r2_score(train_labels, train_pred)
    r2_test = r2_score(test_labels, test_pred)
    print(f'{model} Model Performance')
    print('R2 Score of Training = {:0.6f}.'.format(r2_train))
    print('R2 Score of Validation = {:0.6f}.'.format(r2_test))
    if kwargs.get('X') is not None:
        r2_cv = cross_val_score(model, X=kwargs.get('X'), y=kwargs.get('y'), cv=5, scoring='r2').mean()
        print('5-folds CV Score = {:0.6f}.'.format(r2_cv))
    return r2_train, r2_test, r2_cv

In [None]:
# To not sacrifice interpretability, we will use the unscaled y (target) variable here
lin_reg.fit(X_train_lin, y_train)
evaluate(lin_reg, X_train_lin, y_train, X_test_lin, y_test, X=X_lin, y=y)

print('------------------------------------------------------')

ridge.fit(X_train_scale, y_train)
evaluate(ridge, X_train_scale, y_train, X_test_scale, y_test, X=X_scale, y=y)
print('')

LinearRegression() Model Performance
R2 Score of Training = 0.376354.
R2 Score of Validation = 0.355072.
5-folds CV Score = 0.325857.
------------------------------------------------------
Ridge(alpha=0.01, max_iter=10000) Model Performance
R2 Score of Training = 0.376354.
R2 Score of Validation = 0.355070.
5-folds CV Score = 0.325862.



In [None]:
lasso.fit(X_train_scale, y_train)
evaluate(lasso, X_train_scale, y_train, X_test_scale, y_test, X=X_scale, y=y)

Lasso(alpha=100, max_iter=10000) Model Performance
R2 Score of Training = 0.376295.
R2 Score of Validation = 0.355167.
5-folds CV Score = 0.327187.


(0.3762947554187227, 0.3551671884870401, 0.3271872476775467)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Create the random grid for RandomForest
random_grid_rf = {'n_estimators': [int(x) for x in np.linspace(500, 1000, 50)],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [int(x) for x in np.linspace(1, 20, 20)],
               'min_samples_split': [2, 3, 4, 5, 6],
               'min_samples_leaf': [2, 4, 6, 8, 10]}


# Create the random grid for GradientBoosting
random_grid_gb = {'n_estimators': [int(x) for x in np.linspace(1, 100, 20)],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [int(x) for x in np.linspace(1, 10, 10)],
               'min_samples_split': [int(x) for x in np.linspace(3, 300, 30)],
               'min_samples_leaf': [int(x) for x in np.linspace(3, 30, 30)],
               'learning_rate':[0.01, 0.1],
               'alpha': [0.3, 0.6, 0.9]}

# Construct base models
rf_base = RandomForestRegressor(random_state = 42)
gb_base = GradientBoostingRegressor(random_state = 42)

rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = random_grid_rf, n_iter = 500, cv = 3, random_state=42, n_jobs = -1)
gb_random = RandomizedSearchCV(estimator = gb_base, param_distributions = random_grid_gb, n_iter = 500, cv = 3, random_state=42, n_jobs = -1)

# Models optimized using BestForest/BestGB method (empirical visualizations)
rf = RandomForestRegressor(max_features='sqrt', min_samples_leaf=3, random_state = 42)
gb = GradientBoostingRegressor(n_estimators=90, max_features='log2', max_depth=4, min_samples_leaf=10, min_samples_split=220, random_state = 42)


In [None]:
rf_base.fit(X_train_scale, y_train_scale.ravel())
RF_Base = evaluate(rf_base, X_train_scale, y_train_scale, X_test_scale, y_test_scale, X=X_scale, y=y)
gb_base.fit(X_train_scale, y_train_scale.ravel())
GB_Base = evaluate(gb_base, X_train_scale, y_train_scale, X_test_scale, y_test_scale, X=X_scale, y=y)


rf.fit(X_train_scale, y_train_scale.ravel())
RF_Tuned = evaluate(rf, X_train_scale, y_train_scale, X_test_scale, y_test_scale, X=X_scale, y=y)
gb.fit(X_train_scale, y_train_scale.ravel())
GB_Tuned = evaluate(gb, X_train_scale, y_train_scale, X_test_scale, y_test_scale, X=X_scale, y=y)
print('')

RandomForestRegressor(random_state=42) Model Performance
R2 Score of Training = 0.445110.
R2 Score of Validation = 0.334131.
5-folds CV Score = 0.268193.
GradientBoostingRegressor(random_state=42) Model Performance
R2 Score of Training = 0.405109.
R2 Score of Validation = 0.346506.
5-folds CV Score = 0.319491.
RandomForestRegressor(max_features='sqrt', min_samples_leaf=3, random_state=42) Model Performance
R2 Score of Training = 0.409057.
R2 Score of Validation = 0.348809.
5-folds CV Score = 0.314649.
GradientBoostingRegressor(max_depth=4, max_features='log2', min_samples_leaf=10,
                          min_samples_split=220, n_estimators=90,
                          random_state=42) Model Performance
R2 Score of Training = 0.390687.
R2 Score of Validation = 0.359009.
5-folds CV Score = 0.333254.



In [None]:
rf_random.fit(X_train_scale, y_train_scale.ravel())
gb_random.fit(X_train_scale, y_train_scale.ravel())
print('')




In [None]:
best_params = rf_random.best_params_
best_params

{'n_estimators': 500,
 'min_samples_split': 3,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 14}

In [None]:
best_params2 = gb_random.best_params_
best_params2

{'n_estimators': 84,
 'min_samples_split': 259,
 'min_samples_leaf': 12,
 'max_features': 'log2',
 'max_depth': 4,
 'learning_rate': 0.1,
 'alpha': 0.9}

In [None]:
# Create a new Random Forest model using the best parameters
best_rf_model = RandomForestRegressor(**best_params)

# Fit the best model on the scaled training data
best_rf_model.fit(X_train_scale, y_train_scale)

# Predict on the scaled test data
y_pred = best_rf_model.predict(X_test_scale)

# Calculate R-squared
r2 = r2_score(y_test_scale, y_pred)
print(f"R-squared: {r2}")

  best_rf_model.fit(X_train_scale, y_train_scale)


R-squared: 0.34940075248866365


In [None]:
# Create a new Random Forest model using the best parameters
best_gb_model = GradientBoostingRegressor(**best_params)

# Fit the best model on the scaled training data
best_gb_model.fit(X_train_scale, y_train_scale)

# Predict on the scaled test data
y_pred2 = best_gb_model.predict(X_test_scale)

# Calculate R-squared
r2 = r2_score(y_test_scale, y_pred2)
print(f"R-squared: {r2}")

  y = column_or_1d(y, warn=True)


R-squared: 0.3192670700169751


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [None]:
lasso = Lasso(alpha=100, max_iter=10000)

knr = KNeighborsRegressor(n_neighbors=15, metric='euclidean')

lasso.fit(X_train_scale, y_train)
knr.fit(X_train_scale, y_train_scale.ravel())


In [None]:
vot_reg = VotingRegressor(estimators=[
    ('rf', rf),
    ('gb', gb),
    ('lasso', lasso),
    ('kn', knr)])

vot_reg = vot_reg.fit(X_train_scale, y_train_scale.ravel())

In [None]:
voting_accuracy = evaluate(vot_reg, X_train_scale, y_train_scale.ravel(), X_test_scale, y_test_scale.ravel(), X=X_scale, y=y)

VotingRegressor(estimators=[('rf',
                             RandomForestRegressor(max_features='sqrt',
                                                   min_samples_leaf=3,
                                                   random_state=42)),
                            ('gb',
                             GradientBoostingRegressor(max_depth=4,
                                                       max_features='log2',
                                                       min_samples_leaf=10,
                                                       min_samples_split=220,
                                                       n_estimators=90,
                                                       random_state=42)),
                            ('lasso', Lasso(alpha=100, max_iter=10000)),
                            ('kn',
                             KNeighborsRegressor(metric='euclidean',
                                                 n_neighbors=15))]) Model Performance
R2 Score o

In [None]:
# StackingRegressor
estimators = [('rf', rf),
              ('gb', gb),
              ('lasso', lasso),
              ('knr', knr)]

final_estimator = lin_reg
stk_reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator)
stk_reg.fit(X_train_scale, y_train_scale.ravel())
print('')




In [None]:
stk_accuracy = evaluate(stk_reg, X_train_scale, y_train_scale.ravel(), X_test_scale, y_test_scale.ravel(), X=X_scale, y=y)

StackingRegressor(estimators=[('rf',
                               RandomForestRegressor(max_features='sqrt',
                                                     min_samples_leaf=3,
                                                     random_state=42)),
                              ('gb',
                               GradientBoostingRegressor(max_depth=4,
                                                         max_features='log2',
                                                         min_samples_leaf=10,
                                                         min_samples_split=220,
                                                         n_estimators=90,
                                                         random_state=42)),
                              ('lasso', Lasso(alpha=100, max_iter=10000)),
                              ('knr',
                               KNeighborsRegressor(metric='euclidean',
                                                   n_neighbors=15))

In [None]:
pip install pandas scikit-learn tensorflow

In [None]:
X_test

Unnamed: 0,Job Title,Experience Level,Expertise Level,Company Location,Salary in USD,Employee Residence,Company Size,Year,is_expatriate
418,2,2,2,5,96750,United States,Medium,2023,No
2949,4,3,1,5,178800,United States,Large,2022,No
1855,8,3,1,5,181000,United States,Medium,2023,No
301,2,3,1,5,127876,United States,Medium,2023,No
1072,9,3,1,5,129000,United States,Medium,2023,No
...,...,...,...,...,...,...,...,...,...
346,8,3,1,5,267720,United States,Medium,2023,No
1594,5,2,2,5,95000,United States,Medium,2023,No
3179,9,3,1,5,221300,United States,Large,2022,No
681,6,3,1,3,71259,Spain,Medium,2023,No


In [None]:
# Initialize LabelEncoder

from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in X_train.columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

In [None]:
X_train

Unnamed: 0,Job Title,Experience Level,Expertise Level,Company Location,Employee Residence,Company Size,Year,is_expatriate
2878,4,3,1,5,6,0,2,0
3007,0,3,1,5,6,1,2,0
2300,9,3,1,5,6,1,3,0
1405,3,3,1,4,5,1,3,0
3424,7,2,2,2,3,2,1,0
...,...,...,...,...,...,...,...,...
1134,11,3,1,5,6,1,3,0
1301,4,3,1,5,6,1,3,0
863,4,3,1,5,6,1,3,0
3541,2,2,2,5,6,0,1,0


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, concatenate
from tensorflow.keras.models import Model

# Define the input layers
input_layers = []
encoded_layers = []

for col in X_train.columns:
    input_layer = Input(shape=(1,), name=col)
    input_layers.append(input_layer)

    num_categories = X_train[col].nunique()
    embed_size = 4  # You can adjust this as needed
    embedding_layer = Embedding(num_categories, embed_size)(input_layer)
    encoded_layers.append(Flatten()(embedding_layer))

# Concatenate all encoded layers
merged_layer = concatenate(encoded_layers)

# Define the model
model = Model(inputs=input_layers, outputs=merged_layer)


In [None]:
# Compile the model
model.compile(optimizer='adam', loss='mse')  # Choose an appropriate loss function

# Convert categorical columns to lists of arrays
X_train_encoded = [X_train[col].values for col in X_train.columns]

# Fit the model
model.fit(X_train_encoded, y_train, epochs=10, batch_size=32)  # Adjust epochs and batch_size as needed


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d2a57ba7460>

In [None]:
# Convert categorical columns of test data to lists of arrays
X_test_encoded = [X_test[col].values for col in X_test.columns]

# Get the transformed features from the embedding model
X_train_embedded = model.predict(X_train_encoded)
X_test_embedded = model.predict(X_test_encoded)

# Now you can use X_train_embedded and X_test_embedded for your machine learning algorithms




In [None]:
X_train_encoded

[array([4, 0, 9, ..., 4, 2, 9]),
 array([3, 3, 3, ..., 3, 2, 2]),
 array([1, 1, 1, ..., 1, 2, 2]),
 array([5, 5, 5, ..., 5, 5, 5]),
 array([6, 6, 6, ..., 6, 6, 6]),
 array([0, 1, 1, ..., 1, 0, 1]),
 array([2, 2, 3, ..., 3, 1, 2]),
 array([0, 0, 0, ..., 0, 0, 0])]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Initialize and train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust parameters as needed
rf_model.fit(X_train_encoded, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test_encoded)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3641 entries, 0 to 3682
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           3641 non-null   object
 1   Experience Level    3641 non-null   object
 2   Expertise Level     3641 non-null   object
 3   Salary              3641 non-null   int64 
 4   Salary Currency     3641 non-null   object
 5   Company Location    3641 non-null   object
 6   Salary in USD       3641 non-null   int64 
 7   Employee Residence  3641 non-null   object
 8   Company Size        3641 non-null   object
 9   Year                3641 non-null   int64 
 10  is_expatriate       3641 non-null   object
dtypes: int64(3), object(8)
memory usage: 341.3+ KB


In [None]:
### ----- Auto Encoder ----- ####
#Ref - https://towardsdatascience.com/introduction-to-autoencoders-7a47cf4ef14b

In [None]:
### ANN

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import load_model


# AutoEncoder Model Preparation
n_inputs = X_train_scale.shape[1]
# define encoder
input_data_shape= Input(shape=(n_inputs,))
# encoder level 1
encoder= Dense(n_inputs*2)(input_data_shape)
encoder = BatchNormalization()(encoder)
encoder= LeakyReLU()(encoder)
# encoder level 2
encoder= Dense(n_inputs)(encoder)
encoder= BatchNormalization()(encoder)
encoder= LeakyReLU()(encoder)
# bottleneck
n_bottleneck = round(float(n_inputs) / 2.0)
bottleneck = Dense(n_bottleneck)(encoder)
# define decoder, level 1
decoder = Dense(n_inputs)(bottleneck)
decoder = BatchNormalization()(decoder)
decoder = LeakyReLU()(decoder)
# decoder level 2
decoder = Dense(n_inputs*2)(decoder)
decoder = BatchNormalization()(decoder)
decoder = LeakyReLU()(decoder)

In [None]:
# output layer
output = Dense(n_inputs, activation='linear')(decoder)
# define autoencoder model
model = Model(inputs=input_data_shape, outputs=output)
# compile autoencoder model
model.compile(optimizer='adam', loss='mse')

In [None]:
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 30)]              0         
                                                                 
 dense_12 (Dense)            (None, 60)                1860      
                                                                 
 batch_normalization_8 (Bat  (None, 60)                240       
 chNormalization)                                                
                                                                 
 leaky_re_lu_8 (LeakyReLU)   (None, 60)                0         
                                                                 
 dense_13 (Dense)            (None, 30)                1830      
                                                                 
 batch_normalization_9 (Bat  (None, 30)                120       
 chNormalization)                                          

In [None]:
# fit the autoencoder model to reconstruct input
history = model.fit(X_train_scale, y_train_scale, epochs=50, batch_size=16, verbose=2, validation_data=(X_test_scale,y_test_scale))

Epoch 1/50
182/182 - 6s - loss: 1.9912 - val_loss: 0.8958 - 6s/epoch - 33ms/step
Epoch 2/50
182/182 - 1s - loss: 0.7508 - val_loss: 0.6703 - 973ms/epoch - 5ms/step
Epoch 3/50
182/182 - 1s - loss: 0.7097 - val_loss: 0.6346 - 1s/epoch - 6ms/step
Epoch 4/50
182/182 - 1s - loss: 0.7003 - val_loss: 0.6375 - 1s/epoch - 6ms/step
Epoch 5/50
182/182 - 1s - loss: 0.6856 - val_loss: 0.6264 - 1s/epoch - 6ms/step
Epoch 6/50
182/182 - 1s - loss: 0.6885 - val_loss: 0.6231 - 754ms/epoch - 4ms/step
Epoch 7/50
182/182 - 1s - loss: 0.6811 - val_loss: 0.6338 - 641ms/epoch - 4ms/step
Epoch 8/50
182/182 - 1s - loss: 0.6790 - val_loss: 0.6107 - 600ms/epoch - 3ms/step
Epoch 9/50
182/182 - 1s - loss: 0.6781 - val_loss: 0.6246 - 601ms/epoch - 3ms/step
Epoch 10/50
182/182 - 1s - loss: 0.6667 - val_loss: 0.6104 - 589ms/epoch - 3ms/step
Epoch 11/50
182/182 - 1s - loss: 0.6614 - val_loss: 0.6064 - 664ms/epoch - 4ms/step
Epoch 12/50
182/182 - 1s - loss: 0.6620 - val_loss: 0.6130 - 659ms/epoch - 4ms/step
Epoch 13/50


In [None]:
# define an encoder model (without the decoder)
encoder = Model(inputs=input_data_shape, outputs=bottleneck)
# save the encoder to file
encoder.save('encoder.h5')

  saving_api.save_model(


In [None]:
# load the model from file
encoder = load_model('encoder.h5')

# encode the train data
X_train_encode = encoder.predict(X_train_scale)
# encode the test data
X_test_encode = encoder.predict(X_test_scale)





In [None]:

# importing the libraries
from keras.models import Sequential
from keras.layers import Dense

# create ANN model
model = Sequential()

# Defining the Input layer and FIRST hidden layer, both are same!
model.add(Dense(units=5, input_dim=15, kernel_initializer='normal', activation='relu'))

# Defining the Second layer of the model
# after the first layer we don't have to specify input_dim as keras configure it automatically
model.add(Dense(units=5, kernel_initializer='normal', activation='tanh'))

# The output neuron is a single fully connected node
# Since we will be predicting a single number
model.add(Dense(1, kernel_initializer='normal'))

# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Fitting the ANN to the Training set
model.fit(X_train_encode, y_train_scale ,batch_size = 20, epochs = 50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7d29b5b8c9d0>

In [None]:
y_pred=model.predict(X_test_encode)



In [None]:
from sklearn.metrics import accuracy_score

# Calculate accuracy using categorical labels
r2 = r2_score(y_test_scale, y_pred)
r2

0.3480330614846757