#### Essential Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

#### Load the DataFrame

In [2]:
# Read the Kaggle salary dataset into `df` and display the resulting frame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/salary-by-job-title-and-country/Salary.csv"
)
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Senior
count,6684.0,6684.0,6684.0,6684.0,6684.0
mean,33.610563,1.622382,8.077723,115307.175194,0.143477
std,7.595994,0.880474,6.029305,52806.810881,0.350585
min,21.0,0.0,0.0,350.0,0.0
25%,28.0,1.0,3.0,70000.0,0.0
50%,32.0,1.0,7.0,115000.0,0.0
75%,38.0,2.0,12.0,160000.0,0.0
max,62.0,3.0,34.0,250000.0,1.0


In [4]:
# Column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6684 entries, 0 to 6683
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6684 non-null   float64
 1   Gender               6684 non-null   object 
 2   Education Level      6684 non-null   int64  
 3   Job Title            6684 non-null   object 
 4   Years of Experience  6684 non-null   float64
 5   Salary               6684 non-null   float64
 6   Country              6684 non-null   object 
 7   Race                 6684 non-null   object 
 8   Senior               6684 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 470.1+ KB


In [5]:
# Number of missing values in each column.
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64

#### Explore the Uniqueness of the DataFrame

In [6]:
# Distinct values recorded in the 'Race' column.
df.loc[:, 'Race'].unique()

array(['White', 'Hispanic', 'Asian', 'Korean', 'Chinese', 'Australian',
       'Welsh', 'African American', 'Mixed', 'Black'], dtype=object)

In [7]:
# Distinct values recorded in the 'Job Title' column.
df.loc[:, 'Job Title'].unique()

array(['Software Engineer', 'Data Analyst', 'Manager', 'Sales Associate',
       'Director', 'Marketing Analyst', 'Product Manager',
       'Sales Manager', 'Marketing Coordinator', 'Scientist',
       'Software Developer', 'HR Manager', 'Financial Analyst',
       'Project Manager', 'Customer Service Rep', 'Operations Manager',
       'Marketing Manager', 'Engineer', 'Data Entry Clerk',
       'Sales Director', 'Business Analyst', 'VP of Operations',
       'IT Support', 'Recruiter', 'Financial Manager',
       'Social Media Specialist', 'Software Manager', 'Developer',
       'Consultant', 'Product Designer', 'CEO', 'Accountant',
       'Data Scientist', 'Marketing Specialist', 'Technical Writer',
       'HR Generalist', 'Project Engineer', 'Customer Success Rep',
       'Sales Executive', 'UX Designer', 'Operations Director',
       'Network Engineer', 'Administrative Assistant',
       'Strategy Consultant', 'Copywriter', 'Account Manager',
       'Director of Marketing', 'Help Des

In [8]:
# Distinct values recorded in the 'Country' column.
df.loc[:, 'Country'].unique()

array(['UK', 'USA', 'Canada', 'China', 'Australia'], dtype=object)

In [9]:
# Distinct values recorded in the 'Salary' column.
df.loc[:, 'Salary'].unique()

array([ 90000.,  65000., 150000.,  60000., 200000.,  55000., 120000.,
        80000.,  45000., 110000.,  75000., 140000., 130000.,  40000.,
       125000., 115000.,  35000., 180000., 190000.,  50000., 250000.,
       170000., 160000.,  85000.,  95000., 105000.,  70000., 100000.,
        30000., 135000., 220000., 175000., 185000., 145000., 155000.,
          350., 195000., 198000., 196000., 193000.,  92000., 165000.,
       162000., 197000., 142000., 182000., 210000.,    550., 122485.,
       169159., 187081.,  78354.,  90249., 132720., 161568., 127346.,
       120177., 101332., 121450., 166375., 185119., 149217., 166512.,
       186963.,  75072., 163398., 103947., 179180., 175966., 190004.,
       152039.,  76742., 191790., 139398.,  95845., 160976., 126753.,
       139817., 181714., 114776., 105725.,  52731., 106492.,  73895.,
       119836.,  99747., 168287., 115920., 128078.,  51265., 165919.,
       188651.,  55538., 193964., 104702., 172955., 138032.,  82683.,
       155414., 1542

In [10]:
# Define industry/sector keywords
# Maps each industry/sector label to the keywords that identify it.
# NOTE: several keywords are listed under more than one industry ('Manager',
# 'Consultant', 'Coordinator', 'Specialist', 'Director', 'Operations').
# classify_job_title returns the first industry that has a matching keyword,
# so the order of the entries below decides where such titles end up
# (e.g. any title containing 'Manager' is assigned to 'Information Technology').
industry_keywords = {
    'Information Technology': ['Software', 'Developer', 'Engineer', 'Manager', 'Architect', 'Consultant'],
    'Business': ['Analyst', 'Manager', 'Consultant', 'Development', 'Coordinator', 'Specialist'],
    'Sales': ['Sales', 'Account', 'Representative', 'Director', 'Operations'],
    'Marketing': ['Marketing', 'Coordinator', 'Specialist', 'Director', 'Content', 'Copywriter'],
    'HR and People': ['HR', 'Human Resources', 'Recruiter', 'Coordinator', 'Specialist'],
    'Finance': ['Financial', 'Accountant', 'Advisor'],
    'Supply Chain and Logistics': ['Supply Chain', 'Logistics', 'Operations', 'Coordinator'],
    'Science and Research': ['Scientist', 'Researcher', 'Data Science'],
    'Customer Service': ['Customer Service', 'Receptionist'],
    'Delivery and Transportation': ['Delivery', 'Driver']
}

In [11]:
# Function to classify job titles
def classify_job_title(title):
    """Return the first industry whose keyword list matches ``title``.

    Industries are tried in the iteration order of ``industry_keywords`` and
    each keyword is matched as a case-insensitive substring of the title. The
    first industry with any matching keyword wins, so a generic keyword in an
    earlier entry takes precedence over a more specific keyword in a later one.

    Parameters
    ----------
    title : str
        Job title to classify. Any non-string value (for example a missing
        value read as NaN) is classified as 'Other' instead of raising.

    Returns
    -------
    str
        The matching industry label, or 'Other' when no keyword matches.
    """
    # Guard against NaN/None: those have no .lower() and would raise.
    if not isinstance(title, str):
        return 'Other'

    # Lower-case the title once rather than once per keyword comparison.
    lowered_title = title.lower()
    for industry, keywords in industry_keywords.items():
        if any(keyword.lower() in lowered_title for keyword in keywords):
            return industry
    return 'Other'

In [12]:
# Label every row with the industry inferred from its job title, then show the frame.
df['Industry/Sector'] = df['Job Title'].map(classify_job_title)
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Industry/Sector
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0,Information Technology
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0,Business
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1,Information Technology
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0,Sales
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0,Sales
...,...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0,Sales
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0,Sales
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0,Information Technology
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0,Information Technology


#### Visualise the Whole DataFrame

In [13]:
import plotly.express as px

# Histogram of the 'Age' column, drawn at 800x600 in a single sky-blue colour.
fig = px.histogram(
    df,
    x='Age',
    width=800,
    height=600,
    color_discrete_sequence=['skyblue'],
)
fig.update_layout(title_text="Age Distribution")
fig.show()


In [14]:
# Salary of every row plotted against race, one colour per race.
fig2 = px.scatter(
    df,
    x='Race',
    y='Salary',
    color='Race',
    title='Race vs. Salary',
)
# Sort the race categories along the x-axis using plotly's 'total ascending' order.
fig2.update_xaxes(categoryorder='total ascending')
fig2.show()

In [15]:
# Histogram of the 'Salary' column, drawn at 800x600 in a single sky-blue colour.
fig3 = px.histogram(
    df,
    x='Salary',
    width=800,
    height=600,
    color_discrete_sequence=['skyblue'],
)
fig3.update_layout(title_text="Salary Distribution")
fig3.show()

In [16]:
# Box plot of the 'Salary' column across the whole dataset.
fig = px.box(
    df,
    y='Salary',
    title='Salary Distribution and Outliers',
)
fig.show()

In [17]:
# A choropleth shows one colour per location, but `df` holds many rows for
# each country. Passing it directly makes each country's colour depend on row
# order, so reduce the data to a single mean salary per country first.
country_salary = df.groupby('Country', as_index=False)['Salary'].mean()

# NOTE(review): 'UK' and 'USA' are abbreviations; confirm that plotly's
# 'country names' location mode resolves them, or map them to full names.
fig = px.choropleth(country_salary,
                    locations='Country',  # Column with country names
                    color='Salary',        # Mean salary of each country
                    locationmode='country names',  # Use country names
                    color_continuous_scale='icefire',  # Choose a color scale
                    labels={'Salary': 'Mean Salary'},  # Colour bar shows the aggregate
                    title='Choropleth Map Using the DataFrame Above')

fig.show()

In [18]:
# Index label of the row holding the maximum salary (looked up once and reused).
top_salary_index = df['Salary'].idxmax()

# Job title, salary and country recorded on that row.
highest_paying_job_title = df.loc[top_salary_index, 'Job Title']
highest_salary = df.loc[top_salary_index, 'Salary']
highest_country = df.loc[top_salary_index, 'Country']

print("Highest Paying Job Title:", highest_paying_job_title)
print("Salary for the Highest Paying Job Title:", highest_salary)
print("Country with the highest paying Job Title:", highest_country)

Highest Paying Job Title: CEO
Salary for the Highest Paying Job Title: 250000.0
Country with the highest paying Job Title: Canada


#### Using Software Engineer as an Example of How to Visualise a Particular Job Role

In [19]:
# Keep only the rows whose job title is exactly 'Software Engineer'.
# Substitute another title here to repeat the plots below for a different role.
software_engineer_df = df.loc[df['Job Title'].eq('Software Engineer')]

# Bar chart of salary against age, with the bars coloured by education level.
fig = px.bar(
    software_engineer_df,
    x='Age',
    y='Salary',
    color='Education Level',
    title='Age vs. Salary for Software Engineers Filtered by Education Levels',
)
fig.show()

In [20]:
# Display the Software Engineer rows.
software_engineer_df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Industry/Sector
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0,Information Technology
77,32.0,Male,2,Software Engineer,6.0,100000.0,USA,African American,1,Information Technology
159,28.0,Male,1,Software Engineer,2.0,40000.0,Australia,White,0,Information Technology
169,34.0,Male,2,Software Engineer,9.0,105000.0,China,Korean,1,Information Technology
213,44.0,Male,1,Software Engineer,14.0,130000.0,China,Chinese,1,Information Technology
...,...,...,...,...,...,...,...,...,...,...
6172,27.0,Male,1,Software Engineer,3.0,80000.0,USA,Asian,0,Information Technology
6186,27.0,Male,1,Software Engineer,3.0,80000.0,China,Korean,0,Information Technology
6200,27.0,Male,1,Software Engineer,3.0,80000.0,Australia,White,0,Information Technology
6214,27.0,Male,1,Software Engineer,3.0,80000.0,Australia,Asian,0,Information Technology


In [21]:
# Bar chart of salary against age for Software Engineers, coloured by race.
fig = px.bar(
    software_engineer_df,
    x='Age',
    y='Salary',
    color='Race',
    title='Age vs. Salary for Software Engineers Filtered by Race',
)
fig.show()

In [22]:
# Scatter plot of salary against age for Software Engineers, coloured by gender.
fig = px.scatter(
    software_engineer_df,
    x='Age',
    y='Salary',
    color='Gender',
    title='Age vs. Salary for Software Engineers Filtered by Gender',
)
fig.show()

In [23]:
# Scatter plot of salary against years of experience for Software Engineers,
# with each point coloured by age.
fig = px.scatter(
    software_engineer_df,
    x="Years of Experience",
    y="Salary",
    color='Age',
    title='Years of Experience vs. Salary for Software Engineers',
)
fig.show()

In [24]:
# Scatter plot of salary against country for Software Engineers,
# with each point coloured by age.
fig = px.scatter(
    software_engineer_df,
    x="Country",
    y="Salary",
    color='Age',
    title='Country vs. Salary for Software Engineers',
)
fig.show()

In [25]:
# 3D scatter for Software Engineers: salary on x, country on y, race on z,
# with each point coloured by salary.
fig = px.scatter_3d(
    software_engineer_df,
    x='Salary',
    y='Country',
    z='Race',
    color='Salary',
    title='3D Scatter Plot of Salary vs Country & Race for Software Engineers ',
)
fig.show()

#### Machine Learning

In [26]:
# Work on an independent deep copy so later changes to `df1` leave `df` untouched.
df1 = df.copy(deep=True)

In [27]:
# Boolean Series, indexed by column name, flagging the object-dtype columns.
categorical_feature_mask = df1.dtypes.eq(object)
# Names of the flagged columns as a plain Python list.
categorical_cols = list(df1.columns[categorical_feature_mask])

In [28]:
# import labelencoder
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
# (a LabelEncoder maps each distinct label of a column to an integer code)
le = LabelEncoder()

In [29]:
# Encode every categorical column with its own LabelEncoder.
# Re-fitting one shared encoder per column would keep only the last column's
# label <-> code mapping; one fitted encoder per column keeps every mapping
# available (label_encoders[col].classes_ / .inverse_transform) for decoding.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Read the labels from `df` (the unencoded frame) so that re-running this cell
# always encodes the original strings rather than already-encoded integers.
df1[categorical_cols] = df[categorical_cols].apply(
    lambda col: label_encoders[col.name].fit_transform(col)
)
df1[categorical_cols].head(10)

Unnamed: 0,Gender,Job Title,Country,Race,Industry/Sector
0,1,112,3,9,5
1,0,24,4,5,0
2,1,72,1,9,5
3,0,100,4,5,8
4,1,34,4,1,8
5,1,73,4,5,0
6,0,88,4,1,5
7,1,103,2,6,5
8,0,74,2,4,0
9,1,106,0,2,9


In [30]:
# Display the encoded frame; the categorical columns now hold integer codes.
df1

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior,Industry/Sector
0,32.0,1,1,112,5.0,90000.0,3,9,0,5
1,28.0,0,2,24,3.0,65000.0,4,5,0,0
2,45.0,1,3,72,15.0,150000.0,1,9,1,5
3,36.0,0,1,100,7.0,60000.0,4,5,0,8
4,52.0,1,2,34,20.0,200000.0,4,1,0,8
...,...,...,...,...,...,...,...,...,...,...
6679,49.0,0,3,42,20.0,200000.0,3,7,0,8
6680,32.0,1,0,100,3.0,50000.0,0,2,0,8
6681,30.0,0,1,51,4.0,55000.0,2,4,0,5
6682,46.0,1,2,76,14.0,140000.0,2,6,0,5


In [31]:
# Predictor columns used by the model; edit this list to explore other feature sets.
features = [
    'Age',
    'Gender',
    'Education Level',
    'Years of Experience',
    'Country',
    'Race',
    'Senior',
    'Industry/Sector',
]

# Predictors and regression target.
X = df1.loc[:, features]
y = df1.loc[:, 'Salary']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [32]:
from sklearn.preprocessing import StandardScaler

# Standardise the features so they share a comparable scale. The scaler is
# fitted on the training split only and then applied to both splits, so no
# statistics from the test rows are used when fitting.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [33]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12
Note: you may need to restart the kernel to use updated packages.


In [34]:
from lazypredict.Supervised import LazyRegressor

In [35]:
# Fit LazyRegressor's collection of regressors on the training split and score
# each one on the test split; `models` is the resulting comparison table.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 42/42 [00:38<00:00,  1.10it/s]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.93,0.93,13799.84,0.59
RandomForestRegressor,0.93,0.93,14361.65,1.08
ExtraTreesRegressor,0.92,0.92,14602.62,0.86
BaggingRegressor,0.92,0.92,14769.35,0.12
LGBMRegressor,0.92,0.92,15197.09,0.73
HistGradientBoostingRegressor,0.92,0.92,15300.89,0.49
DecisionTreeRegressor,0.9,0.9,16781.16,0.11
ExtraTreeRegressor,0.9,0.9,16887.9,0.1
GradientBoostingRegressor,0.87,0.87,18777.62,0.39
KNeighborsRegressor,0.83,0.83,21770.94,0.12


Based on LazyRegressor the best Models to pick are:
* XGBRegressor
* RandomForestRegressor
* BaggingRegressor
* HistGradientBoostingRegressor
* LGBMRegressor

In [36]:
from xgboost import XGBRegressor
# NOTE: despite its name, `classifier` is a regressor (XGBRegressor with default
# settings); the name is kept because the cells below refer to it.
classifier = XGBRegressor()
# Train on the scaled training features and the training salaries.
classifier.fit(X_train, y_train)

In [37]:
# Predict salaries for the held-out test features.
y_pred = classifier.predict(X_test)

In [38]:
# Score the test-set predictions: squared error, its root, and R-squared.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for label, value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared (R2) Score:", r2),
):
    print(label, value)

Mean Squared Error: 190435694.68237084
Root Mean Squared Error: 13799.844009349194
R-squared (R2) Score: 0.93147111687022


In [39]:
# Pair each importance score from the fitted model with its feature name and
# rank the features from most to least important.
feature_importance = pd.Series(
    classifier.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print("Feature Importance:", feature_importance, sep="\n")

Feature Importance:
Years of Experience   0.67
Industry/Sector       0.11
Education Level       0.08
Age                   0.05
Senior                0.03
Gender                0.03
Race                  0.01
Country               0.01
dtype: float32


The feature importances above show that **Salary** is affected most by **Years of Experience**, followed by **Industry/Sector**.