In [None]:
# Importing the libraries
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the salary dataset from its CSV file and preview the first rows
csv_path = 'Salary_Data_Based_country_and_race.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
### Data Preprocessing
# Inspect the dataset dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Count the null/missing values in every column
df.isna().sum()

In [None]:
## Only a small share of the rows contain null/missing values compared to the total,
## so those rows are dropped (row-wise drop is the dropna default).
df.dropna(inplace=True)

In [None]:
# Re-count null values per column to confirm none remain after the drop
df.isna().sum()

In [None]:
## The 'Unnamed: 0' column is just an index column, so it is removed
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Show the data type pandas assigned to each column
df.dtypes

In [None]:
## Checking for unique values in each column
# Number of distinct values per column
df.nunique()

In [None]:
### The Job Title column has 191 different values. It would be very difficult to analyze so many job titles, so the titles are grouped under similar job domains.
# List the distinct raw job titles before grouping
df['Job Title'].unique()

In [None]:
# Grouping Job Titles

def categorize_job_title(job_title):
    """Map a raw job title onto one of a small set of job-domain labels.

    The title is lower-cased and checked against keyword rules in priority
    order; the first rule that matches decides the label, and 'Other' is
    returned when nothing matches.

    Two defects of the earlier version are fixed here:

    * 'project manager' is tested before the generic 'manager' keyword. It
      used to come after it, so the 'Project Manager' label was unreachable.
    * The abbreviations 'vp', 'hr' and 'it' must match as whole words.
      Plain substring tests classified titles that merely contain those
      letters (e.g. 'Recruiter', 'Technical Writer') as IT/Technical Support.

    Parameters
    ----------
    job_title : object
        Raw job title; it is converted with ``str()`` before matching.

    Returns
    -------
    str
        The job-domain label.
    """
    title = str(job_title).lower()

    def has_word(word):
        # Whole-word match, for abbreviations that also occur inside longer words.
        return re.search(r'\b' + re.escape(word) + r'\b', title) is not None

    if 'software' in title or 'developer' in title:
        return 'Software/Developer'
    elif 'data' in title or 'analyst' in title or 'scientist' in title:
        return 'Data Analyst/Scientist'
    elif 'project manager' in title:
        # Must precede the generic 'manager' rule below to be reachable.
        return 'Project Manager'
    elif 'manager' in title or 'director' in title or has_word('vp'):
        return 'Manager/Director/VP'
    elif 'sales' in title or 'representative' in title:
        return 'Sales'
    elif 'marketing' in title or 'social media' in title:
        return 'Marketing/Social Media'
    elif 'product' in title or 'designer' in title:
        return 'Product/Designer'
    elif has_word('hr') or 'human resources' in title:
        return 'HR/Human Resources'
    elif 'financial' in title or 'accountant' in title:
        return 'Financial/Accountant'
    elif has_word('it') or 'support' in title:
        return 'IT/Technical Support'
    elif 'operations' in title or 'supply chain' in title:
        return 'Operations/Supply Chain'
    elif 'customer service' in title or 'receptionist' in title:
        return 'Customer Service/Receptionist'
    else:
        return 'Other'
# Replace every raw job title with its job-domain label
df['Job Title'] = df['Job Title'].map(categorize_job_title)

In [None]:
## Education Level in the dataset has 7 unique values

# List the distinct raw Education Level values before grouping
df['Education Level'].unique() 

In [None]:
## In the dataset the education level is represented in two different ways.
#1. Bachelor's and Bachelor's degree, which means the same thing. Group it into Bachelors
#2. Master's and Master's Degree, grouped into Masters
#3. phD and PhD, as PhD
#4. High School

# Grouping Education Level

def group_education(Educaton):
    """Collapse the different spellings of an education level into one label.

    The value is converted with ``str()`` and lower-cased, then searched for
    each keyword in order; the first keyword found decides the label.

    Returns 'High School', 'Bachelors', 'Masters' or 'PhD', or ``None`` when
    no keyword is present.
    """
    level = str(Educaton).lower()
    keyword_to_label = (
        ('high school', 'High School'),
        ("bachelor's", 'Bachelors'),
        ("master's", 'Masters'),
        ('phd', 'PhD'),
    )
    for keyword, label in keyword_to_label:
        if keyword in level:
            return label
    # No keyword matched: the result is None.
    return None
# Replace every raw education level with its grouped label
df['Education Level'] = df['Education Level'].map(group_education)

In [None]:
### DESCRIPTIVE STATISTICS
# Summary statistics as produced by DataFrame.describe()
df.describe()

In [None]:
# Preview the first rows after the grouping steps above
df.head()

In [None]:
# List the distinct values of the Gender column
df['Gender'].unique()

In [None]:
### Exploratory Data Analysis
## In the EDA, look at the data and try to understand it. Begin by looking at the distribution of data across the dataset.
# Followed by visualizing the data to understand the relationship between the features and the target variable.

## Independent Var 1: Gender

# Pie chart of the gender split.
# The wedge labels come from the value_counts() index, so every wedge is paired
# with its own category instead of relying on a hard-coded label order
# happening to match the order of the counts.
gender_counts = df['Gender'].value_counts()
plt.figure(figsize=(8,8))
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Gender Distribution')
plt.show()

## Output: The pie chart shows that the majority of the employees in the dataset are male (54.8% of the dataset), followed by females at 45%, while 0.2% of the employees belong to other genders.

In [None]:
## Independent Var 2: Age

# Age distribution: 20-bin histogram with a KDE curve on top
age_ax = sns.histplot(x='Age', data=df, kde=True, bins=20)
age_ax.set_title('Age Distribution')
plt.show()

## Output: Majority of the employees are in the range of (25 - 33) years of age, meaning majority of the employees are young and energetic. 
# There is only minimal number of old employees in the dataset having age more than 55 years.


In [None]:
## Independent Var 3: Education
# Number of employees per education level
sns.countplot(data=df, x='Education Level', palette='Set1')
plt.xticks(rotation=90)

## Output: Most of the employees have a Bachelor's degree followed by Master's degree and Doctoral degree. The least number of employees have a High School education. 
# From the graph it is clear that most of the employees started working after graduation, few of them started working after post graduation and very few of them have gone for doctorate. 
# The least number of employees have started working after high school education.

In [None]:
## Independent Var 4: Job Titles

# Number of employees per grouped job title
sns.countplot(data=df, x='Job Title')
plt.xticks(rotation=90)

## Output: Simpler form for visualizing job titles. From the graph, it is clear that majority of the employees are - Software Developers, Data Analyst/Scientist or Manager/Director/Vp. 
# Fewer number of employees have job titles such as sales, marketing/social media, HR, Product Designer and Customer Service. Fewest employees work as Financial/accountant or operation/supply management.

# CONC: # Fewer number of employees have job titles such as sales, marketing/social media, HR, Product Designer and Customer Service. Fewest employees work as Financial/accountants or operation/supply management.

In [None]:
## Independent Var 5: YOE

# Distribution of years of experience, with a KDE curve on top
sns.histplot(data=df, x='Years of Experience', kde=True)

## Output: Most of the employees in the dataset have experience of 1-7 years in the respective domains.
# The number of employees in the dataset decreases with increasing number of years of experience.

In [None]:
## Independent Var 6: Employees Country

# Number of employees per country
sns.countplot(data=df, x='Country')
plt.xticks(rotation=90)

## Output: The number of employees from the 5 countries is nearly the same, with a little more in USA and little less in Canada.

In [None]:
## Independent Var 7: Race of employees

# Number of employees per race
sns.countplot(data=df, x='Race')
plt.xticks(rotation=90)

## Output: Visualization of the racial distribution in the dataset. From the graph, it is clear that most of the employees are either White or Asian, followed by Korean, Chinese, Australian and Black. 
# Number of employees from Welsh, African American, Mixed and Hispanic race are less as compared to other groups.

In [None]:
## From the above plots and graphs, we understand the data we are dealing with - its distribution and quantity as well. 
# Next step is to explore the relations of these independent variables with the target Variable i.e. Salary.

# 1. Age and Salary: one point per employee
age_salary_ax = sns.scatterplot(data=df, x='Age', y='Salary')
plt.title('Age Vs Salary')

## Output: The scatter plot shows a trend where the Salary of an employee increases with Age, which is expected because of promotions and appraisals. 
# However, upon closer observation we can see that employees of similar age have different salaries, which means there are other factors that decide the salary.

In [None]:
# 2. Gender and Salary: box plot (left) and violin plot (right) side by side
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
box_ax, violin_ax = ax
sns.boxplot(data=df, x='Gender', y='Salary', ax=box_ax)
box_ax.set_title('Gender Vs Salary')
sns.violinplot(data=df, x='Gender', y='Salary', ax=violin_ax)
violin_ax.set_title('Gender Vs Salary')

## Output: The boxplot and violinplot describe the salary distribution among the three genders.
# In the boxplot, the employees from the Other gender have quite a high salary level compared to the Males and Females. 
# The Other gender employees have a median salary above 150000, followed by males with a median salary near 125000 and females with a median salary slightly above 100000. 
# The violin plot shows that most of the Other gender employees have a salary above 150000. 
# For males, the distribution is concentrated between 50000 and 100000 as well as near 200000. For females, the salary distribution is more spread out than for the other genders, with most salaries near 50000.

In [None]:
# 3. Education Level and Salary: box plot (left) and violin plot (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
box_ax, violin_ax = ax
sns.boxplot(data=df, x='Education Level', y='Salary', ax=box_ax)
box_ax.set_title('Education Level Vs Salary')
sns.violinplot(data=df, x='Education Level', y='Salary', ax=violin_ax)
violin_ax.set_title('Education Level Vs Salary')

## Output: The boxplot and violinplot show the distribution of salary based on the employees' education level. 
# The median salary for the PhD holders is the highest, followed by Masters and then Bachelors degree holders. Employees with no degree have the lowest median salary. 
# In the violinplot, PhD holders have a distribution near 200000, while the Masters degree holders have a very sleek distribution where the salary is spread from 100k to 150k.
# The Bachelors degree holders have a salary distribution near 50000, whereas the employees with no degree have a salary distribution near 40k-45k.
# From these graphs, the assumption that employees with a higher education level have a higher salary than employees with a lower education level can be verified.

In [None]:
# 4. Job Title and Salary: one bar per grouped job title
sns.barplot(data=df, x='Job Title', y='Salary', palette='Set2')
plt.xticks(rotation=90)

## Output: The graph falsifies the previous hypothesis regarding demand and pay with respect to job titles. 
# The 'Other' category job titles have a higher salary than the titles assumed to be in high demand and to have higher pay. 
# In contrast to the previous Job Title graph, this graph shows that there is no relation between the job title distribution and salary. 
# The job titles which gave a high salary are found to be fewer in number.
# The hypothesis is true for job titles such as Software Developer, Data Analyst/Scientist and Manager/Director/VP - these job titles are found to be in high demand and well paid.
# - But in contrast to that, job titles such as Operations/Supply Chain, HR, Financial/Accountant and Marketing/Social Media have a much higher salary than assumed.

In [None]:
# 5. Experience and Salary: one point per employee
exp_ax = sns.scatterplot(data=df, x='Years of Experience', y='Salary')
exp_ax.set_title('Years of Experience Vs Salary')

## Output: From the scatterplot, it is clear that on the whole, the salary of the employees increases with the years of experience. 
# However, on a closer look we can see that employees with similar YOE have different salaries. This is because the salary also depends on other factors like job title, age, gender and education level, as discussed earlier.

In [None]:
# 6. Country and Salary: box plot (left) and violin plot (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
left_ax, right_ax = ax
sns.boxplot(data=df, x='Country', y='Salary', ax=left_ax)
sns.violinplot(data=df, x='Country', y='Salary', ax=right_ax)

## Output: Both the boxplot and violinplot show very similar insights about the salary across all the countries. 
# However, there is a very small variation in the USA, whose median salary is slightly lower than that of the other countries.

In [None]:
# From the above plots we cannot get much information about the salary with respect to the countries. So, I will plot the job title vs salary graph for each country, so that we can get an overview of job title vs salary for each country.

# 3x2 grid of Job Title vs Salary box plots: one panel per country, plus a
# final panel with all countries together.
fig, ax = plt.subplots(3, 2, figsize=(20, 20))
plt.subplots_adjust(hspace=0.5)

# Listed in the same row-major order in which the grid cells are filled.
panels = [
    ('USA', df[df['Country'] == 'USA']),
    ('UK', df[df['Country'] == 'UK']),
    ('Canada', df[df['Country'] == 'Canada']),
    ('Australia', df[df['Country'] == 'Australia']),
    ('China', df[df['Country'] == 'China']),
    ('All Countries', df),
]
for panel_ax, (panel_title, panel_df) in zip(ax.flat, panels):
    sns.boxplot(x='Job Title', y='Salary', data=panel_df, ax=panel_ax).set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=90)

## Output: After observing all these plots, I conclude that the Job Titles such as Software Developer, Manager/Director/VP and Data Analyst/Scientist are in high demand as well as receive much higher salary than other job titles, excluding the Job Titles that come under 'Other' category. The job titles such as Operation/Supply Chain, Customer Service/Receptionist, Product Designer and sales are in low demand and have low salary.

In [None]:
# 7. Race and Salary: box plot (left) and violin plot (right), x labels rotated
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
box_ax, violin_ax = ax
sns.boxplot(data=df, x='Race', y='Salary', ax=box_ax)
box_ax.tick_params(axis='x', rotation=90)
sns.violinplot(data=df, x='Race', y='Salary', ax=violin_ax)
violin_ax.tick_params(axis='x', rotation=90)

##Output: The employees from the races - Mixed, Korean, Blacks and White have the highest median salary, followed by Asian and Australian then  Chinese, Welsh and African American while the employees from the hispanic race have the lowest median salary. 
# Looking at the violinplot the salary distribution is more concentrated after 125k in white, australian, black, Korean and mixed race. Whereas the hispanic has more concentration near 110k

In [None]:
###DATA PREPROCESSING 2

## Label encoding of the categorical features
from sklearn.preprocessing import LabelEncoder
features = ['Gender','Country','Education Level','Job Title', 'Race']
# A separate fitted encoder is kept for every column. Reusing one encoder and
# refitting it per column discards each column's label <-> code mapping except
# the last, which makes it impossible to decode the integer codes later
# (e.g. encoders['Gender'].inverse_transform(...)).
encoders = {}
for feature in features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    encoders[feature] = le
    # Show the integer codes now present in the column
    print(feature, df[feature].unique())

In [None]:
## Normalization

# Standardize the continuous variables with StandardScaler
# NOTE(review): the scaler is fitted on the full DataFrame, before the
# train/test split performed later in the notebook, so rows that end up in the
# test set contribute to the scaling statistics - consider fitting on the
# training rows only.
# NOTE(review): 'Salary' (the prediction target) is standardized as well, so
# the error metrics computed later are in standardized units rather than the
# original salary units.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['Age', 'Years of Experience', 'Salary']] = scaler.fit_transform(df[['Age', 'Years of Experience', 'Salary']])

# Preview the encoded and scaled data
df.head()

In [None]:
## Correlation Matrix Heatmap
# Pairwise correlations of all columns, annotated with their values
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)

## Output: From the correlation matrix, there are three major correlations.
#1. Salary and Age [0.73]
#2. Salary and Years of Experience [0.81]
#3. Years of Experience and Age [0.94]
## The correlation of salary with age and years of experience was already explored in the above plots. 
# The correlation between years of experience and age is expected: as a person ages, their experience increases.

In [None]:
## Train/test split: 80% train, 20% test, fixed seed
from sklearn.model_selection import train_test_split
X = df.drop(columns='Salary')  # every column except the target
y = df['Salary']               # target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
##Salary Prediction
# Using the following models:
#1. Decision Tree Regressor
#2. Random Forest Regressor

## Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
## Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
## 1. DTR
# Creating the decision tree regressor object (default hyperparameters); it is the base estimator for the grid search
dtree = DecisionTreeRegressor()

In [None]:
## Hypertuning the model
from sklearn.model_selection import GridSearchCV

In [None]:
# Defining the parameters for the grid search
# max_features=None (consider all features) replaces the former 'auto' option:
# for tree regressors 'auto' meant "all features", and it has been removed from
# recent scikit-learn releases, where passing it raises an error.
parameters = {'max_depth' :[2,4,6,8,10],
              'min_samples_split' :[2,4,6,8],
              'min_samples_leaf' :[2,4,6,8],
              'max_features' :[None,'sqrt','log2'],
              'random_state' :[0,42]}

In [None]:
# Grid search over `parameters` with 5-fold cross-validation, scored by
# negative mean squared error and run on all available cores
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fit the grid search object to the training data (tries every parameter combination)
grid_search.fit(X_train,y_train)

In [None]:
## Building the model on best parameters
# max_features=None (all features) replaces the former 'auto', which meant the
# same thing for tree regressors but is rejected by recent scikit-learn releases.
# NOTE(review): these values are hard-coded; confirm they still match
# grid_search.best_params_ after re-running the search.
dtree = DecisionTreeRegressor(max_depth = 10, max_features = None, min_samples_leaf = 2, min_samples_split = 8, random_state = 42)
dtree

In [None]:
# Fitting the tuned decision tree on the training data
dtree.fit(X_train,y_train)

In [None]:
# Predicting the salary for the held-out test rows with the decision tree
d_pred = dtree.predict(X_test)

In [None]:
## Evaluating the Decision Tree Regressor Model
# Actual vs. predicted salaries side by side, re-indexed from 0
dft = pd.DataFrame({'Actual': y_test, 'Predicted': d_pred}).reset_index(drop=True)
dft.head(10)

In [None]:
# Last 10 rows of the decision tree actual-vs-predicted table
dft.tail(10)

In [None]:
# Density curves of actual vs. decision-tree-predicted salaries.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False, kde=True), and
# the legend is drawn so the 'Actual' / 'Predicted' labels are actually shown.
ax = sns.kdeplot(x=dft['Actual'], color='blue', linewidth=3, label='Actual')
sns.kdeplot(x=dft['Predicted'], color='red', linewidth=3, label='Predicted', ax=ax)
ax.set_xlabel('Salary')
ax.legend()
plt.show()

## Output: The blue shows the distribution count for actual values and the red line shows the distribution count for predicted values.
# The predicted values are close to the actual values and their curve coincides with the actual values curve. This shows that the model is a good fit.


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Decision tree test-set metrics; the MSE is computed once and reused for RMSE
d_mse = mean_squared_error(y_test, d_pred)
print("R2 Score: ", r2_score(y_test, d_pred))
print("Mean Squared Error: ", d_mse)
print("Mean Absolute Error: ", mean_absolute_error(y_test, d_pred))
print('RMSE:', np.sqrt(d_mse))

In [None]:
## 2. RF
# Creating random forest regressor object.
# The seed is fixed (42, as used for the train/test split) so that the fitted
# forest and its reported metrics are reproducible across notebook runs.
rfg = RandomForestRegressor(random_state=42)

In [None]:
# Training the random forest on the training data
rfg.fit(X_train, y_train)

In [None]:
# Score on the training data (for scikit-learn regressors, `score` returns R^2)
rfg.score(X_train, y_train)

In [None]:
# Predicting the salary for the held-out test rows with the random forest
r_pred = rfg.predict(X_test)

In [None]:
## Evaluating Random Forest Regressor Model
# Actual vs. predicted salaries side by side, re-indexed from 0
dfr = pd.DataFrame({'Actual': y_test, 'Predicted': r_pred}).reset_index(drop=True)
dfr.head(10)

In [None]:
# Last 10 rows of the random forest actual-vs-predicted table
dfr.tail(10)

In [None]:
# Density curves of actual vs. random-forest-predicted salaries.
# This plots the random forest results frame `dfr`; the earlier version
# re-plotted the decision tree frame `dft`, so the figure did not show this model.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False, kde=True), and
# the legend is drawn so the 'Actual' / 'Predicted' labels are actually shown.
ax = sns.kdeplot(x=dfr['Actual'], color='blue', linewidth=3, label='Actual')
sns.kdeplot(x=dfr['Predicted'], color='red', linewidth=3, label='Predicted', ax=ax)
ax.set_xlabel('Salary')
ax.legend()
plt.show()

## Output: The blue shows the distribution count for actual values and the red line shows the distribution count for predicted values. 
# The predicted values are close to the actual values and their curve coincides with the actual values curve. This shows that the model is a good fit.

In [None]:
# Random forest test-set metrics; the MSE is computed once and reused for RMSE
r_mse = mean_squared_error(y_test, r_pred)
print("R2 Score: ", r2_score(y_test, r_pred))
print("Mean Squared Error: ", r_mse)
print("Mean Absolute Error: ", mean_absolute_error(y_test, r_pred))
print('RMSE:', np.sqrt(r_mse))

In [None]:
## Conclusion
# From the exploratory data analysis, I have concluded that the salary of the employees is dependent upon the following factors:

#1. Years of Experience
#2. Job Title
#3. Education Level

# Employees with greater years of experience, having job title such as Data analyst/scientist, Software Developer or Director/Manager/VP and having a Master's or Doctoral degree are more likely to have a higher salary.

# Coming to the machine learning models, I have used two regression models - Decision Tree Regressor and Random Forest Regressor - for predicting the salary. The Random Forest Regressor has performed better, with an R² score of 94.6%.