In [112]:
!pip install hvplot



In [113]:
# Import packages
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [55]:
# Upload the "ds_salaries.csv" file into Colab, then store in a Pandas DataFrame
from google.colab import files
uploaded = files.upload()

In [95]:
# Read data science salaries
df_salaries = pd.read_csv(Path("./ds_salaries.csv"))
df_salaries.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [96]:
# Review the info
df_salaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [97]:
# Display the DataFrame
df_salaries["remote_ratio"].value_counts()

0      1923
100    1643
50      189
Name: remote_ratio, dtype: int64

In [98]:
# Drop work_year and remote_ratio for now. Unable to solve int problem with column names.
df_salaries = df_salaries.drop(columns = ["work_year", "remote_ratio"])

In [76]:
# Transform the object columns using get_dummies. remote_ratio and work_year is converted
# work_year_dummies = pd.get_dummies(df_salaries["work_year"])
experience_level_dummies = pd.get_dummies(df_salaries["experience_level"])
employment_type_dummies = pd.get_dummies(df_salaries["employment_type"])
job_title_dummies = pd.get_dummies(df_salaries["job_title"])
salary_currency_dummies = pd.get_dummies(df_salaries["salary_currency"])
employee_residence_dummies = pd.get_dummies(df_salaries["employee_residence"])
# remote_ratio_dummies = pd.get_dummies(df_salaries["remote_ratio"])
company_location_dummies = pd.get_dummies(df_salaries["company_location"])
company_size_dummies = pd.get_dummies(df_salaries["company_size"])

In [99]:
# Concatenate the df_salaries and the dummies DataFrames
# df_salaries = pd.concat([df_salaries, work_year_dummies], axis=1)
df_salaries = pd.concat([df_salaries, experience_level_dummies], axis=1)
df_salaries = pd.concat([df_salaries, employment_type_dummies], axis=1)
df_salaries = pd.concat([df_salaries, job_title_dummies], axis=1)
df_salaries = pd.concat([df_salaries, salary_currency_dummies], axis=1)
df_salaries = pd.concat([df_salaries, employee_residence_dummies], axis=1)
# df_salaries = pd.concat([df_salaries, remote_ratio_dummies], axis=1)
df_salaries = pd.concat([df_salaries, company_location_dummies], axis=1)
df_salaries = pd.concat([df_salaries, company_size_dummies], axis=1)

# Drop the original education column
# df_salaries = df_salaries.drop(columns=["work_year"])
df_salaries = df_salaries.drop(columns=["experience_level"])
df_salaries = df_salaries.drop(columns=["employment_type"])
df_salaries = df_salaries.drop(columns=["job_title"])
df_salaries = df_salaries.drop(columns=["salary_currency"])
df_salaries = df_salaries.drop(columns=["employee_residence"])
# df_salaries = df_salaries.drop(columns=["remote_ratio"])
df_salaries = df_salaries.drop(columns=["company_location"])
df_salaries = df_salaries.drop(columns=["company_size"])

In [100]:
# Scaling the numeric columns
# Scale the DataFrame data for the continuous variables salary, salary_in_usd

salaries_scaled = StandardScaler().fit_transform(df_salaries[["salary", "salary_in_usd"]])

# Review the scaled data
salaries_scaled

array([[-0.16482684, -0.82039118],
       [-0.23927735, -1.70618745],
       [-0.2459779 , -1.77756251],
       ...,
       [-0.12760158, -0.51660304],
       [-0.13504663, -0.59590867],
       [10.13912397, -0.68052777]])

In [101]:
# Create a DataFrame of the scaled data
salaries_scaled = pd.DataFrame(salaries_scaled, columns=["salary", "salary_in_usd"])

In [102]:
# Replace the original data with the columns of information from the scaled Data
df_salaries["salary"] = salaries_scaled["salary"]
df_salaries["salary_in_usd"] = salaries_scaled["salary_in_usd"]

In [103]:
df_salaries


Unnamed: 0,salary,salary_in_usd,EN,EX,MI,SE,CT,FL,FT,PT,...,SI,SK,TH,TR,UA,US,VN,L,M,S
0,-0.164827,-0.820391,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,-0.239277,-1.706187,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,-0.245978,-1.777563,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,-0.023371,0.593676,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,-0.105266,-0.278686,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,0.329525,4.352762,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
3751,-0.059107,0.213009,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
3752,-0.127602,-0.516603,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
3753,-0.135047,-0.595909,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0


## Elbow Method to find k

In [104]:
# Import the KMeans module from SKLearn
from sklearn.cluster import KMeans

In [118]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [119]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(df_salaries)
    inertia.append(k_model.inertia_)



In [124]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow

Unnamed: 0,k,inertia
0,1,17223.518509
1,2,14141.310987
2,3,11999.182762
3,4,10515.67444
4,5,9310.552502
5,6,8605.780802
6,7,8125.72015
7,8,7685.527038
8,9,7325.555331
9,10,6970.840608


In [122]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

In [110]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=3)

# Fit the model
model.fit(df_salaries)

# Make predictions
k_3 = model.predict(df_salaries)

# Create a copy of the preprocessed data
salaries_predictions_df = df_salaries.copy()

# Add a class column with the labels
salaries_predictions_df['salary'] = k_3

