In [132]:
import re

import pandas as pd

# Load the raw salary records from a CSV file located relative to the
# current working directory.
salary = pd.read_csv("ds_salaries.csv")

In [133]:
# (rows, columns) of the raw frame.
salary.shape

(3755, 11)

In [134]:
# Column labels available in the raw data.
salary.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [135]:
# Summary statistics for the numeric columns.
salary.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [136]:
# Preview the first rows of the raw frame.
salary.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [137]:
# Distinct raw job titles present in the data.
salary["job_title"].unique()

array(['Principal Data Scientist', 'ML Engineer', 'Data Scientist',
       'Applied Scientist', 'Data Analyst', 'Data Modeler',
       'Research Engineer', 'Analytics Engineer',
       'Business Intelligence Engineer', 'Machine Learning Engineer',
       'Data Strategist', 'Data Engineer', 'Computer Vision Engineer',
       'Data Quality Analyst', 'Compliance Data Analyst',
       'Data Architect', 'Applied Machine Learning Engineer',
       'AI Developer', 'Research Scientist', 'Data Analytics Manager',
       'Business Data Analyst', 'Applied Data Scientist',
       'Staff Data Analyst', 'ETL Engineer', 'Data DevOps Engineer',
       'Head of Data', 'Data Science Manager', 'Data Manager',
       'Machine Learning Researcher', 'Big Data Engineer',
       'Data Specialist', 'Lead Data Analyst', 'BI Data Engineer',
       'Director of Data Science', 'Machine Learning Scientist',
       'MLOps Engineer', 'AI Scientist', 'Autonomous Vehicle Technician',
       'Applied Machine Learning Sc

In [138]:
# Keyword lists used to collapse the raw job titles into a few broad
# categories. Order matters: map_job_title returns the FIRST category whose
# keyword list matches, so e.g. "Business Intelligence Engineer" is classified
# as "Data Engineer" (via "engineer") before the "Data Analyst" keywords are
# ever consulted.
JOB_TYPES = {
    "Machine Learning Engineer": ["machine learning", "autonomous", "ai", "mlops", "computer vision", "deep learning", "ml", "nlp", "research engineer"],
    "Data Scientist": ["science", "scientist"],
    "Data Architect": ["architect"],
    "Data Engineer": ["data engineer", "etl", "engineer"],
    "Data Analyst": ["analyst", "analytics", "bi", "business intelligence"],
}


def map_job_title(title):
    """Map a raw job title to one of the broad categories in JOB_TYPES.

    Matching is case-insensitive and done on whole words/phrases, so short
    abbreviations such as "ai", "ml" or "bi" only match when they stand alone
    ("AI Developer", "BI Analyst") and not when they merely occur inside
    another word ("Retail", "Big").

    Args:
        title: The raw job title. Non-string values (e.g. NaN for a missing
            title) are accepted and treated as unclassifiable.

    Returns:
        The first key of JOB_TYPES (in insertion order) that has a keyword
        matching the title, or "Other" when nothing matches.
    """
    # Missing titles come through pandas as float NaN; classify them as
    # "Other" rather than failing on .lower().
    if not isinstance(title, str):
        return "Other"

    title = title.lower()
    for job_type, keywords in JOB_TYPES.items():
        # \b anchors keep a keyword from matching in the middle of an
        # unrelated word; re.escape guards against regex metacharacters
        # should a keyword ever contain one.
        if any(re.search(rf"\b{re.escape(keyword)}\b", title) for keyword in keywords):
            return job_type
    return "Other"

# Derive the broad job category for every row from its raw title.
salary["Job_Type"] = salary["job_title"].map(map_job_title)

In [139]:
# Check the length of the derived Job_Type column.
salary["Job_Type"].shape

(3755,)

In [140]:
# Data Architect rows whose experience_level is either 'MI' or 'EN'.
salary[
    (salary.Job_Type == 'Data Architect')
    & salary.experience_level.isin(['MI', 'EN'])
]


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
465,2023,MI,FT,Data Architect,167500,USD,167500,US,0,US,M,Data Architect
466,2023,MI,FT,Data Architect,106500,USD,106500,US,0,US,M,Data Architect
523,2023,MI,FT,Data Architect,167500,USD,167500,US,0,US,M,Data Architect
524,2023,MI,FT,Data Architect,106500,USD,106500,US,0,US,M,Data Architect
3509,2021,MI,FT,Data Architect,170000,USD,170000,US,100,US,L,Data Architect
3510,2021,MI,FT,Data Architect,150000,USD,150000,US,100,US,L,Data Architect
3611,2021,MI,FT,Data Architect,180000,USD,180000,US,100,US,L,Data Architect


In [141]:
# Working frame: drops job_title, salary and salary_currency, keeping the
# USD salary and the derived Job_Type alongside the remaining attributes.
df = salary.loc[
    :,
    [
        'work_year',
        'experience_level',
        'employment_type',
        'salary_in_usd',
        'employee_residence',
        'remote_ratio',
        'company_location',
        'company_size',
        'Job_Type',
    ],
]


In [142]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
0,2023,SE,FT,85847,ES,100,ES,L,Data Scientist
1,2023,MI,CT,30000,US,100,US,S,Machine Learning Engineer
2,2023,MI,CT,25500,US,100,US,S,Machine Learning Engineer
3,2023,SE,FT,175000,CA,100,CA,M,Data Scientist
4,2023,SE,FT,120000,CA,100,CA,M,Data Scientist


In [143]:
# (rows, columns) after column selection.
df.shape

(3755, 9)

In [144]:
# Rows where both employee and company are in the US and the contract is 'FT'.
df.loc[
    df.employee_residence.eq('US')
    & df.company_location.eq('US')
    & df.employment_type.eq('FT')
]


Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
5,2023,SE,FT,222200,US,0,US,L,Data Scientist
6,2023,SE,FT,136000,US,0,US,L,Data Scientist
9,2023,SE,FT,147100,US,0,US,M,Data Scientist
10,2023,SE,FT,90700,US,0,US,M,Data Scientist
11,2023,SE,FT,130000,US,100,US,M,Data Analyst
...,...,...,...,...,...,...,...,...,...
3747,2021,MI,FT,423000,US,50,US,L,Machine Learning Engineer
3749,2021,SE,FT,165000,US,100,US,L,Other
3750,2020,SE,FT,412000,US,100,US,L,Data Scientist
3751,2021,MI,FT,151000,US,100,US,L,Data Scientist


In [145]:
# Count missing values per column.
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
Job_Type              0
dtype: int64

In [146]:
# Distinct remote_ratio values present in the data.
df["remote_ratio"].unique()

array([100,   0,  50], dtype=int64)

In [147]:
# Distinct Job_Type categories present in df.
df["Job_Type"].unique()

array(['Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
       'Other', 'Data Engineer', 'Data Architect'], dtype=object)

In [148]:
# Column dtypes of the reduced frame.
df.dtypes

work_year              int64
experience_level      object
employment_type       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
Job_Type              object
dtype: object

In [149]:
# Restrict to US-resident employees of US companies with employment_type 'FT',
# then renumber the rows from zero.
new_df = (
    df[
        (df['employee_residence'] == 'US')
        & (df['company_location'] == 'US')
        & (df['employment_type'] == 'FT')
    ]
    .reset_index(drop=True)
)

In [150]:
# (rows, columns) remaining after the filter.
new_df.shape

(2991, 9)

In [151]:
# Remove the three columns used in the filter that built new_df, plus
# remote_ratio.
newer_df = new_df.drop(
    columns=['employment_type', 'company_location', 'employee_residence', 'remote_ratio']
).copy()
newer_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
0,2023,SE,222200,L,Data Scientist
1,2023,SE,136000,L,Data Scientist
2,2023,SE,147100,M,Data Scientist
3,2023,SE,90700,M,Data Scientist
4,2023,SE,130000,M,Data Analyst


In [152]:
# One-hot encode every remaining predictor; salary_in_usd is left untouched.
newer_df_encoded = pd.get_dummies(
    data=newer_df,
    columns=['work_year', 'experience_level', 'company_size', 'Job_Type'],
)


In [153]:
# Inspect the indicator columns produced by the encoding and their dtypes.
newer_df_encoded.dtypes

salary_in_usd                         int64
work_year_2020                         bool
work_year_2021                         bool
work_year_2022                         bool
work_year_2023                         bool
experience_level_EN                    bool
experience_level_EX                    bool
experience_level_MI                    bool
experience_level_SE                    bool
company_size_L                         bool
company_size_M                         bool
company_size_S                         bool
Job_Type_Data Analyst                  bool
Job_Type_Data Architect                bool
Job_Type_Data Engineer                 bool
Job_Type_Data Scientist                bool
Job_Type_Machine Learning Engineer     bool
Job_Type_Other                         bool
dtype: object

In [154]:
# Pairwise correlations between salary_in_usd and every indicator column.
newer_df_encoded.corr()

Unnamed: 0,salary_in_usd,work_year_2020,work_year_2021,work_year_2022,work_year_2023,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,company_size_L,company_size_M,company_size_S,Job_Type_Data Analyst,Job_Type_Data Architect,Job_Type_Data Engineer,Job_Type_Data Scientist,Job_Type_Machine Learning Engineer,Job_Type_Other
salary_in_usd,1.0,0.007265,-0.03187,-0.07305,0.081775,-0.206211,0.17543,-0.181472,0.192525,0.031159,-0.003998,-0.069018,-0.312019,0.037055,0.005651,0.158257,0.144609,-0.030048
work_year_2020,0.007265,1.0,-0.015511,-0.081599,-0.096051,0.072964,0.004117,0.080222,-0.1087,0.185474,-0.208716,0.092456,0.019269,-0.016898,-0.035848,0.029835,0.000852,-0.0116
work_year_2021,-0.03187,-0.015511,1.0,-0.150155,-0.176749,0.108524,-0.007668,0.104073,-0.14312,0.333036,-0.404379,0.246602,0.013742,0.014641,-0.024367,0.003341,0.001821,0.011387
work_year_2022,-0.07305,-0.081599,-0.150155,1.0,-0.929853,-0.048362,-0.028143,-0.016451,0.051274,0.035532,-0.025548,-0.021578,-0.01101,0.002838,0.026433,0.021147,-0.053338,-0.009433
work_year_2023,0.081775,-0.096051,-0.176749,-0.929853,1.0,-0.00091,0.029749,-0.032492,0.015906,-0.178646,0.196418,-0.076495,0.002914,-0.004556,-0.011731,-0.027559,0.05227,0.007747
experience_level_EN,-0.206211,0.072964,0.108524,-0.048362,-0.00091,1.0,-0.044563,-0.106729,-0.425666,0.110208,-0.163312,0.161882,0.053839,-0.045041,-0.019559,-0.029877,0.032543,-0.007634
experience_level_EX,0.17543,0.004117,-0.007668,-0.028143,0.029749,-0.044563,1.0,-0.07942,-0.316749,-0.034118,0.019982,0.033102,-0.070987,-0.012205,0.098962,-0.016902,-0.048588,0.053253
experience_level_MI,-0.181472,0.080222,0.104073,-0.016451,-0.032492,-0.106729,-0.07942,1.0,-0.758615,0.047535,-0.04917,0.011934,0.121748,-0.044394,-0.038842,-0.077595,0.034248,0.018259
experience_level_SE,0.192525,-0.1087,-0.14312,0.051274,0.015906,-0.425666,-0.316749,-0.758615,1.0,-0.085285,0.120814,-0.110125,-0.10306,0.066694,0.003023,0.088664,-0.026613,-0.033128
company_size_L,0.031159,0.185474,0.333036,0.035532,-0.178646,0.110208,-0.034118,0.047535,-0.085285,1.0,-0.930131,-0.032719,-0.032809,-0.000856,-0.068155,0.08673,0.015011,0.010753


In [155]:
from sklearn.model_selection import train_test_split

# Separate the target (salary_in_usd) from the encoded predictors.
y = newer_df_encoded['salary_in_usd']
X = newer_df_encoded.drop(columns='salary_in_usd')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [156]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Build a 28-tree random forest with a fixed seed and fit it on the training
# split in one step (fit returns the estimator itself).
rf_model = RandomForestRegressor(n_estimators=28, random_state=42).fit(X_train, y_train)

# Predict salaries for the held-out rows.
y_pred = rf_model.predict(X_test)

# Score the held-out predictions with R-squared.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.22223930959806326
