In [33]:
import pandas as pd

# Load the raw salary records; the relative path is resolved against the
# kernel's current working directory.
salary = pd.read_csv("ds_salaries.csv")

In [34]:
# Size of the raw table as (rows, columns).
salary.shape

(3755, 11)

In [35]:
# Column labels available in the raw table.
salary.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [36]:
# Summary statistics for the numeric columns.
salary.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [37]:
# Preview the first five rows of the raw table.
salary.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [38]:
# List the distinct raw job titles.
salary["job_title"].unique()

array(['Principal Data Scientist', 'ML Engineer', 'Data Scientist',
       'Applied Scientist', 'Data Analyst', 'Data Modeler',
       'Research Engineer', 'Analytics Engineer',
       'Business Intelligence Engineer', 'Machine Learning Engineer',
       'Data Strategist', 'Data Engineer', 'Computer Vision Engineer',
       'Data Quality Analyst', 'Compliance Data Analyst',
       'Data Architect', 'Applied Machine Learning Engineer',
       'AI Developer', 'Research Scientist', 'Data Analytics Manager',
       'Business Data Analyst', 'Applied Data Scientist',
       'Staff Data Analyst', 'ETL Engineer', 'Data DevOps Engineer',
       'Head of Data', 'Data Science Manager', 'Data Manager',
       'Machine Learning Researcher', 'Big Data Engineer',
       'Data Specialist', 'Lead Data Analyst', 'BI Data Engineer',
       'Director of Data Science', 'Machine Learning Scientist',
       'MLOps Engineer', 'AI Scientist', 'Autonomous Vehicle Technician',
       'Applied Machine Learning Sc

In [39]:
import re

# Job category -> lowercase keywords that identify it in a job title.
# Categories are tried in insertion order and the first match wins, so the
# more specific categories must stay ahead of the generic "engineer" keyword.
JOB_TYPES = {
    "Machine Learning Engineer": ["machine learning", "autonomous", "ai", "mlops", "computer vision", "deep learning", "ml", "nlp", "research engineer"],
    "Data Scientist": ["science", "scientist"],
    "Data Architect": ["architect"],
    "Data Engineer": ["data engineer", "etl", "engineer"],
    "Data Analyst": ["analyst", "analytics", "bi", "business intelligence"],
}

# Keywords this short are acronyms ("ai", "ml", "bi", "nlp", "etl"). Matching
# them as raw substrings misfires on ordinary words ("email" contains "ai",
# "mobile" contains "bi", "html" contains "ml"), so they must match a whole word.
_ACRONYM_MAX_LEN = 3


def _keyword_in_title(keyword, title):
    """Return True if `keyword` occurs in the (already lowercased) `title`.

    Acronym-length keywords must appear as a whole word; longer keywords keep
    plain substring matching so derived forms ("engineering") still match.
    """
    if len(keyword) <= _ACRONYM_MAX_LEN:
        return re.search(rf"\b{re.escape(keyword)}\b", title) is not None
    return keyword in title


def map_job_title(title):
    """Map a raw job title to one of the coarse categories in JOB_TYPES.

    Parameters
    ----------
    title : str
        Job title as it appears in the data; matching is case-insensitive.

    Returns
    -------
    str
        The first JOB_TYPES key with a matching keyword, or "Other" when no
        keyword matches.
    """
    title = title.lower()
    for job_type, keywords in JOB_TYPES.items():
        if any(_keyword_in_title(keyword, title) for keyword in keywords):
            return job_type
    return "Other"

# Derive the coarse job category for every row from its raw title.
salary["Job_Type"] = salary["job_title"].map(map_job_title)

In [40]:
# Sanity check: the new column holds one value per row of the table.
salary["Job_Type"].shape

(3755,)

In [41]:
# Data Architect rows whose experience level is either 'MI' or 'EN'.
salary[(salary.Job_Type == 'Data Architect') & salary.experience_level.isin(['MI', 'EN'])]


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
465,2023,MI,FT,Data Architect,167500,USD,167500,US,0,US,M,Data Architect
466,2023,MI,FT,Data Architect,106500,USD,106500,US,0,US,M,Data Architect
523,2023,MI,FT,Data Architect,167500,USD,167500,US,0,US,M,Data Architect
524,2023,MI,FT,Data Architect,106500,USD,106500,US,0,US,M,Data Architect
3509,2021,MI,FT,Data Architect,170000,USD,170000,US,100,US,L,Data Architect
3510,2021,MI,FT,Data Architect,150000,USD,150000,US,100,US,L,Data Architect
3611,2021,MI,FT,Data Architect,180000,USD,180000,US,100,US,L,Data Architect


In [42]:
# Keep only the columns used downstream: the USD salary plus the descriptive
# attributes and the derived job category.
kept_columns = [
    'work_year',
    'experience_level',
    'employment_type',
    'salary_in_usd',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
    'Job_Type',
]
df = salary.loc[:, kept_columns]


In [43]:
# Preview the first five rows of the reduced table.
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
0,2023,SE,FT,85847,ES,100,ES,L,Data Scientist
1,2023,MI,CT,30000,US,100,US,S,Machine Learning Engineer
2,2023,MI,CT,25500,US,100,US,S,Machine Learning Engineer
3,2023,SE,FT,175000,CA,100,CA,M,Data Scientist
4,2023,SE,FT,120000,CA,100,CA,M,Data Scientist


In [44]:
# Size of the reduced table as (rows, columns).
df.shape

(3755, 9)

In [45]:
# Preview rows where residence and company location are both 'US' and the
# employment type is 'FT'.
df[df.employee_residence.eq('US') & df.company_location.eq('US') & df.employment_type.eq('FT')]


Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Job_Type
5,2023,SE,FT,222200,US,0,US,L,Data Scientist
6,2023,SE,FT,136000,US,0,US,L,Data Scientist
9,2023,SE,FT,147100,US,0,US,M,Data Scientist
10,2023,SE,FT,90700,US,0,US,M,Data Scientist
11,2023,SE,FT,130000,US,100,US,M,Data Analyst
...,...,...,...,...,...,...,...,...,...
3747,2021,MI,FT,423000,US,50,US,L,Machine Learning Engineer
3749,2021,SE,FT,165000,US,100,US,L,Other
3750,2020,SE,FT,412000,US,100,US,L,Data Scientist
3751,2021,MI,FT,151000,US,100,US,L,Data Scientist


In [46]:
# Count missing values per column.
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
Job_Type              0
dtype: int64

In [47]:
# Distinct values taken by remote_ratio.
df["remote_ratio"].unique()

array([100,   0,  50], dtype=int64)

In [48]:
# Distinct job categories produced by the title mapping.
df["Job_Type"].unique()

array(['Data Scientist', 'Machine Learning Engineer', 'Data Analyst',
       'Other', 'Data Engineer', 'Data Architect'], dtype=object)

In [49]:
# Column dtypes of the reduced table.
df.dtypes

work_year              int64
experience_level      object
employment_type       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
Job_Type              object
dtype: object

In [50]:
# Restrict to rows with US residence, US company location and 'FT' employment,
# then renumber the rows from 0.
is_us_resident = df.employee_residence == 'US'
is_us_company = df.company_location == 'US'
is_ft = df.employment_type == 'FT'
new_df = df[is_us_resident & is_us_company & is_ft].reset_index(drop=True)

In [51]:
# Rows and columns remaining after the US / 'FT' filter.
new_df.shape

(2991, 9)

In [52]:
# Drop the three columns the filter above pinned to a single value, along
# with remote_ratio.
newer_df = new_df.drop(
    columns=['employment_type', 'company_location', 'employee_residence', 'remote_ratio']
).copy()
newer_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
0,2023,SE,222200,L,Data Scientist
1,2023,SE,136000,L,Data Scientist
2,2023,SE,147100,M,Data Scientist
3,2023,SE,90700,M,Data Scientist
4,2023,SE,130000,M,Data Analyst


In [53]:
# Rows categorised as 'Data Scientist'.
data_scientist_df = newer_df.loc[newer_df["Job_Type"] == 'Data Scientist']
data_scientist_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
0,2023,SE,222200,L,Data Scientist
1,2023,SE,136000,L,Data Scientist
2,2023,SE,147100,M,Data Scientist
3,2023,SE,90700,M,Data Scientist
6,2023,EN,213660,L,Data Scientist


In [54]:
# Rows categorised as 'Machine Learning Engineer'.
machine_learning_engineer_df = newer_df.loc[newer_df["Job_Type"] == 'Machine Learning Engineer']
machine_learning_engineer_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
16,2023,SE,200000,M,Machine Learning Engineer
17,2023,SE,130000,M,Machine Learning Engineer
20,2023,SE,342810,M,Machine Learning Engineer
21,2023,SE,184590,M,Machine Learning Engineer
40,2023,SE,189110,M,Machine Learning Engineer


In [55]:
# Rows categorised as 'Data Engineer'.
data_engineer_df = newer_df.loc[newer_df["Job_Type"] == 'Data Engineer']
data_engineer_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
14,2023,SE,225000,M,Data Engineer
15,2023,SE,156400,M,Data Engineer
18,2023,SE,253200,M,Data Engineer
19,2023,SE,90700,M,Data Engineer
22,2023,MI,162500,M,Data Engineer


In [56]:
# Rows categorised as 'Data Analyst'.
data_analyst_df = newer_df.loc[newer_df["Job_Type"] == 'Data Analyst']
data_analyst_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
4,2023,SE,130000,M,Data Analyst
5,2023,SE,100000,M,Data Analyst
12,2023,MI,150000,M,Data Analyst
13,2023,MI,110000,M,Data Analyst
24,2023,MI,105380,M,Data Analyst


In [57]:
# Rows whose title matched no keyword and fell into 'Other'.
other_df = newer_df.loc[newer_df["Job_Type"] == 'Other']
other_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
8,2023,SE,147100,M,Other
9,2023,SE,90700,M,Other
188,2023,EX,329500,M,Other
189,2023,EX,269600,M,Other
267,2023,SE,198800,M,Other


In [58]:
# Rows categorised as 'Data Architect'.
data_architect_df = newer_df.loc[newer_df["Job_Type"] == 'Data Architect']
data_architect_df.head()

Unnamed: 0,work_year,experience_level,salary_in_usd,company_size,Job_Type
60,2023,SE,280100,M,Data Architect
61,2023,SE,168100,M,Data Architect
164,2023,SE,180000,M,Data Architect
165,2023,SE,115000,M,Data Architect
166,2023,SE,200000,M,Data Architect


In [59]:
# Categorical columns that get replaced by one-hot (dummy) columns.
_CATEGORICAL_COLUMNS = ['work_year', 'experience_level', 'company_size', 'Job_Type']


def _one_hot(frame):
    """Return `frame` with its categorical columns expanded into dummy columns."""
    return pd.get_dummies(frame, columns=_CATEGORICAL_COLUMNS)


# NOTE(review): each subset is encoded on its own, so Job_Type collapses to a
# single dummy column per frame, and the dummy columns presumably differ
# between subsets that lack some category value -- confirm before comparing
# models across job types.
data_architect_df_encoded = _one_hot(data_architect_df)
other_df_encoded = _one_hot(other_df)
data_analyst_df_encoded = _one_hot(data_analyst_df)
data_engineer_df_encoded = _one_hot(data_engineer_df)
machine_learning_engineer_df_encoded = _one_hot(machine_learning_engineer_df)
data_scientist_df_encoded = _one_hot(data_scientist_df)

# In hindsight, this encoding step could have been done earlier in the notebook.


In [60]:
# Confirm the encoded frame: the integer salary target plus boolean dummy columns.
data_scientist_df_encoded.dtypes

salary_in_usd              int64
work_year_2020              bool
work_year_2021              bool
work_year_2022              bool
work_year_2023              bool
experience_level_EN         bool
experience_level_EX         bool
experience_level_MI         bool
experience_level_SE         bool
company_size_L              bool
company_size_M              bool
company_size_S              bool
Job_Type_Data Scientist     bool
dtype: object

In [61]:
from sklearn.model_selection import train_test_split

# Separate the predictors (every dummy column) from the salary target
target_column = 'salary_in_usd'
X_data_scientist = data_scientist_df_encoded.drop(columns=[target_column])
y_data_scientist = data_scientist_df_encoded[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_data_scientist,
    y_data_scientist,
    test_size=0.2,
    random_state=42,
)


In [62]:
from sklearn.linear_model import LinearRegression

# Build the linear regression and fit it on the training split in one step
# (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict salaries for the held-out test rows
y_pred = model.predict(X_test)


In [63]:
from sklearn.metrics import r2_score

# Score the held-out predictions with the coefficient of determination
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R-squared:', r2)


R-squared: 0.0559473070649843
