In [20]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Fetch the job-postings dataset, turn its 'train' split into a DataFrame and
# parse the posting-date column to datetimes in a single chain, so that
# re-running this cell always rebuilds `df` from scratch.
dataset = load_dataset("lukebarousse/data_jobs")
df = (
    dataset["train"]
    .to_pandas()
    .assign(job_posted_date=lambda frame: pd.to_datetime(frame["job_posted_date"]))
)

In [21]:
# Peek at the yearly-salary column (pandas prints only the head and tail of a long Series).
df["salary_year_avg"]

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
785736   NaN
785737   NaN
785738   NaN
785739   NaN
785740   NaN
Name: salary_year_avg, Length: 785741, dtype: float64

In [22]:
# Keep only the postings that report a yearly salary. A single .loc call
# selects rows and column together, instead of filtering the whole frame and
# then indexing the result a second time.
df.loc[df["salary_year_avg"].notna(), "salary_year_avg"]

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [23]:
# Print the built-in help (signature and docstring) for DataFrame.apply.
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
 

In [24]:
# Independent copy of the postings that report a yearly salary, so columns can
# be added to it without modifying `df`.
df_salary = df.dropna(subset=["salary_year_avg"]).copy()

def projected_salary(salary, multiplier=1.03):
    """Return ``salary`` scaled by ``multiplier``.

    Parameters
    ----------
    salary : number
        The salary to project (anything that supports ``*`` with a float).
    multiplier : float, default 1.03
        Growth factor applied to ``salary``; the default is a 3% increase.

    Returns
    -------
    The product ``salary * multiplier``.
    """
    return salary * multiplier

# Preview the projected salaries without storing them: apply() calls
# projected_salary on each value of the column.
df_salary["salary_year_avg"].apply(projected_salary)

28        112785.00
77        144200.00
92        123600.00
100       235068.66
109        91670.00
            ...    
785624    143392.48
785641    154500.00
785648    228531.25
785682    162225.00
785692    162225.00
Name: salary_year_avg, Length: 22003, dtype: float64

In [25]:
# Store the projection as a new column, computed by passing the named helper
# projected_salary to apply().
df_salary["salary_year_inflated"] = df_salary["salary_year_avg"].apply(projected_salary)

# Show the original and projected salaries side by side.
df_salary[["salary_year_avg", "salary_year_inflated"]]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


OR, equivalently, with an inline lambda instead of a named function:

In [26]:
# Compute the projected column with an inline lambda that multiplies each
# salary by 1.03.
df_salary["salary_year_inflated"] = df_salary["salary_year_avg"].apply(lambda salary : salary * 1.03)

# Show the original and projected salaries side by side.
df_salary[["salary_year_avg", "salary_year_inflated"]]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


OR, with the same result, as a vectorized multiplication (no `apply` needed):

In [27]:
# Vectorized form: multiplying the Series by a scalar scales every salary at
# once, without calling a Python function per element.
df_salary["salary_year_inflated"] = df_salary["salary_year_avg"] * 1.03

# Show the original and projected salaries side by side.
df_salary[["salary_year_avg", "salary_year_inflated"]]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [33]:
# Look at the job_skills value stored for the row labelled 0.
df.loc[0, "job_skills"]

In [35]:
import ast

# The first posting's job_skills value is None, which ast.literal_eval
# rejects with a ValueError. Catch the error and print it, so the failure is
# still shown but does not abort a "Restart & Run All".
try:
    print(type(ast.literal_eval(df["job_skills"][0])))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: malformed node or string: None

In [36]:
def clean_list(skill_list):
    """Parse one ``job_skills`` cell into a Python object.

    Parameters
    ----------
    skill_list : str, list, None or NaN
        A string holding a Python literal (e.g. ``"['sql', 'python']"``), an
        already-parsed container, or a missing value.

    Returns
    -------
    The parsed literal for string input, the container itself when the value
    is already parsed, or ``None`` for missing values.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the value is not a valid
        Python literal.
    """
    # Already-parsed values pass through unchanged, so applying this function
    # to the column a second time is harmless. Without this guard, pd.isna()
    # on a list returns an array whose truth value is ambiguous.
    if isinstance(skill_list, (list, tuple, set, dict)):
        return skill_list
    if pd.isna(skill_list):
        return None
    return ast.literal_eval(skill_list)
    
# Overwrite job_skills with the value clean_list returns for each row.
df["job_skills"] = df["job_skills"].apply(clean_list)

In [44]:
# Check which type the first job_skills entry has now.
type(df["job_skills"][0])

NoneType

In [None]:
def projected_salary(row):
    """Return the projected yearly salary for one posting.

    NOTE: this reuses the name of the scalar helper ``projected_salary(salary)``
    defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"job_title_short"`` and ``"salary_year_avg"``.

    Returns
    -------
    ``salary_year_avg`` multiplied by 1.05 when the short title contains
    ``"Senior"``, otherwise by 1.03.
    """
    title = row["job_title_short"]
    # A missing title (None/NaN) is not a string and would make the `in` test
    # raise TypeError; treat such rows as non-senior instead.
    is_senior = isinstance(title, str) and "Senior" in title
    multiplier = 1.05 if is_senior else 1.03
    return multiplier * row["salary_year_avg"]
    
# axis=1 hands each row to projected_salary as a Series, so the function can
# read both the job title and the salary of that row.
df_salary["salary_year_inflated"] = df_salary.apply(projected_salary, axis=1)

# Show the title next to the original and projected salaries.
df_salary[["job_title_short", "salary_year_avg", "salary_year_inflated"]]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
