In [19]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  
import ast

# Loading Data
# Fetch the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Convert the posting date column to pandas datetimes.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
# Parse every non-missing job_skills value with ast.literal_eval; missing values are kept as they are.
# NOTE(review): job_skills is already parsed once this cell has run, so later cells that call
# ast.literal_eval on the column again receive parsed objects instead of strings.
df['job_skills'] = df['job_skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

In [2]:
# Preview the average yearly salary column, missing values included.
df['salary_year_avg']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
785736   NaN
785737   NaN
785738   NaN
785739   NaN
785740   NaN
Name: salary_year_avg, Length: 785741, dtype: float64

In [3]:
# Keep only the salary values that are present (drop the NaN entries).
df['salary_year_avg'].dropna()

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

We want to calculate the projected salary for next year from the data we have, assuming inflation of 3%.

How can we reach inside the DataFrame and transform each value?

Use the `apply` method.

In [4]:
# Look up the documentation for DataFrame.apply.
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(
    func: 'AggFuncType',
    axis: 'Axis' = 0,
    raw: 'bool' = False,
    result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None,
    args=(),
    by_row: "Literal[False, 'compat']" = 'compat',
    engine: "Literal['python', 'numba']" = 'python',
    engine_kwargs: 'dict[str, bool] | None' = None,
    **kwargs
) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.

    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along wh

In [6]:
# Keep only the rows that have a salary_year_avg value (filters out the NaN rows).
df_salary = df[pd.notna(df['salary_year_avg'])]

def projected_salary(salary, growth_factor=1.03):
    """Return ``salary`` scaled by ``growth_factor``.

    Parameters
    ----------
    salary : number or pandas.Series
        Current salary value(s).
    growth_factor : float, default 1.03
        Multiplier applied to the salary; the default models 3% inflation.

    Returns
    -------
    Same kind as ``salary``
        The projected salary, ``salary * growth_factor``.
    """
    return salary * growth_factor

#here we created a function

# Pass every value of 'salary_year_avg' through projected_salary.
# The result is only displayed here; nothing is stored back on df_salary.
df_salary['salary_year_avg'].apply(projected_salary)

28        112785.00
77        144200.00
92        123600.00
100       235068.66
109        91670.00
            ...    
785624    143392.48
785641    154500.00
785648    228531.25
785682    162225.00
785692    162225.00
Name: salary_year_avg, Length: 22003, dtype: float64

In [7]:
# Let's see how the projection compares to the original salary_year_avg column.

# Use .copy() whenever we are going to alter a DataFrame derived from the original,
# so the changes are made on an independent copy.
df_salary = df[pd.notna(df['salary_year_avg'])].copy()

def projected_salary(salary, growth_factor=1.03):
    """Return ``salary`` scaled by ``growth_factor``.

    Parameters
    ----------
    salary : number or pandas.Series
        Current salary value(s).
    growth_factor : float, default 1.03
        Multiplier applied to the salary; the default models 3% inflation.

    Returns
    -------
    Same kind as ``salary``
        The projected salary, ``salary * growth_factor``.
    """
    return salary * growth_factor

# Create a new column, 'salary_year_inflated', by applying projected_salary
# to every value of 'salary_year_avg'.

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

# Show the original and the projected salary side by side.
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [8]:
# Let's write this again with an anonymous (lambda) function instead of a named one.

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

# Show the original and the projected salary side by side.
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [9]:
# We could also write this without .apply or a lambda at all:
# multiplying the column by a number scales every value in one step.

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03

# Show the original and the projected salary side by side.
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


EXAMPLE 3

In [10]:
# Inspect the job_skills column.
df['job_skills']

0                                                      None
1         ['r', 'python', 'sql', 'nosql', 'power bi', 't...
2         ['python', 'sql', 'c#', 'azure', 'airflow', 'd...
3         ['python', 'c++', 'java', 'matlab', 'aws', 'te...
4         ['bash', 'python', 'oracle', 'aws', 'ansible',...
                                ...                        
785736    ['bash', 'python', 'perl', 'linux', 'unix', 'k...
785737                       ['sas', 'sas', 'sql', 'excel']
785738                              ['powerpoint', 'excel']
785739    ['python', 'go', 'nosql', 'sql', 'mongo', 'she...
785740                                      ['aws', 'flow']
Name: job_skills, Length: 785741, dtype: object

In [11]:
# Look at the first job_skills entry (no output was recorded for this cell).
df['job_skills'][0]

In [12]:
# Let's use .apply where it actually simplifies something: the job_skills column.
# Goal: convert each string into a list.
# type() shows that a job_skills entry is stored as a string.
# NOTE(review): the recorded 'str' output predates the literal_eval cleanup in the setup cell;
# on a fresh top-to-bottom run this entry would presumably already be a list - confirm.

type(df['job_skills'][1])

str

In [13]:
# ast = Abstract Syntax Trees. ast.literal_eval takes a string (or AST node) holding a
# Python literal and converts it to the matching container type, here a list.
# Only strings are parsed: the setup cell already runs literal_eval on job_skills, so on a
# top-to-bottom run this entry is a list already and is passed through unchanged.
# (ast is imported once in the setup cell, so it is not re-imported here.)
skills_entry = df['job_skills'][1]

type(ast.literal_eval(skills_entry) if isinstance(skills_entry, str) else skills_entry)

list

In [14]:
def clean_list(skill_list):
    """Convert a stringified list of skills into a real Python list.

    Parameters
    ----------
    skill_list : str, list, None or NaN
        One ``job_skills`` entry: the string form of a list (e.g. ``"['sql', 'excel']"``),
        an already-parsed list, or a missing value.

    Returns
    -------
    list or None
        The parsed list; an already-parsed list is returned unchanged, and a
        missing value yields ``None``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid Python literal.
    """
    # Already parsed (the column may have been cleaned by an earlier cell or a re-run).
    # Calling pd.notna on a list returns an array, which cannot be used in an `if`,
    # so lists must be handled before the missing-value check.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    return None

# Overwrite job_skills with the result of running clean_list on every entry.
df['job_skills'] = df['job_skills'].apply(clean_list)

In [15]:
# Check the first row after the conversion.
df['job_skills'].head(1)

0    None
Name: job_skills, dtype: object

In [16]:
# Confirm that a converted entry is now a list.
type(df['job_skills'][1])

list

In [1]:
# Lambda version of the job_skills conversion.
# Only strings are parsed, so missing values and already-parsed lists pass through
# unchanged and the cell can be re-run safely.
# NOTE(review): this cell needs df from the setup cell; it carries execution count 1, so the
# NameError recorded below presumably came from running it before df was loaded.
df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) else skill_list
)

NameError: name 'df' is not defined

NEW EXAMPLE

Let's assume senior roles get a higher inflation rate: 5% instead of 3%.

In [88]:
def projected_salary(row):
    """Return the projected salary for one row of the jobs DataFrame.

    Rows whose 'job_title_short' contains 'Senior' are scaled by 1.05;
    every other row is scaled by 1.03.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'job_title_short' and 'salary_year_avg'.

    Returns
    -------
    The row's 'salary_year_avg' multiplied by the applicable factor.
    """
    # Pick the multiplier first, then scale once.
    multiplier = 1.05 if 'Senior' in row['job_title_short'] else 1.03
    return multiplier * row['salary_year_avg']

# axis=1 hands each row to projected_salary, so it can read several columns at once.
df['salary_year_inflated'] = df.apply(projected_salary, axis=1)

# Show only the rows that have a salary, with the title and both salary columns.
df.loc[df['salary_year_avg'].notna(), ['job_title_short', 'salary_year_avg', 'salary_year_inflated']]


Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


In [89]:
# The same row-wise projection written as a lambda: choose the multiplier
# (1.05 for 'Senior' titles, otherwise 1.03) and scale the salary once.
df['salary_year_inflated'] = df.apply(
    lambda row: (1.05 if 'Senior' in row['job_title_short'] else 1.03) * row['salary_year_avg'],
    axis=1,
)

# Show only the rows that have a salary, with the title and both salary columns.
df.loc[df['salary_year_avg'].notna(), ['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


rewatch this section at 7 hours and 40 minutes
