**APPLYING FUNCTION USING APPLY()**
* **Syntax:** df.apply(function/lambda...)
* **Note:** the function passed to `apply()` is usually user-defined


In [1]:
#Loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

#Loading data
# Fetch the 'lukebarousse/data_jobs' dataset and convert its 'train' split
# into a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()
df

#Clean data
# Convert job_posted_date with pd.to_datetime. Bracket indexing is used for the
# assignment because attribute-style assignment (df.col = ...) only updates a
# column when the name already exists as one; it never creates a column.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

**Exercise 1: Apply a 3% increase (multiply by 1.03) to `salary_year_avg`**

In [2]:
#Method 1: Create your own function
def inflation_calculation(salary, rate=1.03):
    """Return ``salary`` scaled by ``rate``.

    Args:
        salary: Numeric salary value (a scalar when the function is passed
            to ``Series.apply``).
        rate: Multiplier to apply. Defaults to 1.03, the factor used in the
            exercise, so calling the function with a single argument gives
            the same result as before.

    Returns:
        ``salary * rate``.
    """
    return salary * rate

# Keep only the postings that have a yearly salary; .copy() gives an
# independent frame so the new column below is not written into df.
salary_only = df.dropna(subset=['salary_year_avg']).copy()
salary_only['salary_year_inflation'] = salary_only['salary_year_avg'].apply(inflation_calculation)
salary_only.loc[:, ['salary_year_avg', 'salary_year_inflation']]

#Method 2: Using lambda
# Same 1.03 scaling, written inline instead of through a named function.
salary_only['salary_year_avg'].apply(lambda amount: amount * 1.03)


28        112785.00
77        144200.00
92        123600.00
100       235068.66
109        91670.00
            ...    
785624    143392.48
785641    154500.00
785648    228531.25
785682    162225.00
785692    162225.00
Name: salary_year_avg, Length: 22003, dtype: float64

**Exercise 2: Converting strings into lists**

In [3]:
#Method 1
import ast
# Try the conversion on one row first: literal_eval parses the string stored
# at label 2 of job_skills into the Python object it spells out.
ast.literal_eval(df.loc[2, 'job_skills'])
def list_converter(string):
    """Parse a stringified Python literal; missing values come back as None.

    Args:
        string: Text such as "['python', 'sql']", or a missing value
            (None / NaN).

    Returns:
        The object produced by ``ast.literal_eval(string)``, or ``None``
        when ``pd.notna(string)`` is false.
    """
    if not pd.notna(string):
        return None
    return ast.literal_eval(string)


# Run the converter over every job_skills entry. The result is stored in
# df_clean; df itself is not modified.
df_clean = df.loc[:, 'job_skills'].apply(list_converter)


In [4]:
#Method 2
# Same conversion as an inline lambda: missing entries are handed back
# unchanged, everything else goes through ast.literal_eval.
df['job_skills'].apply(lambda raw: raw if not pd.notna(raw) else ast.literal_eval(raw))

0                                                      None
1                [r, python, sql, nosql, power bi, tableau]
2         [python, sql, c#, azure, airflow, dax, docker,...
3         [python, c++, java, matlab, aws, tensorflow, k...
4         [bash, python, oracle, aws, ansible, puppet, j...
                                ...                        
785736    [bash, python, perl, linux, unix, kubernetes, ...
785737                               [sas, sas, sql, excel]
785738                                  [powerpoint, excel]
785739    [python, go, nosql, sql, mongo, shell, mysql, ...
785740                                          [aws, flow]
Name: job_skills, Length: 785741, dtype: object

In [5]:
# Rows of df that have a job_skills value. The converted lists above were
# never assigned back to df, so this column still shows the original strings.
df.loc[df['job_skills'].notna()]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
5,Data Engineer,GCP Data Engineer,Anywhere,via ZipRecruiter,Contractor and Temp work,True,Georgia,2023-11-07 14:01:59,False,False,United States,,,,smart folks inc,"['python', 'sql', 'gcp']","{'cloud': ['gcp'], 'programming': ['python', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785736,Software Engineer,DevOps Engineer,Singapura,melalui Trabajo.org,Pekerjaan tetap,False,Singapore,2023-03-13 06:16:16,False,False,Singapore,,,,CAREERSTAR INTERNATIONAL PTE. LTD.,"['bash', 'python', 'perl', 'linux', 'unix', 'k...","{'os': ['linux', 'unix'], 'other': ['kubernete..."
785737,Data Analyst,CRM Data Analyst,"Bad Rodach, Jerman",melalui BeBee Deutschland,Pekerjaan tetap,False,Germany,2023-03-12 06:18:18,False,False,Germany,,,,HABA FAMILYGROUP,"['sas', 'sas', 'sql', 'excel']","{'analyst_tools': ['sas', 'excel'], 'programmi..."
785738,Business Analyst,Commercial Analyst - Start Now,Malaysia,melalui Ricebowl,Pekerjaan tetap,False,Malaysia,2023-03-12 06:32:36,False,False,Malaysia,,,,Lendlease Corporation,"['powerpoint', 'excel']","{'analyst_tools': ['powerpoint', 'excel']}"
785739,Data Engineer,"Principal Associate, Data Engineer (Remote-Eli...","Newark, New Jersey, Amerika Serikat",melalui Recruit.net,Pekerjaan tetap,False,Sudan,2023-03-12 06:32:15,False,False,Sudan,,,,Capital One,"['python', 'go', 'nosql', 'sql', 'mongo', 'she...","{'cloud': ['aws', 'snowflake', 'azure', 'redsh..."


**Exercise 3: Applying different multipliers for senior and other roles**
* **For senior roles:** 1.05
* **For other roles:** 1.03


**Note:** **df['column'].str.contains()** returns a boolean mask, which can be used to filter the DataFrame down to the rows whose value in that column contains a specific string

In [None]:

# Work only on postings that report a yearly salary, as an independent copy of df.
df_salary = df.dropna(subset=['salary_year_avg']).copy()
def multiplier(row):
    """Return the adjusted yearly salary for one posting.

    Rows whose ``job_title_short`` contains "Senior" are multiplied by 1.05;
    every other row, including rows with a missing title, by 1.03.

    Args:
        row: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) providing ``job_title_short``
            and ``salary_year_avg``.

    Returns:
        ``salary_year_avg`` multiplied by the applicable factor.
    """
    title = row['job_title_short']
    # A missing title arrives as NaN/None, and `in` on a non-string raises
    # TypeError, so the substring test only runs on real strings.
    is_senior = isinstance(title, str) and "Senior" in title
    factor = 1.05 if is_senior else 1.03
    return row['salary_year_avg'] * factor


# axis=1 passes each row to multiplier() so it can read both the title and the salary.
df_salary['salary_adjusted'] = df_salary.apply(multiplier,axis = 1)

senior_df = df_salary[df_salary['job_title_short'].str.contains("Senior", case=False, na=False)]
senior_df[['job_title_short','salary_year_avg','salary_adjusted']]
# Check: keep the senior rows whose adjusted salary equals 1.05 * salary_year_avg;
# if the multiplier was applied correctly, every senior row is listed.
# The previous version called .isna() on salary_year_avg before multiplying, so it
# compared against 1.05 * <boolean mask> and could never match a real salary.
# np.isclose is used instead of == to tolerate floating-point rounding.
senior_df[np.isclose(senior_df['salary_adjusted'], 1.05 * senior_df['salary_year_avg'])]


Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,salary_adjusted
495,Senior Data Engineer,Senior Software Engineer (Data),"San Francisco, CA",via LinkedIn,Full-time,False,"Illinois, United States",2023-06-20 13:07:37,False,True,United States,year,168500.0,,hackajob,"['scala', 'gcp', 'azure', 'spark', 'kafka', 'h...","{'cloud': ['gcp', 'azure'], 'libraries': ['spa...",176925.000
573,Senior Data Engineer,Senior Python Data Engineer,"Wilmington, DE",via Indeed,Full-time,False,Sudan,2023-09-16 13:13:50,False,False,Sudan,year,160000.0,,Crackajack Solutions,"['python', 'sql', 'java', 'aws', 'databricks',...","{'cloud': ['aws', 'databricks', 'redshift'], '...",168000.000
657,Senior Data Engineer,Senior Data Engineer | Series D Video Analytic...,"Culver City, CA",via LinkedIn,Full-time,False,Georgia,2023-10-09 14:07:46,False,True,United States,year,165000.0,,Coda Search│Staffing,"['python', 'scala', 'sql', 'aws', 'redshift', ...","{'cloud': ['aws', 'redshift'], 'libraries': ['...",173250.000
726,Senior Data Engineer,Senior Data Engineer (Hybrid),"Washington, DC",via Linux Careers,Full-time,False,"California, United States",2023-05-02 13:09:09,False,True,United States,year,173500.0,,Capital One,"['java', 'scala', 'python', 'nosql', 'sql', 's...","{'cloud': ['redshift', 'snowflake', 'aws', 'az...",182175.000
733,Senior Data Engineer,Senior Data Engineer,"Oakland, CA",via LinkedIn,Full-time,False,Sudan,2023-07-06 13:41:35,False,False,Sudan,year,160000.0,,X4 Life Sciences,"['python', 'sql', 'postgresql', 'sql server', ...","{'cloud': ['aws', 'snowflake'], 'databases': [...",168000.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784947,Senior Data Engineer,Senior Data Engineer,"Montreal, QC, Canada",via Ladders,Full-time,False,Canada,2023-01-11 06:31:46,False,False,Canada,year,150000.0,,Harnham,"['python', 'aws', 'gcp', 'azure', 'redshift', ...","{'cloud': ['aws', 'gcp', 'azure', 'redshift'],...",157500.000
785106,Senior Data Engineer,Senior Data Engineer,"Washington, DC",via Ladders,Full-time,False,Sudan,2023-01-13 07:23:01,True,True,Sudan,year,125000.0,,Athletic Greens,"['sql', 'nosql', 'python', 'snowflake', 'bigqu...","{'cloud': ['snowflake', 'bigquery'], 'librarie...",131250.000
785132,Senior Data Scientist,Senior Database Designer / Data Modeler,"Mt Laurel Township, NJ",via Ai-Jobs.net,Full-time,False,Georgia,2023-07-09 06:30:42,True,False,United States,year,99150.0,,Arthur Grand Technologies Inc,"['sql', 'oracle', 'azure']","{'cloud': ['oracle', 'azure'], 'programming': ...",104107.500
785330,Senior Data Scientist,Senior Data Scientist - TikTok US - Tech Services,"Los Angeles, CA",via LinkedIn,Full-time,False,"California, United States",2023-01-11 06:20:33,False,True,United States,year,200935.5,,TikTok,"['sql', 'python', 'r']","{'programming': ['sql', 'python', 'r']}",210982.275


**EXTRA EXERCISES** 

In [43]:

# Small hand-made table for practising apply(): six job titles and their salaries.
data = dict(
    job_title=[
        "Senior Engineer",
        "Project Manager",
        "Junior Analyst",
        "Senior Data Scientist",
        "Data Engineer",
        "Software Developer Manager",
    ],
    salary=[80000, 95000, 50000, 120000, 110000, 75000],
)
df_new = pd.DataFrame(data)

#Create a function to increase the salary to Manager
def manager(row):
    """Return the row's salary, raised by a factor of 1.1 for manager titles.

    Args:
        row: Mapping-like row with ``job_title`` and ``salary`` entries.

    Returns:
        ``salary * 1.1`` when ``job_title`` contains "Manager" (case-sensitive),
        otherwise the salary unchanged.
    """
    if "Manager" not in row['job_title']:
        return row['salary']
    return row['salary']*1.1
# axis='columns' (the same as axis=1) hands each row to manager() in turn.
df_new['new_salary'] = df_new.apply(manager, axis='columns')
df_new


Unnamed: 0,job_title,salary,new_salary
0,Senior Engineer,80000,80000.0
1,Project Manager,95000,104500.0
2,Junior Analyst,50000,50000.0
3,Senior Data Scientist,120000,120000.0
4,Data Engineer,110000,110000.0
5,Software Developer Manager,75000,82500.0


In [53]:
def classification(row):
    """Label a row's salary as 'Low', 'Medium' or 'High'.

    Args:
        row: Mapping-like row with a ``salary`` entry.

    Returns:
        'Low' for salaries below 60000, 'Medium' for 60000 through 90000
        inclusive, and 'High' for everything else.
    """
    salary = row['salary']
    if salary < 60000:
        return 'Low'
    # From here on the salary is not below 60000, so only the upper bound matters.
    return 'Medium' if salary <= 90000 else 'High'
    
# Row-wise apply: each row goes through classification() and its label is stored.
df_new['classification'] = df_new.apply(classification, axis='columns')
df_new

Unnamed: 0,job_title,salary,new_salary,classification
0,Senior Engineer,80000,80000.0,Medium
1,Project Manager,95000,104500.0,High
2,Junior Analyst,50000,50000.0,Low
3,Senior Data Scientist,120000,120000.0,High
4,Data Engineer,110000,110000.0,High
5,Software Developer Manager,75000,82500.0,Medium


In [91]:
# Vectorised alternative to apply(): only rows whose title contains "Manager"
# receive a value; the assignment aligns on the index, so every other row is left as NaN.
df_new.loc[df_new['job_title'].str.contains("Manager", na=False), 'updated_salary'] = df_new['salary'].mul(1.10)
df_new


Unnamed: 0,job_title,salary,updated_salary
0,Senior Engineer,80000,
1,Project Manager,95000,104500.0
2,Junior Analyst,50000,
3,Senior Data Scientist,120000,
4,Data Engineer,110000,
5,Software Developer Manager,75000,82500.0
