In [52]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the "lukebarousse/data_jobs" dataset through datasets.load_dataset.
# NOTE(review): presumably this fetches from the Hugging Face Hub on first use — confirm.
print("Loading the Dataset...")
dataset = load_dataset("lukebarousse/data_jobs")
print("Dataset Loaded Successfully...")

# Convert the "train" split to a pandas DataFrame; later cells work on `df`.
df = dataset["train"].to_pandas()
print("Data Frame Created...")

############ Cleaning
# The cleaned frame is built in a single chain so that `df` ends up as a fresh
# DataFrame. Filtering with boolean masks and then assigning a new column to the
# filtered result (as a step-by-step version would) is the pattern that makes
# pandas emit SettingWithCopyWarning; the chain also avoids inplace=True.
df = (
    # Parse the posting timestamp so the .dt accessor can be used further down.
    df.assign(job_posted_date=lambda frame: pd.to_datetime(frame["job_posted_date"]))
    # The hourly salary column is discarded; the cells shown here use salary_year_avg.
    .drop(columns=["salary_hour_avg"])
    # Keep only rows that have both a short job title and a yearly salary.
    .dropna(subset=["job_title_short", "salary_year_avg"])
    # Month number of the posting date, taken from the parsed timestamp.
    .assign(joining_month=lambda frame: frame["job_posted_date"].dt.month)
)
print("Data Cleaned...")

Loading the Dataset...
Dataset Loaded Successfully...
Data Frame Created...
Data Cleaned...


In [53]:
# apply() runs a function on each row of a DataFrame, on each column of a DataFrame, or on each value of a single column (Series).

# Let's say we need to increase every salary_year_avg value by 3%.

def increment_by_three_percent(salary):
    """Return ``salary`` raised by 3%.

    Takes a single salary value and returns that value multiplied by 1.03.
    """
    raise_factor = 1.03
    return salary * raise_factor

# Series.apply calls increment_by_three_percent once per salary_year_avg value;
# the results are stored in a new column.
df["incremented_salary_year_avg"] = df["salary_year_avg"].apply(increment_by_three_percent)

# NOTES
# apply() works like a forEach() loop, but is more powerful: it can iterate over each row, each column, or the values in a specific column.

In [54]:
# Exercise 2
# Now let's say we need to increase salary_year_avg by 5% for senior roles and by 3% for all others.
# To do this, we check whether the word "Senior" appears in job_title_short and, if so, raise the salary by 5%.

# Remove the incremented_salary_year_avg column before redoing the exercise.
# errors="ignore" lets this cell be re-run when the column is already gone
# instead of raising KeyError; the result is reassigned rather than using inplace=True.
df = df.drop(columns="incremented_salary_year_avg", errors="ignore")

In [55]:
# Lets implement the actual logic here
def increment_avg_salary(row):
    """Return the row's yearly salary after the raise for its role level.

    ``row`` must support lookup by the keys "job_title_short" and
    "salary_year_avg". Rows whose "job_title_short" contains the text "Senior"
    get a 5% raise; every other row gets 3%.
    """
    is_senior = "Senior" in row["job_title_short"]
    raise_factor = 1.05 if is_senior else 1.03
    return row["salary_year_avg"] * raise_factor

# axis=1 makes apply() call the function once per row, passing that row as a Series,
# which lets increment_avg_salary read both job_title_short and salary_year_avg.
df["increment_salary_year_avg"] = df.apply(increment_avg_salary, axis=1)
# Display the frame to check the new column.
df

# To run a function on each column instead, use axis=0 (the default); the function then receives one column at a time.
# Aggregate functions like sum(), min(), max(), count(), mean(), median(), etc. are typically applied this way (per column); with axis=1 they aggregate across each row instead.


Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,company_name,job_skills,job_type_skills,joining_month,increment_salary_year_avg
28,Data Scientist,CRM Data Specialist,"San José Province, San José, Costa Rica",via Ai-Jobs.net,Full-time,False,Costa Rica,2023-08-01 13:37:57,False,False,Costa Rica,year,109500.0,Netskope,"['gdpr', 'excel']","{'analyst_tools': ['excel'], 'libraries': ['gd...",8,112785.00
77,Data Engineer,Data Engineer,"Arlington, VA",via LinkedIn,Full-time,False,Sudan,2023-06-26 14:22:54,False,False,Sudan,year,140000.0,Intelletec,"['mongodb', 'mongodb', 'python', 'r', 'sql', '...","{'analyst_tools': ['tableau'], 'cloud': ['orac...",6,144200.00
92,Data Engineer,Remote - Data Engineer - Permanent - W2,Anywhere,via LinkedIn,Full-time,True,"Illinois, United States",2023-02-21 13:29:59,False,True,United States,year,120000.0,Apex Systems,"['sql', 'python']","{'programming': ['sql', 'python']}",2,123600.00
100,Data Scientist,"Data Scientist, Risk Data Mining - USDS","Mountain View, CA",via LinkedIn,Full-time,False,"California, United States",2023-07-31 13:01:18,False,True,United States,year,228222.0,TikTok,"['sql', 'r', 'python', 'express']","{'programming': ['sql', 'r', 'python'], 'webfr...",7,235068.66
109,Data Analyst,Senior Supply Chain Analytics Analyst,Anywhere,via Get.It,Full-time,True,"Illinois, United States",2023-10-12 13:02:19,False,True,United States,year,89000.0,Get It Recruit - Transportation,"['python', 'r', 'alteryx', 'tableau']","{'analyst_tools': ['alteryx', 'tableau'], 'pro...",10,91670.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785624,Data Engineer,Data Analytics Engineer (Hybrid),"Mt Prospect, IL",via Ai-Jobs.net,Full-time,False,"Illinois, United States",2023-08-31 06:02:16,False,True,United States,year,139216.0,Bosch Group,"['go', 'python', 'r', 'sql', 'oracle', 'window...","{'analyst_tools': ['alteryx', 'power bi', 'tab...",8,143392.48
785641,Data Engineer,Data Engineer,"New York, NY",via Dice,Full-time,False,Georgia,2023-01-04 16:36:07,True,False,United States,year,150000.0,"Engage Partners, Inc.",,,1,154500.00
785648,Data Scientist,Director Data Scientist - Commercial Platforms...,"Pleasant Hill, CA",via Ai-Jobs.net,Full-time,False,"California, United States",2023-04-12 06:02:51,False,True,United States,year,221875.0,84.51°,"['python', 'azure', 'snowflake', 'spark']","{'cloud': ['azure', 'snowflake'], 'libraries':...",4,228531.25
785682,Data Scientist,Data Scientist für datengetriebene Entwicklung...,"Reutlingen, Germany",via Ai-Jobs.net,Full-time,False,Germany,2023-03-04 06:16:08,False,False,Germany,year,157500.0,Bosch Group,"['python', 'hadoop', 'spark', 'airflow', 'kube...","{'libraries': ['hadoop', 'spark', 'airflow'], ...",3,162225.00


In [56]:
import ast
def str_to_list(str_list):
    """Parse a stringified Python literal (e.g. "['sql', 'python']") into its value.

    Only ``str`` inputs are parsed, using ``ast.literal_eval``. Anything else is
    returned unchanged: missing values (None / NaN) stay missing, and a value
    that is already a list passes through. The pass-through makes the function
    safe to apply to a column that has already been converted, so re-running
    the conversion does not raise.

    Raises whatever ``ast.literal_eval`` raises (ValueError / SyntaxError) when
    a string is not a valid Python literal.
    """
    # Checking for str (rather than "not NA") avoids calling pd.notna on a list,
    # which returns an array and cannot be used as an `if` condition.
    if isinstance(str_list, str):
        return ast.literal_eval(str_list)
    return str_list

# Run str_to_list on every job_skills value and overwrite the column with the results.
df["job_skills"] = df["job_skills"].apply(str_to_list)


In [57]:
# Spot-check the job_skills value stored at index label 28.
df.loc[28, "job_skills"]

['gdpr', 'excel']