# What are the most demanded skills for the top 3 most popular data roles?


### Methodology

1. Clean up skill column
2. Calculate skill count based on `job_title_short`
3. Calculate skill percentage
4. Plot final findings

In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import ast 
import seaborn as sns

#Loading Data
df = pd.read_csv('/Users/robertocortez/Desktop/data_jobs.csv')

# Data Cleanup 
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

# Converting job_skills column from str to list
df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) and pd.notna(skill_list) else skill_list
) 

In [3]:
# Creating a new dataframe that filters for only jobs in the United States

df_US = df[df['job_country'] == 'United States'].copy()


In [5]:
# Using .explode() to seperate out the skills column
df_skills = df_US.explode('job_skills')
df_skills[['job_title_short', 'job_skills']]

Unnamed: 0,job_title_short,job_skills
0,Senior Data Engineer,
3,Data Engineer,python
3,Data Engineer,c++
3,Data Engineer,java
3,Data Engineer,matlab
...,...,...
785692,Data Scientist,r
785703,Data Analyst,
785705,Data Analyst,sql
785705,Data Analyst,python


In [9]:
# Using groupby to count the number of times a skill is mentioned in the job_skills column

df_skills_grouped = df_skills.groupby(['job_skills', 'job_title_short']).size()
df_skills_grouped = df_skills_grouped.reset_index(name='skill_count')
df_skills_grouped.sort_values(by='skill_count', ascending=False, inplace=True) 

df_skills_grouped

Unnamed: 0,job_skills,job_title_short,skill_count
1209,python,Data Scientist,42379
1521,sql,Data Analyst,34452
1523,sql,Data Scientist,30034
455,excel,Data Analyst,27519
1243,r,Data Scientist,26022
...,...,...,...
245,clojure,Software Engineer,1
1738,vb.net,Senior Data Scientist,1
530,fortran,Machine Learning Engineer,1
1116,planner,Cloud Engineer,1


In [14]:
# Creating a list of the top 3 job titles
job_titles = df_skills_grouped['job_title_short'].unique().tolist()
job_titles = sorted(job_titles[:3])
job_titles

['Data Analyst', 'Data Engineer', 'Data Scientist']

In [15]:

fig, ax - plt.subplots(len(job_titles), 1)

for i, title in enumerate(job_titles):
    data = df_skills_grouped[df_skills_grouped['job_title_short'] == title].head(10)
    sns.barplot(x='skill_count', y='job_skills', data=data, ax=ax[i])
    ax[i].set_title(title)

NameError: name 'fig' is not defined