# AI Job Dashboard with Plotly Dash
## ABB #3 - Session 1

Code authored by: Shaw Talebi

### imports

In [1]:
import pandas as pd
import re
from collections import Counter

import plotly.express as px
from dash import dcc, html, Dash

### 1) import data

In [2]:
# Load the job postings and discard every row that has a missing value
df = pd.read_csv("data/ai_job_data.csv").dropna()
df.head()

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max
0,The Walt Disney Company,Software Engineer II,Job Posting Title:Software Engineer IIReq ID:1...,114900.0,154100.0
1,Leidos,Signal Processing Software Engineer,Do you want to join a high performing team tha...,67600.0,122200.0
3,Crinetics Pharmaceuticals,"Director, Data Management NDC",Crinetics is a pharmaceutical company based in...,189000.0,236000.0
4,Crinetics Pharmaceuticals,Senior Clinical Data Management Manager,Crinetics is a pharmaceutical company based in...,114000.0,143000.0
5,Geosyntec Consultants,Data Management Intern,Overview Do you want to build an impactful car...,40220.0,59000.0


### 2) pre-process data

#### standardize job titles

In [3]:
def standardize_job_title(title):
    """
    Map a raw job title to one of a fixed set of role categories.

    The title is lower-cased and checked against an ordered list of
    (category, keywords) rules. The first rule with a matching keyword wins,
    so earlier categories take precedence (e.g. "Data Management Intern" is
    classified as "Data Governance & Management", not "Intern / Apprentice").

    Most keywords are matched as plain substrings. The short tokens "ai",
    "ml" and "intern" are matched as whole words (with an optional plural
    "s") so they do not fire inside unrelated words such as "maintenance",
    "html" or "internal".

    Args:
        title (str): The raw job title.

    Returns:
        str: The standardized category, or "Other" when no rule matches.

    Example Usage:
      titles = ["Software Engineer II", "Data Engineer"]
      standardized_titles = [standardize_job_title(title) for title in titles]
    """
    # Ordered rules: the position in this list defines the precedence.
    rules = [
        ("Data Engineer", ["data engineer", "databricks", "pipeline"]),
        ("Software Engineer", ["software engineer", "developer"]),
        ("Machine Learning Engineer", ["machine learning", "ai", "ml", "model"]),
        ("Data Analyst", ["data analyst", "analytics consultant", "business intelligence"]),
        ("Data Governance & Management", ["data governance", "data management", "data strategy"]),
        ("Business Analyst", ["business analyst", "solution consultant", "business execution", "operations"]),
        ("Director / Executive", ["director", "head", "principal"]),
        ("Systems & Robotics Engineer", ["systems engineer", "mechatronics", "robotics", "signal processing"]),
        ("Healthcare & Compliance Analyst", ["clinical", "healthcare", "policy", "sanctions screening", "compliance"]),
        # "internship" is listed explicitly because "intern" is whole-word only
        ("Intern / Apprentice", ["intern", "internship", "apprentice"]),
    ]

    # Tokens short or ambiguous enough to appear inside unrelated words.
    whole_word_keywords = {"ai", "ml", "intern"}

    def matches(keyword, text):
        # Whole-word match for ambiguous tokens, substring match otherwise.
        if keyword in whole_word_keywords:
            return re.search(rf"\b{re.escape(keyword)}s?\b", text) is not None
        return keyword in text

    title = title.lower()
    for category, keywords in rules:
        if any(matches(keyword, title) for keyword in keywords):
            return category
    return "Other"

In [4]:
# Map every raw job title onto its standardized category
df['standardized_job_title'] = df['job_title'].apply(standardize_job_title)

# Compare how many distinct titles exist before and after standardizing
num_raw_titles = df['job_title'].nunique()
num_standardized_titles = df['standardized_job_title'].nunique()
print(f"Num raw job titles: {num_raw_titles}")
print(f"Num standardized job titles: {num_standardized_titles}")

Num raw job titles: 42
Num standardized job titles: 9


In [5]:
# mean of the max salary for each standardized job title, lowest first
s_jobs = (
    df.groupby('standardized_job_title')['salary_max']
    .mean()
    .sort_values()
)

# convert to a two-column dataframe with display-friendly headers
df_jobs = s_jobs.rename_axis("Job Title").reset_index(name="Mean Salary")
df_jobs.head()

Unnamed: 0,Job Title,Mean Salary
0,Other,67290.0
1,Systems & Robotics Engineer,98560.0
2,Software Engineer,136242.666667
3,Data Analyst,138002.0
4,Business Analyst,144546.0


#### extract common skills from job descriptions (JDs)

In [6]:
def extract_skills(description):
    """
    Extracts AI-related skills from a given job description.

    Args:
        description (str): The job description text to search for skills.

    Returns:
        list: The skills found in the description, in the order they appear
        in the predefined skill list (not in order of appearance in the text).

    Notes:
        - The function defines a list of common AI-related skills, including programming languages, frameworks,
          cloud platforms, and statistical concepts.
        - The input description is converted to lowercase to ensure case-insensitive matching.
        - Skills are matched as whole words, avoiding partial matches (e.g. "spark" will not match "sparking").
        - A word boundary is only required on a side of the skill that is a word character. A plain "\\b" after
          "c++" would demand a letter or digit right after the "+", so "C++" followed by a space, comma or the
          end of the text would never be detected.
    """

    # Define a list of common AI-related skills
    skills_list = [
        "python", "r", "java", "c++", "sql", "scala", "spark", "hadoop", "tensorflow", "pytorch",
        "keras", "scikit-learn", "machine learning", "deep learning", "nlp", "natural language processing",
        "computer vision", "data analysis", "data engineering", "big data", "ai", "artificial intelligence",
        "cloud", "aws", "azure", "gcp", "docker", "kubernetes", "linux", "flask", "django", "pandas",
        "numpy", "matplotlib", "seaborn", "plotly", "etl", "api", "statistics", "probability", "regression",
        "classification", "clustering", "time series", "neural networks", "bayesian methods", "git", "mlops"
    ]

    def skill_pattern(skill):
        # Anchor with "\b" only where the skill itself has a word character;
        # on a punctuation edge fall back to "not preceded by a word char"
        # (left) or no constraint (right, so "c++" and "c++11" both match).
        prefix = r"\b" if re.match(r"\w", skill) else r"(?<!\w)"
        suffix = r"\b" if re.search(r"\w$", skill) else ""
        return prefix + re.escape(skill) + suffix

    description = description.lower()
    found_skills = [skill for skill in skills_list if re.search(skill_pattern(skill), description)]

    return found_skills

# notice we could have done something similar to get standardized job titles

In [7]:
# Run skill extraction over every job description (each value coerced to str first)
job_descriptions = df['job_description']
df['extracted_skills'] = job_descriptions.apply(lambda text: extract_skills(str(text)))
df['extracted_skills'].head()

0    [python, java, sql, tensorflow, pytorch, sciki...
1    [python, java, tensorflow, pytorch, scikit-lea...
3                                                [gcp]
4                                                [gcp]
5                    [python, r, sql, etl, statistics]
Name: extracted_skills, dtype: object

In [8]:
# flatten the per-posting skill lists into one list
all_skills = []
for posting_skills in df['extracted_skills']:
    all_skills.extend(posting_skills)

# count skill occurrences
skill_counts = Counter(all_skills)

# tabulate the counts, least frequent skill first
df_skills = pd.DataFrame(list(skill_counts.items()), columns=["Skill", "Count"])
df_skills = df_skills.sort_values(by="Count")
df_skills.head()

Unnamed: 0,Skill,Count
31,numpy,1
29,time series,1
32,plotly,1
22,nlp,1
33,git,1


### 3) create dashboard

In [9]:
# Create the plots

# Main chart: salary_max plotted against the standardized job title, colored
# by company, with the rows of df ordered from highest to lowest salary_max.
bar_chart = dcc.Graph(
    id='top-roles',
    figure=px.bar(
        df.sort_values(by='salary_max', ascending=False),
        x='standardized_job_title',
        y='salary_max',
        color='company_name',
        title='Highest Paying AI Jobs',
        # label keys must be the plotted column names, so the x-axis label is
        # keyed on 'standardized_job_title' rather than the raw 'job_title'
        labels={'salary_max': 'Maximum Salary', 'standardized_job_title': 'Job Title'},
        height=500,
    ),
)

# Horizontal bars for the last five rows of df_jobs (df_jobs is sorted by
# ascending mean salary, so these are the five best-paid roles)
top_jobs_fig = px.bar(
    df_jobs.tail(5),
    x='Mean Salary',
    y='Job Title',
    title='Top 5 Roles',
    height=250,
)

# Shrink axis titles (12) and tick labels (10) to suit the compact chart
top_jobs_fig.update_layout(
    xaxis_title_font_size=12,
    xaxis_tickfont_size=10,
    yaxis_title_font_size=12,
    yaxis_tickfont_size=10,
)

top_jobs_chart = dcc.Graph(id='top-jobs-chart', figure=top_jobs_fig)

# Horizontal bars for the last five rows of df_skills (df_skills is sorted by
# ascending count, so these are the five most frequent skills)
top_skills_fig = px.bar(
    df_skills.tail(5),
    x='Count',
    y='Skill',
    title='Top 5 Skills',
    height=250,
)

# Shrink axis titles (12) and tick labels (10) to suit the compact chart
top_skills_fig.update_layout(
    xaxis_title_font_size=12,
    xaxis_tickfont_size=10,
    yaxis_title_font_size=12,
    yaxis_tickfont_size=10,
)

top_skills_chart = dcc.Graph(id='top-skills-chart', figure=top_skills_fig)

In [10]:
# Initialize the Dash app
app = Dash(__name__)

# App layout: the main chart takes the left 70% of the width and the two
# summary charts are stacked in the right 30%.
# Dash style dictionaries use camelCased CSS property names, hence
# 'verticalAlign' rather than 'vertical-align'.
app.layout = html.Div([
    html.Div(bar_chart, style={'width': '70%', 'display': 'inline-block', 'verticalAlign': 'top'}),
    html.Div([
        html.Div(top_jobs_chart, style={'height': '50%'}),
        html.Div(top_skills_chart, style={'height': '50%'})
    ], style={'width': '30%', 'display': 'inline-block', 'verticalAlign': 'top'})
])

# Run the app (jupyter_mode="external": the app is served at a local URL
# that is opened in the browser instead of being rendered inline)
app.run(jupyter_mode="external")

Dash app running on http://127.0.0.1:8050/
