In [43]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the job-postings CSV; the relative path is resolved against the
# current working directory.
ai_data_jobs = pd.read_csv("ai_jobs_dataset.csv")
# Print every column's dtype and non-null count for a first look at the schema.
ai_data_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [44]:
# Tally the null entries in every column (isna is the pandas alias of isnull).
print(ai_data_jobs.isna().sum())

job_id                    0
job_title                 0
salary_usd                0
salary_currency           0
experience_level          0
employment_type           0
company_location          0
company_size              0
employee_residence        0
remote_ratio              0
required_skills           0
education_required        0
years_experience          0
industry                  0
posting_date              0
application_deadline      0
job_description_length    0
benefits_score            0
company_name              0
dtype: int64


In [45]:
# Count the postings whose job_title is exactly 'Data Analyst' by summing
# the boolean match mask instead of materialising the filtered frame.
data_analyst_count = int(ai_data_jobs['job_title'].eq('Data Analyst').sum())
print(f"Number of 'Data Analyst' roles: {data_analyst_count}")

Number of 'Data Analyst' roles: 759


In [46]:
# Columns retained for the rest of the analysis; every other column is dropped.
columns_to_keep = [
    'job_title',
    'salary_usd',
    'employment_type',
    'company_location',
    'remote_ratio',
    'employee_residence',
    'required_skills',
    'education_required',
    'posting_date',
    'company_name',
]

# Narrow ai_data_jobs to those columns. The name is rebound here, so the
# full-width frame is no longer reachable through it after this cell runs.
ai_data_jobs = ai_data_jobs[columns_to_keep]
print(ai_data_jobs.columns)

Index(['job_title', 'salary_usd', 'employment_type', 'company_location',
       'remote_ratio', 'employee_residence', 'required_skills',
       'education_required', 'posting_date', 'company_name'],
      dtype='object')


In [47]:
# Rename four columns; all other columns keep their existing names.
# NOTE(review): 'is_remote' carries the old remote_ratio integers (the sample
# rows shown further down include 0 and 50), so it is not a boolean flag —
# confirm the name is intended.
ai_data_jobs = ai_data_jobs.rename(
    columns=dict(
        remote_ratio='is_remote',
        employee_residence='employee_location',
        required_skills='job_skills',
        education_required='degree_required',
    )
)
print(ai_data_jobs.columns)

Index(['job_title', 'salary_usd', 'employment_type', 'company_location',
       'is_remote', 'employee_location', 'job_skills', 'degree_required',
       'posting_date', 'company_name'],
      dtype='object')


In [48]:
# Independent copy used for the Data Analyst / degree analysis, so later
# changes to degree_jobs leave ai_data_jobs untouched.
degree_jobs = ai_data_jobs.copy(deep=True)

In [49]:
# Number of postings per job title, most frequent first.
degree_jobs.loc[:, 'job_title'].value_counts()

job_title
Machine Learning Researcher    808
AI Software Engineer           784
Autonomous Systems Engineer    777
Machine Learning Engineer      772
AI Architect                   771
Head of AI                     765
NLP Engineer                   762
Robotics Engineer              759
Data Analyst                   759
AI Research Scientist          756
Data Engineer                  749
AI Product Manager             743
Research Scientist             742
Principal Data Scientist       734
AI Specialist                  728
ML Ops Engineer                725
Computer Vision Engineer       724
Data Scientist                 720
Deep Learning Engineer         718
AI Consultant                  704
Name: count, dtype: int64

In [50]:
# Keep only the 'Data Analyst' postings. The explicit .copy() makes
# degree_jobs an independent frame rather than a filtered selection of the
# previous one, so the column assignments in later cells (job_skills,
# has_target) do not trigger pandas' SettingWithCopyWarning.
degree_jobs = degree_jobs[degree_jobs['job_title'] == 'Data Analyst'].copy()
degree_jobs.head()

Unnamed: 0,job_title,salary_usd,employment_type,company_location,is_remote,employee_location,job_skills,degree_required,posting_date,company_name
8,Data Analyst,160710,CT,Singapore,0,Singapore,"Hadoop, Git, Mathematics, Python",PhD,2024-11-04,Quantum Computing Inc
27,Data Analyst,52997,PT,Austria,0,Singapore,"Mathematics, Kubernetes, TensorFlow, Tableau, ...",PhD,2025-01-10,Predictive Systems
80,Data Analyst,157597,CT,France,50,France,"Git, Kubernetes, TensorFlow, R, Java",Bachelor,2024-08-31,Neural Networks Co
91,Data Analyst,97997,CT,France,50,France,"Hadoop, Python, TensorFlow, R, Statistics",PhD,2024-06-19,Algorithmic Solutions
96,Data Analyst,109029,FT,France,0,France,"Kubernetes, Scala, Java",Bachelor,2025-04-05,Predictive Systems


In [51]:
# Frequency of each distinct job_skills value. At this point the column still
# holds the raw comma-separated string, so the counts are per exact
# combination of skills rather than per individual skill.
degree_jobs.loc[:, 'job_skills'].value_counts()

job_skills
Python, TensorFlow, Data Visualization    3
Python, TensorFlow, PyTorch               2
Linux, AWS, Azure                         2
Linux, Python, NLP                        2
Linux, SQL, Python, Hadoop                2
                                         ..
TensorFlow, Azure, SQL, Tableau           1
Python, Java, Tableau, Deep Learning      1
Spark, Linux, Docker                      1
R, Tableau, Git, Python, Java             1
Python, Kubernetes, Scala, Linux          1
Name: count, Length: 753, dtype: int64

In [52]:
# Check row count, dtypes and non-null counts after filtering to Data Analyst rows.
degree_jobs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 759 entries, 8 to 14982
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   job_title          759 non-null    object
 1   salary_usd         759 non-null    int64 
 2   employment_type    759 non-null    object
 3   company_location   759 non-null    object
 4   is_remote          759 non-null    int64 
 5   employee_location  759 non-null    object
 6   job_skills         759 non-null    object
 7   degree_required    759 non-null    object
 8   posting_date       759 non-null    object
 9   company_name       759 non-null    object
dtypes: int64(2), object(8)
memory usage: 65.2+ KB


In [53]:
# Convert job_skills in degree_jobs from a comma-separated string to a list.


def _split_skills(raw_skills):
    """Return the skills in ``raw_skills`` as a list of stripped names.

    A string is split on commas and each piece is stripped of surrounding
    whitespace. Any other value (for example the list produced by an earlier
    run of this cell) is returned unchanged, so re-running the cell does not
    raise AttributeError.
    """
    if isinstance(raw_skills, str):
        return [skill.strip() for skill in raw_skills.split(',')]
    return raw_skills


degree_jobs['job_skills'] = degree_jobs['job_skills'].apply(_split_skills)

In [54]:
# Skills of interest; matching is exact and case-sensitive.
target_skills = {
    'Python',
    'SQL',
    'Tableau',
}


def has_target_skills(skills):
    """Return True when at least one entry of ``skills`` is a target skill.

    Args:
        skills: Iterable of skill names to check against ``target_skills``.

    Returns:
        bool: True if any element of ``skills`` is in ``target_skills``,
        False otherwise (including when ``skills`` is empty).
    """
    # The sets overlap exactly when they are not disjoint.
    return not target_skills.isdisjoint(skills)

# Flag each posting whose job_skills contains at least one target skill,
# then list the columns to confirm the new one was added.
degree_jobs['has_target'] = degree_jobs['job_skills'].map(has_target_skills)
print(degree_jobs.columns)

Index(['job_title', 'salary_usd', 'employment_type', 'company_location',
       'is_remote', 'employee_location', 'job_skills', 'degree_required',
       'posting_date', 'company_name', 'has_target'],
      dtype='object')


In [55]:
# How many postings do / do not mention a target skill.
print(degree_jobs.loc[:, 'has_target'].value_counts())

has_target
True     436
False    323
Name: count, dtype: int64
