## Global AI Job Market & Salary Trends 2025 - Data Cleaning and Preprocessing

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

## Read the data

In [2]:
ai_data_df = r"C:\Users\LENOVO\Documents\AI_Jobs_trends\data\raw\ai_job_dataset.csv"
# Read the CSV into a DataFrame
ai_data_df = pd.read_csv(ai_data_df)
ai_data_df.head(10)

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,18/10/2024,07/11/2024,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,20/11/2024,11/01/2025,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,18/03/2025,07/04/2025,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,23/12/2024,24/02/2025,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,15/04/2025,23/06/2025,1989,6.6,Advanced Robotics
5,AI00006,AI Architect,123574,EUR,SE,CT,Germany,M,Germany,50,"Data Visualization, R, SQL, Linux",Associate,7,Healthcare,31/08/2024,04/10/2024,819,5.9,Neural Networks Co
6,AI00007,Principal Data Scientist,79670,GBP,MI,FL,United Kingdom,S,United Kingdom,0,"R, Docker, MLOps",Associate,3,Gaming,29/12/2024,28/02/2025,1936,6.3,DataVision Ltd
7,AI00008,NLP Engineer,70640,EUR,EN,FL,France,L,France,0,"Python, SQL, Computer Vision, Java, Azure",Master,0,Healthcare,07/06/2024,01/07/2024,1286,7.6,Cloud AI Solutions
8,AI00009,Data Analyst,160710,USD,SE,CT,Singapore,L,Singapore,0,"Hadoop, Git, Mathematics, Python",PhD,7,Government,04/11/2024,24/11/2024,551,9.3,Quantum Computing Inc
9,AI00010,AI Software Engineer,102557,USD,SE,PT,Austria,M,Austria,0,"MLOps, GCP, Scala, Azure, Linux",Master,5,Government,20/10/2024,06/11/2024,2340,5.8,Cloud AI Solutions


In [3]:
ai_data_df.shape

(15000, 19)

In [4]:
ai_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [5]:
ai_data_df.duplicated().sum()

0

In [6]:
ai_data_df.isna().sum()

job_id                    0
job_title                 0
salary_usd                0
salary_currency           0
experience_level          0
employment_type           0
company_location          0
company_size              0
employee_residence        0
remote_ratio              0
required_skills           0
education_required        0
years_experience          0
industry                  0
posting_date              0
application_deadline      0
job_description_length    0
benefits_score            0
company_name              0
dtype: int64

## Standardize Categorical Variables

**Re-code the experience_level column**

In [7]:
ai_data_df['experience_level'].unique()

array(['SE', 'EN', 'MI', 'EX'], dtype=object)

In [8]:
ai_data_df.loc[:,'experience_level'] = ai_data_df['experience_level'].replace({
    'SE': 'Senior',
    'EN': 'Entry level',
    'MI': 'Mid level',
    'EX': 'Expert',
    
})

In [9]:
ai_data_df['experience_level'].unique()

array(['Senior', 'Entry level', 'Mid level', 'Expert'], dtype=object)

In [10]:
ai_data_df['employment_type'].unique()

array(['CT', 'FL', 'PT', 'FT'], dtype=object)

In [11]:
ai_data_df.loc[:,'employment_type'] = ai_data_df['employment_type'].replace({
    'CT': 'Contract',
    'FL': 'Freelance',
    'FT': 'Full time',
    'PT': 'Part time',
    
})

In [12]:
ai_data_df['employment_type'].unique()

array(['Contract', 'Freelance', 'Part time', 'Full time'], dtype=object)

In [13]:
ai_data_df['remote_ratio'].unique()

array([ 50, 100,   0], dtype=int64)

In [14]:
ai_data_df.loc[:,'remote_ratio'] = ai_data_df['remote_ratio'].replace({
     0: 'Fully on site',
     50: 'Hybrid',
     100: 'Fully remote',
    
    
})

 'Hybrid']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  ai_data_df.loc[:,'remote_ratio'] = ai_data_df['remote_ratio'].replace({


In [15]:
ai_data_df['remote_ratio'].unique()

array(['Hybrid', 'Fully remote', 'Fully on site'], dtype=object)

**Re-code salary_usd and salary_currency**

In [16]:
# Convert EUR and GBP to USD
ai_data_df['salary_usd'] = ai_data_df.apply(
    lambda row: row['salary_usd'] * 1.15 if row['salary_currency'] == 'EUR'
    else row['salary_usd'] * 1.34 if row['salary_currency'] == 'GBP'
    else row['salary_usd'],
    axis=1
)

In [17]:
ai_data_df.loc[:,'salary_currency'] = ai_data_df['salary_currency'].replace({
    'USD': 'USD',
    'EUR': 'USD',
    'GBP': 'USD',
       
})

In [18]:
ai_data_df.head(10)

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376.0,USD,Senior,Contract,China,M,China,Hybrid,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,18/10/2024,07/11/2024,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895.0,USD,Entry level,Contract,Canada,M,Ireland,Fully remote,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,20/11/2024,11/01/2025,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626.0,USD,Mid level,Freelance,Switzerland,L,South Korea,Fully on site,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,18/03/2025,07/04/2025,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215.0,USD,Senior,Freelance,India,M,India,Hybrid,"Scala, SQL, Linux, Python",PhD,7,Consulting,23/12/2024,24/02/2025,1345,8.6,Future Systems
4,AI00005,AI Consultant,62817.6,USD,Entry level,Part time,France,S,Singapore,Fully remote,"MLOps, Java, Tableau, Python",Master,0,Media,15/04/2025,23/06/2025,1989,6.6,Advanced Robotics
5,AI00006,AI Architect,142110.1,USD,Senior,Contract,Germany,M,Germany,Hybrid,"Data Visualization, R, SQL, Linux",Associate,7,Healthcare,31/08/2024,04/10/2024,819,5.9,Neural Networks Co
6,AI00007,Principal Data Scientist,106757.8,USD,Mid level,Freelance,United Kingdom,S,United Kingdom,Fully on site,"R, Docker, MLOps",Associate,3,Gaming,29/12/2024,28/02/2025,1936,6.3,DataVision Ltd
7,AI00008,NLP Engineer,81236.0,USD,Entry level,Freelance,France,L,France,Fully on site,"Python, SQL, Computer Vision, Java, Azure",Master,0,Healthcare,07/06/2024,01/07/2024,1286,7.6,Cloud AI Solutions
8,AI00009,Data Analyst,160710.0,USD,Senior,Contract,Singapore,L,Singapore,Fully on site,"Hadoop, Git, Mathematics, Python",PhD,7,Government,04/11/2024,24/11/2024,551,9.3,Quantum Computing Inc
9,AI00010,AI Software Engineer,102557.0,USD,Senior,Part time,Austria,M,Austria,Fully on site,"MLOps, GCP, Scala, Azure, Linux",Master,5,Government,20/10/2024,06/11/2024,2340,5.8,Cloud AI Solutions


In [19]:
ai_data_df.to_csv("ai_jobs_cleaned_dataset.csv", index = False )