In [2]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import pandas as pd
import os
from dotenv import load_dotenv

print("Existent directory:", os.getcwd())

print("\nFiles in this folder:")
print(os.listdir())

from pathlib import Path
from dotenv import load_dotenv

load_dotenv(dotenv_path='.env')

APP_ID = os.getenv("ADZUNA_APP_ID")
APP_KEY = os.getenv("ADZUNA_APP_KEY")


url_2 = "https://api.adzuna.com/v1/api/jobs/gb/search/{}"

jobs = []

for page in range(1, 21):
    url = url_2.format(page)  # até 1000 resultados
    params = {
        "app_id": APP_ID,
        "app_key": APP_KEY,
        "results_per_page": 50,
        "what": "data analyst",
        "sort_by": "date"
    }

    response = requests.get(url, params=params)
    
    # 💡 debug opcional
    print(f"Page {page} | Status: {response.status_code}")
    
    try:
        data = response.json()
    except Exception as e:
        print(f"Erro ao processar JSON na página {page}: {e}")
        print("Resposta:", response.text[:500])
        break

    results = data.get("results", [])
    if not results:
        print("Sem mais resultados.")
        break

    for job in results:
        desc = job.get("description", "").lower()
        loc = job.get("location", {}).get("display_name", "").lower()

        if "remote" in desc or "remote" in loc:
            job_type = "Remote"
        elif "hybrid" is desc:
            job_type = "Hybrid"
        else:
            job_type = "Onsite"


        jobs.append({
            "Date" : job.get("created"),
            "Title": job.get("title"),
            "Company": job.get("company", {}).get("display_name"),
            "Location": job.get("location", {}).get("display_name"),
            "Salary_Min": job.get("salary_min"),
            "Salary_Max": job.get("salary_max"),
            "Salary_Avg": job.get("salary_max") or job.get("salary_min"),
            "Salary_Is_Predicted": job.get("salary_is_predicted"),
            "Category": job.get("category", {}).get("label"),
            "Description": job.get("description"),
            "Word_Mode": job_type,
            "URL": job.get("redirect_url")
        })

adzuna2_df = pd.DataFrame(jobs)
adzuna2_df.to_csv("adzuna2_jobs.csv", index=False)
print(adzuna2_df.head())



Existent directory: c:\Users\sofia\Documents\Data_Analytics_Ironhack\Projects\the-professor-project-1-ironhack

Files in this folder:
['.env', '.git', '.gitignore', 'adzuna2_jobs.csv', 'adzuna_find.py', 'adzuna_international_jobs.csv', 'adzuna_jobs.csv', 'data_analyst_international_jobs.csv', 'find_jobs.ipynb', 'jobicy.py', 'jobicy_jobs.csv', 'jobs.ipynb', 'kaggle.ipynb', 'kaggle_related_skills.csv', 'README.md', 'remoteok_jobs.csv', 'Remote_Tech_Job_Salaries.ipynb']


  elif "hybrid" is desc:


Page 1 | Status: 200
Page 2 | Status: 200
Page 3 | Status: 200
Page 4 | Status: 200
Page 5 | Status: 200
Page 6 | Status: 200
Page 7 | Status: 200
Page 8 | Status: 200
Page 9 | Status: 200
Page 10 | Status: 200
Page 11 | Status: 200
Page 12 | Status: 200
Page 13 | Status: 200
Page 14 | Status: 200
Page 15 | Status: 200
Page 16 | Status: 200
Page 17 | Status: 200
Page 18 | Status: 200
Page 19 | Status: 200
Page 20 | Status: 200
                   Date                     Title  \
0  2025-05-20T20:09:50Z       Team Lead (Finance)   
1  2025-05-20T20:09:11Z              Data Analyst   
2  2025-05-20T19:49:22Z       Junior Data Analyst   
3  2025-05-20T19:49:22Z        Asset Data Analyst   
4  2025-05-20T17:50:42Z  Healthcare Analyst (FTC)   

                            Company                        Location  \
0                         Bloomberg                      London, UK   
1                Corecom Consulting           Leeds, West Yorkshire   
2          Vehicle Data Global Ltd.  

In [4]:
adzuna2_df.head()

Unnamed: 0,Date,Title,Company,Location,Salary_Min,Salary_Max,Salary_Avg,Salary_Is_Predicted,Category,Description,Word_Mode,URL
0,2025-05-20T20:09:50Z,Team Lead (Finance),Bloomberg,"London, UK",52996.15,52996.15,52996.15,1,IT Jobs,Bloomberg runs on data. Our products are fuele...,Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080436...
1,2025-05-20T20:09:11Z,Data Analyst,Corecom Consulting,"Leeds, West Yorkshire",50000.0,50000.0,50000.0,0,IT Jobs,"Data Analyst - Up to £50,000 Hybrid Working (...",Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080431...
2,2025-05-20T19:49:22Z,Junior Data Analyst,Vehicle Data Global Ltd.,"Chesterfield, Derbyshire",24000.0,24000.0,24000.0,0,IT Jobs,"Junior Data Analyst / Starting salary of £24,0...",Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080270...
3,2025-05-20T19:49:22Z,Asset Data Analyst,Great Places Housing Association,"Chorlton Cum Hardy, Manchester",35000.0,35000.0,35000.0,0,IT Jobs,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080270...
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GlobalData PLC,"Fleet Street, Central London",43779.34,43779.34,43779.34,1,IT Jobs,Who we are… GlobalData is a specialist informa...,Onsite,https://www.adzuna.co.uk/jobs/details/52076740...


In [5]:
adzuna2_df.shape

(1000, 12)

In [6]:
adzuna2_df.columns

Index(['Date', 'Title', 'Company', 'Location', 'Salary_Min', 'Salary_Max',
       'Salary_Avg', 'Salary_Is_Predicted', 'Category', 'Description',
       'Word_Mode', 'URL'],
      dtype='object')

In [7]:
adzuna2_df.describe()

Unnamed: 0,Salary_Min,Salary_Max,Salary_Avg
count,1000.0,1000.0,1000.0
mean,46145.57552,49753.18452,49753.18452
std,20324.637833,21887.256836,21887.256836
min,0.0,56.0,56.0
25%,35000.0,36969.415,36969.415
50%,44399.94,45966.545,45966.545
75%,53606.18,55792.6825,55792.6825
max,208000.0,221000.0,221000.0


In [8]:
adzuna2_df["Location"].value_counts()

Location
London, UK                        245
UK                                132
Bristol, South West England        22
Leeds, West Yorkshire              21
Manchester, Greater Manchester     21
                                 ... 
Blackpool Airport, Blackpool        1
Harmondsworth, West Drayton         1
Coventry, West Midlands             1
Hartlepool, County Durham           1
Covent Garden, Central London       1
Name: count, Length: 307, dtype: int64

In [9]:
adzuna2_df["Word_Mode"].value_counts()

Word_Mode
Onsite    908
Remote     92
Name: count, dtype: int64

In [10]:
load_dotenv(dotenv_path=Path().resolve().parent.parent / ".env")

APP_ID = os.getenv("ADZUNA_APP_ID")
APP_KEY = os.getenv("ADZUNA_APP_KEY")

if not APP_ID or not APP_KEY:
    raise ValueError("Missing API credentials in .env")

countries = ["gb", "nl", "de", "fr", "us"]
all_jobs = []

for country in countries:
    for page in range(1, 6):
        url = f"https://api.adzuna.com/v1/api/jobs/{country}/search/{page}"
        params = {
            "app_id": APP_ID,
            "app_key": APP_KEY,
            "results_per_page": 50,
            "what": "",
            "sort_by": "date"
        }

        response = requests.get(url, params=params)
        print(f"{country.upper()} | Page {page} | Status: {response.status_code}")

        try:
            data = response.json()
        except Exception as e:
            print(f"Erro ao processar JSON na página {page} ({country}): {e}")
            print("Resposta:", response.text[:300])
            break

        results = data.get("results", [])
        if not results:
            print("Sem mais resultados.")
            break

        for job in results:
            desc = job.get("description", "").lower()
            loc = job.get("location", {}).get("display_name", "").lower()

            if "remote" in desc or "remote" in loc:
                job_type = "Remote"
            elif "hybrid" in desc:
                job_type = "Hybrid"
            else:
                job_type = "Onsite"

            all_jobs.append({
                "Country": country.upper(),
                "Title": job.get("title"),
                "Company": job.get("company", {}).get("display_name"),
                "Location": job.get("location", {}).get("display_name"),
                "Salary_Min": job.get("salary_min"),
                "Salary_Max": job.get("salary_max"),
                "Salary_Avg": job.get("salary_max") or job.get("salary_min"),
                "Salary_Is_Predicted": job.get("salary_is_predicted"),
                "Category": job.get("category", {}).get("label"),
                "Description": job.get("description"),
                "Created_At": job.get("created"),
                "Work_Mode": job_type,
                "URL": job.get("redirect_url")
            })

countries_df = pd.DataFrame(all_jobs)
countries_df.to_csv("adzuna_international_jobs.csv", index=False)
print(f"✅ Total de {len(countries_df)} vagas guardadas em adzuna_international_jobs.csv")

GB | Page 1 | Status: 200
GB | Page 2 | Status: 200
GB | Page 3 | Status: 200
GB | Page 4 | Status: 200
GB | Page 5 | Status: 200
NL | Page 1 | Status: 200
NL | Page 2 | Status: 200
NL | Page 3 | Status: 200
NL | Page 4 | Status: 200
NL | Page 5 | Status: 200
DE | Page 1 | Status: 200
DE | Page 2 | Status: 200
DE | Page 3 | Status: 200
DE | Page 4 | Status: 200
DE | Page 5 | Status: 200
FR | Page 1 | Status: 200
FR | Page 2 | Status: 200
FR | Page 3 | Status: 200
FR | Page 4 | Status: 200
FR | Page 5 | Status: 200
US | Page 1 | Status: 200
US | Page 2 | Status: 200
US | Page 3 | Status: 200
US | Page 4 | Status: 200
US | Page 5 | Status: 200
✅ Total de 1250 vagas guardadas em adzuna_international_jobs.csv


In [24]:
countries_df.head()

Unnamed: 0,creation_date,job_title,company,location,country,category,salary_min,salary_max,description,work_mode,skills
0,2025-05-20T20:14:46Z,MS SQL DBA,Leidos,"St. James, Bristol",GB,IT Jobs,49839.19,49839.19,Description MS SQL DBA Programme Name: LCST Lo...,Hybrid,"r, sql"
1,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite,r
2,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite,r
3,2025-05-20T20:14:32Z,English Teacher,Wayman Education,"Oaklands, St. Albans",GB,Teaching Jobs,31650.0,49084.0,A wonderful opportunity is open for a dynamic ...,Onsite,r
4,2025-05-20T20:14:31Z,Teaching Assistant,Randstad Education,"Cuckfield, Haywards Heath",GB,Teaching Jobs,22100.0,23400.0,Primary Teaching Assistant - Randstad Educatio...,Onsite,r


In [26]:
work_type = countries_df["work_mode"].value_counts()
work_type

work_mode
Onsite    1216
Hybrid      27
Remote       7
Name: count, dtype: int64

In [27]:
countries_df.describe()

Unnamed: 0,salary_min,salary_max
count,651.0,647.0
mean,51165.679447,55563.580093
std,39032.60975,37428.907442
min,0.0,17.0
25%,30126.785,34200.0
50%,37536.16,41600.0
75%,59664.605,65000.0
max,171689.02,171689.02


In [29]:
countries_count = countries_df["country"].value_counts()
countries_count

country
GB    250
NL    250
DE    250
FR    250
US    250
Name: count, dtype: int64

In [30]:
countries_df.nunique()

creation_date     345
job_title         916
company           439
location          839
country             5
category           57
salary_min        434
salary_max        452
description      1016
work_mode           3
skills              7
dtype: int64

In [31]:
null_values = countries_df.isnull().sum()
null_values

creation_date      0
job_title          0
company            0
location           0
country            0
category           0
salary_min       599
salary_max       603
description        0
work_mode          0
skills             0
dtype: int64

In [17]:
countries_df = countries_df.rename(columns={
    "Country": "country",
    "Title" : "job_title",
    "Company" : "company",
    "Location" : "location",
    "Salary_Min" : "salary_min",
    "Salary_Max" : "salary_max",
    "Category" : "category",
    "Description" : "description",
    "Created_At" : "creation_date",
    "Work_Mode" : "work_mode"
})

countries_df = countries_df.drop(columns=["Salary_Is_Predicted", "URL"], errors="ignore")

countries_df = countries_df[[
    "creation_date","job_title", "company", "location", "country","category", "salary_min", "salary_max", "description", "work_mode" 
]]

In [18]:
countries_df.head()

Unnamed: 0,creation_date,job_title,company,location,country,category,salary_min,salary_max,description,work_mode
0,2025-05-20T20:14:46Z,MS SQL DBA,Leidos,"St. James, Bristol",GB,IT Jobs,49839.19,49839.19,Description MS SQL DBA Programme Name: LCST Lo...,Hybrid
1,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite
2,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite
3,2025-05-20T20:14:32Z,English Teacher,Wayman Education,"Oaklands, St. Albans",GB,Teaching Jobs,31650.0,49084.0,A wonderful opportunity is open for a dynamic ...,Onsite
4,2025-05-20T20:14:31Z,Teaching Assistant,Randstad Education,"Cuckfield, Haywards Heath",GB,Teaching Jobs,22100.0,23400.0,Primary Teaching Assistant - Randstad Educatio...,Onsite


In [19]:
skills_keywords = [
    "python", "sql", "excel", "power bi", "tableau",
    "r", "spark", "aws", "azure", "machine learning",
    "statistics", "pandas", "numpy", "scikit-learn"
]

def extract_skills(descr, skills_list):
    if not isinstance(descr, str):
        return ""
    
    descr_lower = descr.lower()
    found = [skill for skill in skills_list if skill in descr_lower]
    return ", ".join(sorted(set(found)))


In [32]:
countries_df["skills"] = countries_df["description"].apply(lambda x: extract_skills(x, skills_keywords))

countries_df.head()

Unnamed: 0,creation_date,job_title,company,location,country,category,salary_min,salary_max,description,work_mode,skills
0,2025-05-20T20:14:46Z,MS SQL DBA,Leidos,"St. James, Bristol",GB,IT Jobs,49839.19,49839.19,Description MS SQL DBA Programme Name: LCST Lo...,Hybrid,"r, sql"
1,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite,r
2,2025-05-20T20:14:43Z,2nd Line Infrastructure Engineer,Leidos,"Abbots Ripton, Huntingdon",GB,IT Jobs,38444.16,38444.16,Description 2nd Line Infrastructure Engineer L...,Onsite,r
3,2025-05-20T20:14:32Z,English Teacher,Wayman Education,"Oaklands, St. Albans",GB,Teaching Jobs,31650.0,49084.0,A wonderful opportunity is open for a dynamic ...,Onsite,r
4,2025-05-20T20:14:31Z,Teaching Assistant,Randstad Education,"Cuckfield, Haywards Heath",GB,Teaching Jobs,22100.0,23400.0,Primary Teaching Assistant - Randstad Educatio...,Onsite,r


In [21]:
countries_df["skills"].value_counts()

skills
r                                  1214
excel, r                             29
r, sql                                2
aws, r                                2
excel, power bi, python, r, sql       1
aws, python, r                        1
azure, r                              1
Name: count, dtype: int64

In [33]:
APP_ID = os.getenv("ADZUNA_APP_ID")
APP_KEY = os.getenv("ADZUNA_APP_KEY")

if not APP_ID or not APP_KEY:
    raise ValueError("Missing API credentials in .env")

countries = ["gb", "nl", "de", "fr", "us"]
all_jobs = []

for country in countries:
    for page in range(1, 6):
        url = f"https://api.adzuna.com/v1/api/jobs/{country}/search/{page}"
        params = {
            "app_id": APP_ID,
            "app_key": APP_KEY,
            "results_per_page": 50,
            "what": "data analyst",
            "sort_by": "date"
        }

        response = requests.get(url, params=params)
        print(f"{country.upper()} | Page {page} | Status: {response.status_code}")

        try:
            data = response.json()
        except Exception as e:
            print(f"Error processing JSON on the page {page} ({country}): {e}")
            print("Response:", response.text[:300])
            break

        results = data.get("results", [])
        if not results:
            print("No more results.")
            break

        for job in results:
            desc = job.get("description", "").lower()
            loc = job.get("location", {}).get("display_name", "").lower()

            if "remote" in desc or "remote" in loc:
                job_type = "Remote"
            elif "hybrid" in desc:
                job_type = "Hybrid"
            else:
                job_type = "Onsite"

            all_jobs.append({
                "Country": country.upper(),
                "Title": job.get("title"),
                "Company": job.get("company", {}).get("display_name"),
                "Location": job.get("location", {}).get("display_name"),
                "Salary_Min": job.get("salary_min"),
                "Salary_Max": job.get("salary_max"),
                "Salary_period" : job.get("salary_period"),
                "Salary_Avg": job.get("salary_max") or job.get("salary_min"),
                "Salary_Is_Predicted": job.get("salary_is_predicted"),
                "Category": job.get("category", {}).get("label"),
                "Description": job.get("description"),
                "Created_At": job.get("created"),
                "Work_Mode": job_type,
                "URL": job.get("redirect_url")
            })

dataanalyst_df = pd.DataFrame(all_jobs)
dataanalyst_df.to_csv("data_analyst_international_jobs.csv", index=False)
print(f"✅ Total of {len(countries_df)} jobs saved in data_analyst_international_jobs.csv")

GB | Page 1 | Status: 200
GB | Page 2 | Status: 200
GB | Page 3 | Status: 200
GB | Page 4 | Status: 200
GB | Page 5 | Status: 200
NL | Page 1 | Status: 200
NL | Page 2 | Status: 200
NL | Page 3 | Status: 200
NL | Page 4 | Status: 200
NL | Page 5 | Status: 200
DE | Page 1 | Status: 200
DE | Page 2 | Status: 200
DE | Page 3 | Status: 200
DE | Page 4 | Status: 200
DE | Page 5 | Status: 200
FR | Page 1 | Status: 200
FR | Page 2 | Status: 200
FR | Page 3 | Status: 200
FR | Page 4 | Status: 200
FR | Page 5 | Status: 200
US | Page 1 | Status: 200
US | Page 2 | Status: 200
US | Page 3 | Status: 200
US | Page 4 | Status: 200
US | Page 5 | Status: 200
✅ Total of 1250 jobs saved in data_analyst_international_jobs.csv


In [34]:
dataanalyst_df.head()

Unnamed: 0,Country,Title,Company,Location,Salary_Min,Salary_Max,Salary_period,Salary_Avg,Salary_Is_Predicted,Category,Description,Created_At,Work_Mode,URL
0,GB,Team Lead (Finance),Bloomberg,"London, UK",52996.15,52996.15,,52996.15,1,IT Jobs,Bloomberg runs on data. Our products are fuele...,2025-05-20T20:09:50Z,Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080436...
1,GB,Data Analyst,Corecom Consulting,"Leeds, West Yorkshire",50000.0,50000.0,,50000.0,0,IT Jobs,"Data Analyst - Up to £50,000 Hybrid Working (...",2025-05-20T20:09:11Z,Hybrid,https://www.adzuna.co.uk/jobs/land/ad/52080431...
2,GB,Junior Data Analyst,Vehicle Data Global Ltd.,"Chesterfield, Derbyshire",24000.0,24000.0,,24000.0,0,IT Jobs,"Junior Data Analyst / Starting salary of £24,0...",2025-05-20T19:49:22Z,Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080270...
3,GB,Asset Data Analyst,Great Places Housing Association,"Chorlton Cum Hardy, Manchester",35000.0,35000.0,,35000.0,0,IT Jobs,"Asset Data Analyst SalaryUp to £39,248 Locatio...",2025-05-20T19:49:22Z,Onsite,https://www.adzuna.co.uk/jobs/land/ad/52080270...
4,GB,Healthcare Analyst (FTC),GlobalData PLC,"Fleet Street, Central London",43779.34,43779.34,,43779.34,1,IT Jobs,Who we are… GlobalData is a specialist informa...,2025-05-20T17:50:42Z,Onsite,https://www.adzuna.co.uk/jobs/details/52076740...


In [35]:
dataanalyst_df.shape

(1250, 14)

In [36]:
null_values = dataanalyst_df.isnull().sum()
null_values

Country                   0
Title                     0
Company                  10
Location                  0
Salary_Min              661
Salary_Max              661
Salary_period          1250
Salary_Avg              661
Salary_Is_Predicted       0
Category                  0
Description               0
Created_At                0
Work_Mode                 0
URL                       0
dtype: int64

In [37]:
dataanalyst_columns2_df = dataanalyst_df.rename(columns={
    "Country": "country",
    "Title" : "job_title",
    "Salary_Min" : "salary_min",
    "Salary_Max" : "salary_max",
    "Salary_Is_Predicted" : "salary_predicted",
    "Category" : "category",
    "Description" : "description",
    "Created_At" : "creation_date",
    "Work_Mode" : "work_mode"
})

dataanalyst_columns2_df = dataanalyst_columns2_df.drop(columns=["Salary_Avg", "Location","Company", "URL"], errors="ignore")

dataanalyst_columns2_df = dataanalyst_columns2_df[[
    "creation_date","job_title", "country","category", "salary_min", "salary_max", "salary_predicted", "description", "work_mode" 
]]

In [38]:
dataanalyst_columns2_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode
0,2025-05-20T20:09:50Z,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite
1,2025-05-20T20:09:11Z,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid
2,2025-05-20T19:49:22Z,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite
3,2025-05-20T19:49:22Z,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite


In [None]:
# dataanalyst_df.head()

Unnamed: 0,creation_date,job_title,company,location,country,category,salary_min,salary_max,description,work_mode
0,2025-05-20T16:50:20Z,Asset Data Analyst,Great Places Housing Association,"Chorlton Cum Hardy, Manchester",GB,IT Jobs,0.0,39248.0,"Asset Data Analyst (full time, 35 hours per we...",Onsite
1,2025-05-20T16:47:51Z,Senior Data Analyst,Hippo Digital,UK,GB,IT Jobs,0.0,59082.0,About The Role Hippo is recruiting for a Senio...,Onsite
2,2025-05-20T16:47:33Z,Data Engineer,Somerset Bridge Group,"Newcastle Upon Tyne, Tyne & Wear",GB,IT Jobs,64864.34,64864.34,Location - Newcastle Employment Type - Full Ti...,Onsite
3,2025-05-20T16:34:28Z,Remote Travel Business Development / Sales Coa...,RecruitmentRevolution.com,"Rusholme, Manchester",GB,Sales Jobs,35000.0,35000.0,Want to make a difference? Want to be a part o...,Onsite
4,2025-05-20T16:34:11Z,Remote Travel Business Development / Sales Coa...,RecruitmentRevolution.com,"Bournemouth, Dorset",GB,Sales Jobs,35000.0,35000.0,Want to make a difference? Want to be a part o...,Onsite


In [39]:
dataanalyst_columns2_df.shape

(1250, 9)

In [40]:
skills_keywords = [
    "python", "sql", "excel", "power bi", "tableau",
    "r", "spark", "aws", "azure", "machine learning",
    "statistics", "pandas", "numpy", "scikit-learn"
]

def extract_skills(descr, skills_list):
    if not isinstance(descr, str):
        return ""
    
    descr_lower = descr.lower()
    found = [skill for skill in skills_list if skill in descr_lower]
    return ", ".join(sorted(set(found)))


dataanalyst_columns2_df["skills"] = dataanalyst_columns2_df["description"].apply(lambda x: extract_skills(x, skills_keywords))

dataanalyst_columns2_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills
0,2025-05-20T20:09:50Z,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r
1,2025-05-20T20:09:11Z,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql"
2,2025-05-20T19:49:22Z,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r
3,2025-05-20T19:49:22Z,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r


In [41]:
dataanalyst_columns2_df["skills"].value_counts()

skills
r                                         1058
r, sql                                      44
excel, r                                    43
r, tableau                                  17
azure, r                                    13
machine learning, r                         12
power bi, r                                  9
power bi, r, tableau                         7
aws, r                                       6
r, spark                                     5
python, r, sql                               5
r, sql, tableau                              4
power bi, r, sql                             3
excel, power bi, python, r, sql              2
power bi, python, r                          2
excel, power bi, r                           2
aws, azure, r, spark                         1
excel, python, r, sql                        1
azure, power bi, python, r, spark, sql       1
power bi, r, sql, tableau                    1
excel, r, tableau                            1
r, sta

In [42]:
dataanalyst_columns2_df.describe()

Unnamed: 0,salary_min,salary_max
count,589.0,589.0
mean,66452.454618,70379.928302
std,39707.586474,41068.864531
min,0.0,74.0
25%,38034.08,42000.0
50%,56567.7,60353.24
75%,89268.21,91365.1
max,323403.78,323403.78


In [43]:
unique_val = dataanalyst_columns2_df["salary_min"].nunique()
unique_val

458

In [44]:
dupli = dataanalyst_columns2_df.duplicated().sum()
dupli

43

In [45]:
dataanalyst_columns2_df.shape

(1250, 10)

In [None]:
#dataanalyst_columns_df["salary_avg"] = dataanalyst_columns_df[["salary_min", "salary_max"]].mean(axis=1)

#dataanalyst_columns_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_avg
0,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r,43779.34
1,2025-05-20T17:40:06Z,DATA ANALYST,GB,Teaching Jobs,25904.19,25904.19,1,Job Purpose To play a key role in the developm...,Onsite,r,25904.19
2,2025-05-20T17:40:06Z,Data Protection Analyst,GB,IT Jobs,47134.89,47134.89,1,Join our growing business in this brand new po...,Onsite,r,47134.89
3,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,35000.0,35000.0,0,Want to make a difference?Want to be a part of...,Onsite,r,35000.0
4,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,35000.0,35000.0,0,Want to make a difference?Want to be a part of...,Onsite,r,35000.0


In [None]:
#print(dataanalyst_columns_df.shape)
#dataanalyst_columns_df.describe()

(1250, 11)


Unnamed: 0,salary_min,salary_max,salary_avg
count,588.0,588.0,588.0
mean,66459.552364,70388.120391,68423.836378
std,39710.111314,41075.397928,39821.742104
min,0.0,74.0,68.5
25%,38181.995,42101.55,40000.0
50%,56608.565,60380.965,58970.735
75%,88900.2975,91255.1425,90136.635
max,323403.78,323403.78,323403.78


In [None]:
# dataanalyst_backup = dataanalyst_columns_df.copy()

In [46]:
dataanalyst_backup2 = dataanalyst_columns2_df.copy()

In [47]:
dataanalyst_backup2.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills
0,2025-05-20T20:09:50Z,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r
1,2025-05-20T20:09:11Z,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql"
2,2025-05-20T19:49:22Z,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r
3,2025-05-20T19:49:22Z,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r


In [None]:
#dataanalyst_columns_df = dataanalyst_columns_df.drop(columns=["salary_avg"], errors="ignore")
#dataanalyst_columns_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills
0,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r
1,2025-05-20T17:40:06Z,DATA ANALYST,GB,Teaching Jobs,25904.19,25904.19,1,Job Purpose To play a key role in the developm...,Onsite,r
2,2025-05-20T17:40:06Z,Data Protection Analyst,GB,IT Jobs,47134.89,47134.89,1,Join our growing business in this brand new po...,Onsite,r
3,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,35000.0,35000.0,0,Want to make a difference?Want to be a part of...,Onsite,r
4,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,35000.0,35000.0,0,Want to make a difference?Want to be a part of...,Onsite,r


In [None]:
#def adjust_salary(value): # function to adjust the salary, as some of them seem to be hourly salaries
  #  try: 
   #     value = float(value)
   # except (TypeError, ValueError):
  #      return None
  #  
  #  if value <= 0:
 #       return None
 #   if value < 100:
 #       return value * 40 * 52
  #  elif value < 1000:
  #      return value * 5 * 52
  #  elif value < 10000:
  #      return value * 12
  #  else:
  #      return value

In [None]:
#dataanalyst_columns_df["salary_min_raw"] = dataanalyst_columns_df["salary_min"]
#dataanalyst_columns_df["salary_max_raw"] = dataanalyst_columns_df["salary_max"]

In [None]:
#dataanalyst_columns_df["salary_min"] = dataanalyst_columns_df["salary_min_raw"].apply(adjust_salary)
#dataanalyst_columns_df["salary_max"] = dataanalyst_columns_df["salary_max_raw"].apply(adjust_salary)



In [None]:
#dataanalyst_columns_df["salary_min"] = dataanalyst_columns_df["salary_min"].replace(0, pd.NA)
#dataanalyst_columns_df["salary_max"] = dataanalyst_columns_df["salary_max"].replace(0, pd.NA)

In [None]:
#dataanalyst_columns_df["salary_avg"] = dataanalyst_columns_df[["salary_min", "salary_max"]].mean(axis=1)

In [None]:
# dataanalyst_columns_df.describe()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_min_raw,salary_max_raw,salary_avg
count,1250,1250,1250,1250,0.0,0.0,1250,1250,1250,1250,0.0,0.0,0.0
unique,1055,802,5,65,0.0,0.0,2,965,3,34,0.0,0.0,0.0
top,2025-05-20T09:44:54Z,Lead Data Reporting Analyst,GB,IT Jobs,,,0,Company : HM Insurance Group Job Description :...,Onsite,r,,,
freq,21,38,250,357,,,866,38,1130,1060,,,


In [None]:
#dataanalyst_columns_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_min_raw,salary_max_raw,salary_avg
0,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,,,1,Who we are… GlobalData is a specialist informa...,Onsite,r,43779.34,43779.34,
1,2025-05-20T17:40:06Z,DATA ANALYST,GB,Teaching Jobs,,,1,Job Purpose To play a key role in the developm...,Onsite,r,25904.19,25904.19,
2,2025-05-20T17:40:06Z,Data Protection Analyst,GB,IT Jobs,,,1,Join our growing business in this brand new po...,Onsite,r,47134.89,47134.89,
3,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,,,0,Want to make a difference?Want to be a part of...,Onsite,r,35000.0,35000.0,
4,2025-05-20T17:25:48Z,Remote Travel Business Development / Sales Coa...,GB,Sales Jobs,,,0,Want to make a difference?Want to be a part of...,Onsite,r,35000.0,35000.0,


In [48]:
print(dataanalyst_backup2[["salary_min", "salary_max"]].head())

   salary_min  salary_max
0    52996.15    52996.15
1    50000.00    50000.00
2    24000.00    24000.00
3    35000.00    35000.00
4    43779.34    43779.34


In [49]:
dataanalyst_safe = dataanalyst_backup2.copy()

In [50]:
dataanalyst_safe.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills
0,2025-05-20T20:09:50Z,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r
1,2025-05-20T20:09:11Z,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql"
2,2025-05-20T19:49:22Z,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r
3,2025-05-20T19:49:22Z,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r


In [51]:
dataanalyst_safe["salary_min_raw"] = dataanalyst_safe["salary_min"]
dataanalyst_safe["salary_max_raw"] = dataanalyst_safe["salary_max"]


In [52]:
def adjust_salary(value): # function to adjust the salary, as some of them seem to be hourly salaries
    try: 
        value = float(value)
    except (TypeError, ValueError):
        return None
    
    if value <= 0:
        return None
    if value < 100:
        return value * 40 * 52
    elif value < 1000:
        return value * 5 * 52
    elif value < 10000:
        return value * 12
    else:
        return value

In [53]:
dataanalyst_safe["salary_min"] = dataanalyst_safe["salary_min_raw"].apply(adjust_salary)
dataanalyst_safe["salary_max"] = dataanalyst_safe["salary_max_raw"].apply(adjust_salary)


In [54]:
dataanalyst_safe["salary_min"] = dataanalyst_safe["salary_min"].replace(0, pd.NA)
dataanalyst_safe["salary_max"] = dataanalyst_safe["salary_max"].replace(0, pd.NA)


In [55]:
dataanalyst_safe["salary_avg"] = dataanalyst_safe[["salary_min", "salary_max"]].mean(axis=1)


In [56]:
dataanalyst_safe[["salary_min", "salary_max", "salary_avg"]].describe()


Unnamed: 0,salary_min,salary_max,salary_avg
count,583.0,589.0,589.0
mean,72190.680566,72153.028472,72192.046299
std,38156.075229,39961.28165,37586.631004
min,12000.0,12000.0,12000.0
25%,41786.755,42825.72,43144.11
50%,62700.07,60960.33,65786.48
75%,94825.425,92097.49,91487.34
max,323403.78,323403.78,323403.78


In [57]:
dataanalyst_safe.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_min_raw,salary_max_raw,salary_avg
0,2025-05-20T20:09:50Z,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r,52996.15,52996.15,52996.15
1,2025-05-20T20:09:11Z,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql",50000.0,50000.0,50000.0
2,2025-05-20T19:49:22Z,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r,24000.0,24000.0,24000.0
3,2025-05-20T19:49:22Z,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r,35000.0,35000.0,35000.0
4,2025-05-20T17:50:42Z,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r,43779.34,43779.34,43779.34


In [58]:
dataanalyst_safe.dtypes

creation_date        object
job_title            object
country              object
category             object
salary_min          float64
salary_max          float64
salary_predicted     object
description          object
work_mode            object
skills               object
salary_min_raw      float64
salary_max_raw      float64
salary_avg          float64
dtype: object

In [59]:
dataanalyst_safe["creation_date"] = pd.to_datetime(dataanalyst_safe["creation_date"], errors="coerce")

In [60]:
dataanalyst_safe["year"] = dataanalyst_safe["creation_date"].dt.year


In [61]:
print(dataanalyst_safe[["creation_date", "year"]].head())


              creation_date  year
0 2025-05-20 20:09:50+00:00  2025
1 2025-05-20 20:09:11+00:00  2025
2 2025-05-20 19:49:22+00:00  2025
3 2025-05-20 19:49:22+00:00  2025
4 2025-05-20 17:50:42+00:00  2025


In [62]:
dataanalyst_safe.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_min_raw,salary_max_raw,salary_avg,year
0,2025-05-20 20:09:50+00:00,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r,52996.15,52996.15,52996.15,2025
1,2025-05-20 20:09:11+00:00,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql",50000.0,50000.0,50000.0,2025
2,2025-05-20 19:49:22+00:00,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r,24000.0,24000.0,24000.0,2025
3,2025-05-20 19:49:22+00:00,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r,35000.0,35000.0,35000.0,2025
4,2025-05-20 17:50:42+00:00,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r,43779.34,43779.34,43779.34,2025


In [63]:
dataanalyst_safe_csv = dataanalyst_safe.to_csv("dataanalyst_cleaned.csv", index=False)

In [64]:
cleaned_df = pd.read_csv("dataanalyst_cleaned.csv")

In [65]:
cleaned_df.head()

Unnamed: 0,creation_date,job_title,country,category,salary_min,salary_max,salary_predicted,description,work_mode,skills,salary_min_raw,salary_max_raw,salary_avg,year
0,2025-05-20 20:09:50+00:00,Team Lead (Finance),GB,IT Jobs,52996.15,52996.15,1,Bloomberg runs on data. Our products are fuele...,Onsite,r,52996.15,52996.15,52996.15,2025
1,2025-05-20 20:09:11+00:00,Data Analyst,GB,IT Jobs,50000.0,50000.0,0,"Data Analyst - Up to £50,000 Hybrid Working (...",Hybrid,"excel, power bi, python, r, sql",50000.0,50000.0,50000.0,2025
2,2025-05-20 19:49:22+00:00,Junior Data Analyst,GB,IT Jobs,24000.0,24000.0,0,"Junior Data Analyst / Starting salary of £24,0...",Onsite,r,24000.0,24000.0,24000.0,2025
3,2025-05-20 19:49:22+00:00,Asset Data Analyst,GB,IT Jobs,35000.0,35000.0,0,"Asset Data Analyst SalaryUp to £39,248 Locatio...",Onsite,r,35000.0,35000.0,35000.0,2025
4,2025-05-20 17:50:42+00:00,Healthcare Analyst (FTC),GB,IT Jobs,43779.34,43779.34,1,Who we are… GlobalData is a specialist informa...,Onsite,r,43779.34,43779.34,43779.34,2025


In [69]:
jobs_cleaned_2 = cleaned_df.drop(columns=["creation_date", "category", "salary_min", "salary_max", "salary_predicted", "description", "salary_min_raw", "salary_max_raw"], errors="ignore")
jobs_cleaned_2.head()

Unnamed: 0,job_title,country,work_mode,skills,salary_avg,year
0,Team Lead (Finance),GB,Onsite,r,52996.15,2025
1,Data Analyst,GB,Hybrid,"excel, power bi, python, r, sql",50000.0,2025
2,Junior Data Analyst,GB,Onsite,r,24000.0,2025
3,Asset Data Analyst,GB,Onsite,r,35000.0,2025
4,Healthcare Analyst (FTC),GB,Onsite,r,43779.34,2025


In [70]:
jobs_cleaned_2.shape

(1250, 6)

In [71]:
jobs_cleaned_2 = jobs_cleaned_2.rename(columns={
    "salary_avg" : "salary_in_usd",
    "year": "date"
})


jobs_cleaned_2 = jobs_cleaned_2[[
    "date","job_title","salary_in_usd", "work_mode", "country", "skills"]]

jobs_cleaned_2

Unnamed: 0,date,job_title,salary_in_usd,work_mode,country,skills
0,2025,Team Lead (Finance),52996.15,Onsite,GB,r
1,2025,Data Analyst,50000.00,Hybrid,GB,"excel, power bi, python, r, sql"
2,2025,Junior Data Analyst,24000.00,Onsite,GB,r
3,2025,Asset Data Analyst,35000.00,Onsite,GB,r
4,2025,Healthcare Analyst (FTC),43779.34,Onsite,GB,r
...,...,...,...,...,...,...
1245,2025,Quant Analytics Associate - Card Acquisitions ...,127389.54,Onsite,US,r
1246,2025,Sr Analyst - Data,68025.95,Onsite,US,r
1247,2025,Sr Analyst - Data,74026.13,Onsite,US,r
1248,2025,Sr Analyst - Data,70282.45,Onsite,US,r


In [72]:
jobs_cleaned_2["job_title"].unique()

array(['Team Lead (Finance)', 'Data Analyst', 'Junior Data Analyst',
       'Asset Data Analyst', 'Healthcare Analyst (FTC)', 'DATA ANALYST',
       'Data Protection Analyst',
       'Remote Travel Business Development / Sales Coach - World 1 Franchise',
       'Market Data Admin and Compliance Support Analyst',
       'Commercial / Ecommerce Analyst', 'Senior Data Analyst',
       'Data Engineer', 'Data Scientist',
       'People Data and Performance Metrics Analyst', 'Senior MI Analyst',
       'Data Analysis Needed For Insights into a Time Management Survey',
       'Data Product Manager, Product Analytics, Global Streaming',
       'Data and Reporting Analyst', 'Data Governance Analyst',
       'Power BI Process & Data Lead (m/w/d)', 'Clinical Data Analyst',
       'Engineer - London Stock Exchange Group', 'Data Business Analyst',
       'Data Analyst Graduate with an interest in cars',
       'HR Insights Analyst - Oracle HCM',
       'STaR Specialist Engagement Worker',
       'A

In [73]:
jobs_cleaned_2.dtypes

date               int64
job_title         object
salary_in_usd    float64
work_mode         object
country           object
skills            object
dtype: object

In [74]:
jobs_cleaned_2safe = jobs_cleaned_2.copy()

In [75]:
jobs_cleaned_2["work_mode"] = (
    jobs_cleaned_2["work_mode"]
    .str.lower()
    .str.strip()
    .replace({
        "Onsite" : "on_site",
        "Hybrid": "hybrid",
        "Remote": "remote"
    })
)

jobs_cleaned_2

Unnamed: 0,date,job_title,salary_in_usd,work_mode,country,skills
0,2025,Team Lead (Finance),52996.15,onsite,GB,r
1,2025,Data Analyst,50000.00,hybrid,GB,"excel, power bi, python, r, sql"
2,2025,Junior Data Analyst,24000.00,onsite,GB,r
3,2025,Asset Data Analyst,35000.00,onsite,GB,r
4,2025,Healthcare Analyst (FTC),43779.34,onsite,GB,r
...,...,...,...,...,...,...
1245,2025,Quant Analytics Associate - Card Acquisitions ...,127389.54,onsite,US,r
1246,2025,Sr Analyst - Data,68025.95,onsite,US,r
1247,2025,Sr Analyst - Data,74026.13,onsite,US,r
1248,2025,Sr Analyst - Data,70282.45,onsite,US,r


In [76]:
jobs_cleaned_2["skills"] = jobs_cleaned_2["skills"].fillna("").apply(lambda x: [s.strip() for s in x.split(",") if s.strip()])
jobs_cleaned_2

Unnamed: 0,date,job_title,salary_in_usd,work_mode,country,skills
0,2025,Team Lead (Finance),52996.15,onsite,GB,[r]
1,2025,Data Analyst,50000.00,hybrid,GB,"[excel, power bi, python, r, sql]"
2,2025,Junior Data Analyst,24000.00,onsite,GB,[r]
3,2025,Asset Data Analyst,35000.00,onsite,GB,[r]
4,2025,Healthcare Analyst (FTC),43779.34,onsite,GB,[r]
...,...,...,...,...,...,...
1245,2025,Quant Analytics Associate - Card Acquisitions ...,127389.54,onsite,US,[r]
1246,2025,Sr Analyst - Data,68025.95,onsite,US,[r]
1247,2025,Sr Analyst - Data,74026.13,onsite,US,[r]
1248,2025,Sr Analyst - Data,70282.45,onsite,US,[r]


In [77]:
jobs_cleaned_2["salary_in_usd"] = jobs_cleaned_2["salary_in_usd"].round(0).astype("Int64")
jobs_cleaned_2

Unnamed: 0,date,job_title,salary_in_usd,work_mode,country,skills
0,2025,Team Lead (Finance),52996,onsite,GB,[r]
1,2025,Data Analyst,50000,hybrid,GB,"[excel, power bi, python, r, sql]"
2,2025,Junior Data Analyst,24000,onsite,GB,[r]
3,2025,Asset Data Analyst,35000,onsite,GB,[r]
4,2025,Healthcare Analyst (FTC),43779,onsite,GB,[r]
...,...,...,...,...,...,...
1245,2025,Quant Analytics Associate - Card Acquisitions ...,127390,onsite,US,[r]
1246,2025,Sr Analyst - Data,68026,onsite,US,[r]
1247,2025,Sr Analyst - Data,74026,onsite,US,[r]
1248,2025,Sr Analyst - Data,70282,onsite,US,[r]


In [78]:
jobs_cleaned_2.to_csv("jobs_cleaned_sofia.csv", index=False)