In [1]:
%load_ext jupyter_black

In [2]:
import pickle
from google.cloud import bigquery
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
from collections import namedtuple
from typing import Iterable, Literal
from copy import deepcopy
import re

In [3]:
import warnings
import sys
import os

In [4]:
from collections import namedtuple

In [5]:
project = "x-avenue-450615-c3"
dataset_name = "jobs_postings"
location = "europe-west1"

bq_client = bigquery.Client()

In [6]:
CURRENT_DIRECTORY = os.getcwd()
PROJECT_ROOT = os.path.abspath(
    os.path.join(CURRENT_DIRECTORY, "../pipelines/rapidapi_jobs_posting/")
)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
    print(f"{PROJECT_ROOT} was added to sys.path")

REPO_ROOT = os.path.abspath(os.path.join(CURRENT_DIRECTORY, ".."))
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)
    print(f"{REPO_ROOT} was added to sys.path")

/home/jovyan/work/repos/jobs-research/pipelines/rapidapi_jobs_posting was added to sys.path
/home/jovyan/work/repos/jobs-research was added to sys.path


In [7]:
from common.utils import bq_table_to_df

In [8]:
sheet_url = "https://docs.google.com/spreadsheets/d/1clAiWIVMD5bCJRHJr9-p2vw9h99W5sByAtqThIGREpo/edit?gid=0#gid=0"
csv_export_url = sheet_url.replace("/edit?gid=", "/export?format=csv&gid=")

In [110]:
csv_export_url

'https://docs.google.com/spreadsheets/d/1clAiWIVMD5bCJRHJr9-p2vw9h99W5sByAtqThIGREpo/export?format=csv&gid=0#gid=0'

In [231]:
rules_df = pd.read_csv(csv_export_url)

In [232]:
rules_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   keyword           37 non-null     object
 1   result            113 non-null    object
 2   case_sensitive    113 non-null    bool  
 3   spaces_sensitive  113 non-null    bool  
dtypes: bool(2), object(2)
memory usage: 2.1+ KB


In [233]:
skills_set = rules_df.result.unique()

cloud_skills_sets = dict()

cloud_skills_sets["Google Cloud Platform"] = {x for x in skills_set if "Google" in x}
cloud_skills_sets["Microsoft Azure"] = {x for x in skills_set if "Azure" in x}
cloud_skills_sets["Amazon Web Services"] = {x for x in skills_set if "Amazon" in x}

cloud_skills_sets

{'Google Cloud Platform': {'Google BigQuery',
  'Google Cloud Platform',
  'Google Cloud Run',
  'Google Looker Studio',
  'Google Storage',
  'Google Vertex AI'},
 'Microsoft Azure': {'Azure Blob Storage',
  'Azure Cosmos',
  'Azure Data Factory',
  'Azure Function',
  'Azure Synapse Analytics',
  'Microsoft Azure'},
 'Amazon Web Services': {'Amazon Athena',
  'Amazon Firehose',
  'Amazon Glue',
  'Amazon Redshift',
  'Amazon S3',
  'Amazon Web Services'}}

In [207]:
REPLACE_WITH_SPACES = r"[!\"$\%'()\+,\-./:;?]"


def prepare_text(
    text: str,
    case_sensitive: bool,
    spaces_sensitive: bool,
    replace_with_spaces=REPLACE_WITH_SPACES,
) -> str:
    if not case_sensitive:
        text = text.lower()
    if not spaces_sensitive:
        text = text.replace(" ", "")
    else:

        text = re.sub(replace_with_spaces, " ", text)
        text = " " + text.strip() + " "
    return text

In [208]:
class MappingDict:
    def __init__(
        self,
        rules: dict,
        case_sensitive: bool = False,
        spaces_sensitive: bool = False,
    ):
        self.rules = rules
        self.case_sensitive = case_sensitive
        self.spaces_sensitive = spaces_sensitive
        self.is_prepared = False

    def prepare(self):
        if not self.is_prepared:
            prepared_rules = {}
            for key, val in self.rules.items():
                prepared_key = prepare_text(
                    key, self.case_sensitive, self.spaces_sensitive
                )
                prepared_rules[prepared_key] = val
                for char in REPLACE_WITH_SPACES:
                    if self.spaces_sensitive and prepared_key != key and char in key:
                        warnings.warn(
                            f"'{char}' in the keyword '{key}' was replaced with a space.",
                            UserWarning,
                        )
            self.rules = prepared_rules
            self.is_prepared = True
        else:
            warnings.warn("MappingDict is already prepared.", UserWarning)
        return self

In [209]:
class MappingRules:
    def __init__(self, attr_name: str, rules_df: pd.DataFrame):
        self.attr_name = attr_name
        self.rules_df = rules_df[
            ["keyword", "result", "case_sensitive", "spaces_sensitive"]
        ]

        self._check_for_nulls()
        self.rules_df["keyword"] = self.rules_df.apply(
            lambda x: x.result if pd.isna(x.keyword) else x.keyword, axis="columns"
        )
        self._check_keywords_uniqueness()
        self._is_prepared = False

    def _check_for_nulls(self):
        for col in ["result", "case_sensitive", "spaces_sensitive"]:
            if rules_df[col].isna().sum() > 0:
                raise ValueError(
                    f"'{col}' can not be Null. Fix mapping rules for {self.attr_name}."
                )

    def _check_keywords_uniqueness(self):
        non_unique_keywords = self.rules_df["keyword"].value_counts()[lambda x: x > 1]
        if len(non_unique_keywords) > 0:
            non_unique_keywords_str = ", ".join(non_unique_keywords.index)
            raise ValueError(
                f"Keywords must be unique. Fix mapping rules for {self.attr_name}: {non_unique_keywords_str}."
            )

    def prepare(self):
        if not self._is_prepared:
            self.map_dicts = list(
                self.rules_df.set_index("keyword")
                .groupby(by=["case_sensitive", "spaces_sensitive"])
                .agg(dict)
                .rename({"result": "mapping_dict"}, axis="columns")
                .reset_index()
                .apply(
                    lambda x: MappingDict(
                        rules=x.mapping_dict,
                        case_sensitive=x.case_sensitive,
                        spaces_sensitive=x.spaces_sensitive,
                    ),
                    axis="columns",
                )
            )
            self.map_dicts_prepared = [
                mapping_dict.prepare() for mapping_dict in self.map_dicts
            ]
            self._is_prepared = True
        else:
            warnings.warn("MappingRules are already prepared.", UserWarning)

    def apply(
        self, texts: Iterable[str], find: Literal["any", "all"] = "all"
    ) -> str | set[str] | None:

        if not self._is_prepared:
            self.prepare()

        if find == "all":
            result = set()
        if find == "any":
            result = None

        for mapping_dict in self.map_dicts_prepared:
            for text in texts:
                text = prepare_text(
                    text, mapping_dict.case_sensitive, mapping_dict.spaces_sensitive
                )
                for key, val in mapping_dict.rules.items():
                    if key in text:
                        if find == "all":
                            result.add(val)
                        if find == "any":
                            return val
        return result

In [210]:
skills_rules = MappingRules("skills", rules_df)

In [211]:
skills_rules.apply(["BI"], "all")

{'BI'}

In [212]:
skills_rules.apply([" BI "], "all")

{'BI'}

In [213]:
skills_rules.apply(["hI BI!"], "all")

{'BI'}

In [214]:
skills_rules.apply([" bi "], "all")

set()

In [215]:
skills_rules.apply([" BINGO!"], "all")

set()

In [216]:
skills_rules.apply(["BigQuery"], "all")

{'Google BigQuery'}

In [217]:
skills_rules.apply(["(BigQuery)"], "all")

{'Google BigQuery'}

In [218]:
skills_rules.apply(["(Big Query)"], "all")

{'Google BigQuery'}

In [219]:
skills_rules.apply(["(JavaScRiPt "], "all")

{'JavaScript'}

In [220]:
skills_rules.apply(["(Java ScRiPt "], "all")

{'Java'}

In [221]:
skills_rules.apply(["piu-piu(bigquery/Hive/CosmosDB)"], "all")

{'Azure Cosmos', 'Google BigQuery', 'Hive'}

In [40]:
df_posting_load_query = f"""
select *
from `x-avenue-450615-c3.jp.jobs` as jp
"""
df_posting = bq_client.query(df_posting_load_query).to_dataframe()

In [223]:
df_posting["skills"] = df_posting.description.map(
    lambda x: set() if pd.isna(x) else skills_rules.apply([x])
)

In [224]:
df_posting.position.value_counts()

position
Data Scientist                           877
Data Analyst                             779
Data Engineer                            592
Data Protection/Governance Specialist    139
Architect                                134
Consultant/Advisor                       100
Software Engineer                         82
Data Manager                              65
Data Entry Specialist                     54
IaC Specialist                            50
Project Manager                           47
Product Manager                           46
Data Quality Specialist                   36
Facility Engineer                         30
Product Owner                             24
ML Ops                                     9
Tutor/Teacher                              2
Network Engineer                           1
Name: count, dtype: int64

In [225]:
[
    print(x[1].skills, "\n" * 2, x[1].description, "\n" * 5)
    for x in (
        df_posting[df_posting.position == "Data Engineer"][["skills", "description"]]
        .sample(20)
        .iterrows()
    )
]
_

{'Python', 'ML', 'SQL', 'Deep learning', 'PostgreSQL', 'Azure'} 

 What we are doing

Our mission is to prevent vision loss and ultimately blindness by developing AI software that assists eye doctors in therapy planning for their patients. We use computer vision deep learning models(AI) trained on thousands of cases and millions of images to identify and predict disease progression. The decision support algorithm targets common eye diseases like Age-related Macular Degeneration (AMD, 7.5mio affected in Germany alone).

Tasks {#tasks}
- -------------

Your responsibilities

* You are responsible for building and optimizing data pipelines to bring together information from different source systems, i.e. imaging and clinical data
* You prepare data for analytical or operational uses, i.e. by integration, consolidation, cleansing, and structuring of data
* You bring curiosity, problem-solving instinct and statistical and mathematical knowledge.

Requirements {#requirements}
- -------------

position
Data Scientist                           877
Data Analyst                             779
Data Engineer                            592
Data Protection/Governance Specialist    139
Architect                                134
Consultant/Advisor                       100
Software Engineer                         82
Data Manager                              65
Data Entry Specialist                     54
IaC Specialist                            50
Project Manager                           47
Product Manager                           46
Data Quality Specialist                   36
Facility Engineer                         30
Product Owner                             24
ML Ops                                     9
Tutor/Teacher                              2
Network Engineer                           1
Name: count, dtype: int64

In [240]:
set((1, 2)) & set((2, 3))

{2}

In [241]:
for cloud, skills_set in cloud_skills_sets.items():
    df_posting["skills"] = df_posting["skills"].map(
        lambda x: x | {cloud} if x & skills_set else x
    )

In [242]:
df_skills = (
    df_posting[df_posting["position"] == "Data Engineer"][["id", "position", "skills"]]
    .explode("skills")
    .dropna()
)

In [243]:
df_skills.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5359 entries, 5 to 2546
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5359 non-null   object
 1   position  5359 non-null   object
 2   skills    5359 non-null   object
dtypes: object(3)
memory usage: 167.5+ KB


In [249]:
df_skills.skills.value_counts().head(25)

skills
Python                   481
SQL                      413
Cloud                    373
Microsoft Excel          244
Amazon Web Services      244
Azure                    222
Spark                    175
Airflow                  150
Databricks               134
BI                       133
Agile                    126
Apache Kafka             117
Snowflake                117
Google Cloud Platform    107
Terraform                102
DevOps                   102
Docker                    89
Orchestration             89
ML                        87
Kubernetes                85
Java                      83
PostgreSQL                82
Google BigQuery           78
Streaming                 74
NoSQL                     69
Name: count, dtype: int64

In [247]:
df_skills.skills.value_counts().tail(10)

skills
QlikView              2
MDM                   2
Azure Blob Storage    2
Columnar DBMS         1
Azure Function        1
DLT                   1
SISTRA                1
Power Query           1
VBA                   1
UC                    1
Name: count, dtype: int64

{'Google BigQuery', 'Google Cloud Platform'}

({'Google BigQuery', 'Google Cloud Platform'},
 {'Azure',
  'Azure Blob Storage',
  'Azure Cosmos',
  'Azure Data Factory',
  'Azure Synapse Analytics'},
 {'Amazon Athena',
  'Amazon Firehose',
  'Amazon Glue',
  'Amazon Redshift',
  'Amazon S3',
  'Amazon Web Services'})