In [1]:
%load_ext jupyter_black

In [39]:
import pickle
from google.cloud import bigquery
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
from collections import namedtuple
from typing import Iterable, Literal
from copy import deepcopy
import re

In [3]:
import warnings
import sys
import os

In [4]:
from collections import namedtuple

In [5]:
# BigQuery settings for this notebook.
# NOTE(review): `project`, `dataset_name` and `location` are not referenced by any
# other cell visible in this notebook (the postings query hard-codes the project
# id) — confirm they are still needed.
project = "x-avenue-450615-c3"
dataset_name = "jobs_postings"
location = "europe-west1"

# The client is created without arguments, so none of the values above are passed to it.
bq_client = bigquery.Client()

In [6]:
# Resolve the directories this notebook imports from, relative to the current
# working directory, and make sure each one is on sys.path exactly once.
CURRENT_DIRECTORY = os.getcwd()
PROJECT_ROOT = os.path.abspath(
    os.path.join(CURRENT_DIRECTORY, "../pipelines/rapidapi_jobs_posting/")
)
REPO_ROOT = os.path.abspath(os.path.join(CURRENT_DIRECTORY, ".."))

for _import_root in (PROJECT_ROOT, REPO_ROOT):
    # Skip directories that are already importable so re-running the cell is a no-op.
    if _import_root in sys.path:
        continue
    sys.path.append(_import_root)
    print(f"{_import_root} was added to sys.path")

/home/jovyan/work/repos/jobs-research/pipelines/rapidapi_jobs_posting was added to sys.path
/home/jovyan/work/repos/jobs-research was added to sys.path


In [7]:
from common.utils import bq_table_to_df

In [8]:
# Google Sheet that is loaded below as the keyword -> result rule table.
sheet_url = "https://docs.google.com/spreadsheets/d/1clAiWIVMD5bCJRHJr9-p2vw9h99W5sByAtqThIGREpo/edit?gid=0#gid=0"
# Turn the browser "edit" link into a CSV export link by swapping the path suffix.
# The trailing "#gid=0" fragment is kept as-is. If the URL does not contain
# "/edit?gid=", replace() changes nothing and the original URL is used unchanged.
csv_export_url = sheet_url.replace("/edit?gid=", "/export?format=csv&gid=")

In [205]:
# Display the derived export URL to verify the rewrite above.
csv_export_url

'https://docs.google.com/spreadsheets/d/1clAiWIVMD5bCJRHJr9-p2vw9h99W5sByAtqThIGREpo/export?format=csv&gid=0#gid=0'

In [206]:
# Download the rule sheet as CSV straight from the export URL (needs network access).
rules_df = pd.read_csv(csv_export_url)

In [207]:
# Inspect schema and null counts. In the recorded run `keyword` is filled for
# only 18 of 79 rows; the other three columns have no nulls.
rules_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   keyword           18 non-null     object
 1   result            79 non-null     object
 2   case_sensitive    79 non-null     bool  
 3   spaces_sensitive  79 non-null     bool  
dtypes: bool(2), object(2)
memory usage: 1.5+ KB


In [208]:
# Regex character class of punctuation that is treated as a word separator
# (each matching character is replaced by a single space) in spaces-sensitive mode.
REPLACE_WITH_SPACES = r"[!\"$\%'()\+,\-./:;?]"


def prepare_text(
    text: str,
    case_sensitive: bool,
    spaces_sensitive: bool,
    replace_with_spaces=REPLACE_WITH_SPACES,
) -> str:
    """Normalise a keyword or a searched text so both can be compared with ``in``.

    Args:
        text: Raw keyword or text to normalise.
        case_sensitive: When False the text is lower-cased.
        spaces_sensitive: When False every whitespace character is removed, so
            "Big Query" and "BigQuery" normalise to the same string. When True
            the text is prepared for whole-word matching: separator punctuation
            and whitespace become single spaces and the result is wrapped in
            exactly one leading and one trailing space.
        replace_with_spaces: Regex of characters that act as word separators in
            spaces-sensitive mode.

    Returns:
        The normalised string.
    """
    if not case_sensitive:
        text = text.lower()

    if not spaces_sensitive:
        # Remove tabs/newlines as well as plain spaces; otherwise "Big\nQuery"
        # would not collapse to "bigquery".
        return re.sub(r"\s", "", text)

    text = re.sub(replace_with_spaces, " ", text)
    # Tabs and newlines are word boundaries too. Each one maps to exactly one
    # space (no run-collapsing), so space-only inputs normalise as before.
    text = re.sub(r"\s", " ", text)
    # Pad with spaces so a keyword like " BI " can only match a whole word.
    return " " + text.strip() + " "

In [209]:
class MappingDict:
    """A keyword -> result dictionary whose keys share one matching mode.

    Attributes are plain and public:
        rules: mapping of keyword to result; keys are normalised by ``prepare``.
        case_sensitive / spaces_sensitive: flags forwarded to ``prepare_text``.
        is_prepared: True once ``prepare`` has normalised the keys.
    """

    def __init__(
        self,
        rules: dict,
        case_sensitive: bool = False,
        spaces_sensitive: bool = False,
    ):
        self.rules = rules
        self.case_sensitive = case_sensitive
        self.spaces_sensitive = spaces_sensitive
        self.is_prepared = False

    def prepare(self):
        """Normalise every keyword with ``prepare_text`` (once) and return ``self``.

        Warns (UserWarning) when:
          * the dictionary was already prepared (nothing is done in that case);
          * a spaces-sensitive keyword contains separator punctuation, because
            that character is turned into a space;
          * two keywords normalise to the same key but map to different results
            (the later one wins, as before).
        """
        if self.is_prepared:
            warnings.warn("MappingDict is already prepared.", UserWarning)
            return self

        prepared_rules = {}
        for key, val in self.rules.items():
            prepared_key = prepare_text(
                key, self.case_sensitive, self.spaces_sensitive
            )

            if self.spaces_sensitive:
                # Report only characters the separator pattern really matched.
                # Iterating over the pattern *source* would also flag "[", "]"
                # and "\" (never replaced) and repeat the warning for "\".
                for char in dict.fromkeys(re.findall(REPLACE_WITH_SPACES, key)):
                    warnings.warn(
                        f"'{char}' in the keyword '{key}' was replaced with a space.",
                        UserWarning,
                    )

            if prepared_key in prepared_rules and prepared_rules[prepared_key] != val:
                # Distinct raw keywords can collapse onto one prepared key
                # (e.g. "Big Query" / "bigquery"); make the overwrite visible.
                warnings.warn(
                    f"Keyword '{key}' collides with another keyword after "
                    f"normalisation; result '{prepared_rules[prepared_key]}' "
                    f"is replaced by '{val}'.",
                    UserWarning,
                )
            prepared_rules[prepared_key] = val

        self.rules = prepared_rules
        self.is_prepared = True
        return self

In [228]:
class MappingRules:
    def __init__(self, attr_name: str, rules_df: pd.DataFrame):
        self.attr_name = attr_name
        self.rules_df = rules_df[
            ["keyword", "result", "case_sensitive", "spaces_sensitive"]
        ]

        self._check_for_nulls()
        self.rules_df["keyword"] = self.rules_df.apply(
            lambda x: x.result if pd.isna(x.keyword) else x.keyword, axis="columns"
        )
        self._check_keywords_uniqueness()
        self._is_prepared = False

    def _check_for_nulls(self):
        for col in ["result", "case_sensitive", "spaces_sensitive"]:
            if rules_df[col].isna().sum() > 0:
                raise ValueError(
                    f"'{col}' can not be Null. Fix mapping rules for {self.attr_name}."
                )

    def _check_keywords_uniqueness(self):
        non_unique_keywords = self.rules_df["keyword"].value_counts()[lambda x: x > 1]
        if len(non_unique_keywords) > 0:
            non_unique_keywords_str = ", ".join(non_unique_keywords.index)
            raise ValueError(
                f"Keywords must be unique. Fix mapping rules for {attr_name}: {non_unique_keywords_str}."
            )

    def prepare(self):
        if not self._is_prepared:
            self.map_dicts = list(
                self.rules_df.set_index("keyword")
                .groupby(by=["case_sensitive", "spaces_sensitive"])
                .agg(dict)
                .rename({"result": "mapping_dict"}, axis="columns")
                .reset_index()
                .apply(
                    lambda x: MappingDict(
                        rules=x.mapping_dict,
                        case_sensitive=x.case_sensitive,
                        spaces_sensitive=x.spaces_sensitive,
                    ),
                    axis="columns",
                )
            )
            self.map_dicts_prepared = [
                mapping_dict.prepare() for mapping_dict in self.map_dicts
            ]
            self._is_prepared = True
        else:
            warnings.warn("MappingRules are already prepared.", UserWarning)

    def apply(
        self, texts: Iterable[str], find: Literal["any", "all"] = "all"
    ) -> str | set[str] | None:

        if not self._is_prepared:
            self.prepare()

        if find == "all":
            result = set()
        if find == "any":
            result = None

        for mapping_dict in self.map_dicts_prepared:
            for text in texts:
                text = prepare_text(
                    text, mapping_dict.case_sensitive, mapping_dict.spaces_sensitive
                )
                for key, val in mapping_dict.rules.items():
                    if key in text:
                        print(key, val)
                        if find == "all":
                            result.add(val)
                        if find == "any":
                            return val
        return result

In [229]:
# Build the skills rules from the sheet; the constructor runs the null and
# keyword-uniqueness checks.
skills_rules = MappingRules("skills", rules_df)

In [230]:
# A bare "BI" matches: the text is padded with spaces before the whole-word lookup.
skills_rules.apply(["BI"], "all")

 BI  BI


{'BI'}

In [231]:
# Surrounding spaces in the input are stripped and re-added, so the result is the same.
skills_rules.apply([" BI "], "all")

 BI  BI


{'BI'}

In [232]:
# Trailing "!" is replaced by a space, so "BI!" still matches as a whole word.
skills_rules.apply(["hI BI!"], "all")

 BI  BI


{'BI'}

In [233]:
# Lower-case "bi" gives no match in the recorded run (the BI rule behaves as case-sensitive).
skills_rules.apply([" bi "], "all")

set()

In [234]:
# "BI" as the prefix of a longer word gives no match in the recorded run.
skills_rules.apply([" BINGO!"], "all")

set()

In [235]:
# Mixed-case "BigQuery" matches the lower-case "bigquery" key in the recorded run.
skills_rules.apply(["BigQuery"], "all")

bigquery Google BigQuery


{'Google BigQuery'}

In [236]:
# Surrounding parentheses do not prevent the substring match.
skills_rules.apply(["(BigQuery)"], "all")

bigquery Google BigQuery


{'Google BigQuery'}

In [237]:
# The inner space is removed for this rule, so "Big Query" also matches "bigquery".
skills_rules.apply(["(Big Query)"], "all")

bigquery Google BigQuery


{'Google BigQuery'}

In [238]:
# Several keywords glued together with "(" and "/" are all found in one string.
skills_rules.apply(["piu-piu(bigquery/Hive/CosmosDB)"], "all")

cosmosdb Azure Cosmos
bigquery Google BigQuery
hive Hive


{'Azure Cosmos', 'Google BigQuery', 'Hive'}

In [119]:
# Load the complete job-postings table into a DataFrame.
# Plain string literal: the query has no placeholders to interpolate.
df_posting_load_query = """
select *
from `x-avenue-450615-c3.jp.jobs` as jp
"""
df_posting = (
    bq_client
    .query(df_posting_load_query)
    .to_dataframe()
)

In [239]:
# Apply the rules to 10 randomly chosen descriptions. sample() is not seeded,
# so a re-run picks different rows than the recorded output.
df_posting.sample(10).description.map(lambda x: skills_rules.apply([x]))

spark Spark
unity Unity
llm LLM
pytorch PyTorch
tensorflow TensorFlow
unity Unity
powerbi PowerBI
unity Unity
llm LLM
 BI  BI
unity Unity
unity Unity
unity Unity
hadoop Hadoop
spark Spark
unity Unity
sql SQL
python Python
 S3  AWS S3
 BI  BI


2595                                     {Spark, Unity}
873                          {PyTorch, TensorFlow, LLM}
1739                                            {Unity}
1906                          {LLM, BI, Unity, PowerBI}
1005                                            {Unity}
3365                                                 {}
3826                                            {Unity}
2700                                            {Unity}
3758                                                 {}
3918    {Python, Hadoop, AWS S3, SQL, Unity, BI, Spark}
Name: description, dtype: object

In [222]:
# Spot-check one posting by position (row 1058); the recorded run found no skills in it.
skills_rules.apply([df_posting.description.iloc[1058]], "all")

set()