In [13]:
# <-- Import libraries, custom functions, and load configuration & datasets <--

import yaml
import pandas as pd
import numpy as np
import datetime as dt
import re

# <-- Imports custom preprocessing functions from 'functions.py' <--

from functions import (drop_duplicates,
                       concat_dataframes,
                       remove_all_punctuation,
                       drop_irrelevant_columns,
                       standardize_column_names,
                       filter_by_regex_pattern,
                       standardize_dates
                       )

# <-- Loads YAML configuration to dynamically reference CSV input and output files. <--

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Keep the original hint, then re-raise: nothing below can run without the
    # configuration, and continuing with config=None only produced an unrelated
    # "'NoneType' object is not subscriptable" error further down.
    print("Yaml configuration file not found!")
    raise
except yaml.YAMLError as exc:
    # A malformed file is a different problem from a missing one; report it as such.
    print(f"Yaml configuration file could not be parsed: {exc}")
    raise

# yaml.safe_load returns None for an empty document; fail here with a clear message
# instead of at the first subscript below.
if not isinstance(config, dict):
    raise ValueError("Yaml configuration file is empty or is not a mapping!")

# <-- Loads the two raw datasets from the paths listed under 'input_data'. <--
job_nyc1 = pd.read_csv(config['input_data']['file1'])
job_nyc2 = pd.read_csv(config['input_data']['file2'])

In [None]:
# Column dtypes, non-null counts and memory usage of the first raw dataset.
job_nyc1.info()


In [2]:
# Number of missing values in the 'Preferred Skills' column of the first raw dataset.
job_nyc1["Preferred Skills"].isnull().sum()

np.int64(1375)

In [3]:
# Number of missing values in the 'Preferred Skills' column of the second raw dataset.
job_nyc2["Preferred Skills"].isnull().sum()

np.int64(2699)

In [4]:
# Frequency of each distinct 'Preferred Skills' value; dropna=False keeps NaN as its own row.
job_nyc1["Preferred Skills"].value_counts(dropna=False)

Preferred Skills
NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [None]:
# Per-column completeness table for the first raw dataset:
# non-null count, null count, and the total number of rows (same value on every line).
display(
    pd.DataFrame({"non_nulls": job_nyc1.count(), "nulls": job_nyc1.isna().sum()})
    .assign(total=len(job_nyc1))
)

In [None]:
# Per-column completeness table for the second raw dataset:
# non-null count, null count, and the total number of rows (same value on every line).
display(
    pd.DataFrame({"non_nulls": job_nyc2.count(), "nulls": job_nyc2.isna().sum()})
    .assign(total=len(job_nyc2))
)

In [14]:
# Combine both raw datasets into a single frame using the project helper.
# NOTE(review): presumably a row-wise concatenation — confirm in functions.py.
df_merged = concat_dataframes(job_nyc1,job_nyc2)

In [15]:
# Normalize the column labels with the project helper; judging by the column listing
# shown afterwards, names end up lower-case with spaces replaced by underscores.
df_merged = standardize_column_names(df_merged) 

In [16]:
# Show the column labels after standardization.
display(df_merged.columns)

Index(['job_id', 'agency', 'posting_type', '#_of_positions', 'business_title',
       'civil_service_title', 'title_classification', 'title_code_no', 'level',
       'job_category', 'full-time/part-time_indicator', 'career_level',
       'salary_range_from', 'salary_range_to', 'salary_frequency',
       'work_location', 'division/work_unit', 'job_description',
       'minimum_qual_requirements', 'preferred_skills',
       'additional_information', 'to_apply', 'hours/shift', 'work_location_1',
       'recruitment_contact', 'residency_requirement', 'posting_date',
       'post_until', 'posting_updated', 'process_date'],
      dtype='object')

In [None]:
# De-duplicate the merged frame on 'job_id' using the project helper.
# NOTE(review): which duplicate is kept (first/last) depends on the helper — confirm in functions.py.
df_merged = drop_duplicates(df_merged,'job_id')

In [None]:
# Columns handed to drop_irrelevant_columns for removal. 'job_id' is in this list,
# so it is dropped here, after the de-duplication step that relied on it.
columns = [
    'job_id',
    'posting_type',
    'civil_service_title',
    'title_classification',
    'title_code_no',
    'full-time/part-time_indicator',
    'work_location',
    'division/work_unit',
    'job_description',
    'minimum_qual_requirements',
    'additional_information',
    'to_apply',
    'hours/shift',
    'work_location_1',
    'recruitment_contact',
    'residency_requirement',
    'posting_updated',
    'process_date',
]

df_merged = drop_irrelevant_columns(df_merged, columns)

In [None]:
# Show the remaining column labels after the drop step.
display(df_merged.columns)

In [None]:
# Null count per remaining column of the merged frame.
display(df_merged.isnull().sum())

In [None]:
# Per-column completeness table for the merged frame:
# non-null count, null count, and the total number of rows (same value on every line).
display(
    pd.DataFrame({"non_nulls": df_merged.count(), "nulls": df_merged.isna().sum()})
    .assign(total=len(df_merged))
)

In [None]:
# Labels of the columns that pandas stores with the generic 'object' dtype.
cat_cols = df_merged.select_dtypes(include=['object']).columns.tolist()
cat_cols

In [None]:
# Text columns (dtype 'object') passed to remove_all_punctuation.
# The date columns are deliberately not part of this list.
cat_cols = [
    'agency',
    'business_title',
    'level',
    'job_category',
    'career_level',
    'salary_frequency',
    'preferred_skills',
]

df_merged = remove_all_punctuation(df_merged, cat_cols)

In [None]:
# Preview three of the text columns after the punctuation cleanup.
df_merged[['preferred_skills','job_category','career_level']]

In [None]:
# Preview the two date columns before they are standardized.
df_merged[['posting_date', 'post_until']]

In [None]:
# Run the two date columns through the project helper standardize_dates.
# NOTE(review): the resulting dtype/format is decided by the helper — confirm in functions.py.
date_cols = ['posting_date', 'post_until']
df_merged = standardize_dates(df_merged,date_cols)

In [None]:
# Preview the two date columns again, now after standardization.
df_merged[['posting_date', 'post_until']]

In [None]:
# Column dtypes, non-null counts and memory usage of the cleaned merged frame.
df_merged.info()

In [None]:
# Look at the 'business_title' values that the title filter below matches against.
display(df_merged['business_title'])

In [None]:
# Keep postings whose business title contains one of the target data roles.
# \w* after the group also accepts suffixed forms such as "data analysts".
# The original alternation listed "data analyst" twice; the redundant copy is removed,
# which leaves the set of matching rows unchanged.
# NOTE(review): the duplicate looks like a copy-paste slip — if a third title
# (e.g. "data scientist") was intended, add it to the alternation.
# NOTE(review): case sensitivity is decided by filter_by_regex_pattern — confirm in functions.py.
regex_pattern = r"\b(data analyst|data engineer)\w*\b"
df_data_analyst = filter_by_regex_pattern(df_merged,'business_title', regex_pattern)

In [None]:
# Column dtypes, non-null counts and memory usage of the title-filtered subset.
df_data_analyst.info()

In [None]:
# Per-column completeness table for the title-filtered subset:
# non-null count, null count, and the total number of rows (same value on every line).
display(
    pd.DataFrame({"non_nulls": df_data_analyst.count(), "nulls": df_data_analyst.isna().sum()})
    .assign(total=len(df_data_analyst))
)

In [None]:
# Keep postings whose preferred skills mention at least one of the target keywords.
# Two fixes compared with the previous pattern:
#   * "phyton" was a typo for "python" and could never match the intended skill;
#   * \b anchors make every keyword match as a whole word, so short tokens such as
#     "bi", "ai", "ml" or "eda" no longer match inside unrelated words
#     ("ability", "maintain", "html", "dedicated"). This also brings the pattern in
#     line with the \b-anchored business-title pattern used earlier in the notebook.
# NOTE(review): case sensitivity is decided by filter_by_regex_pattern — confirm in functions.py.
regex_pattern = r"\b(sql|tableau|bi|python|eda|llm|ai|ml|pandas|numpy|agile)\b"
df_keywords = filter_by_regex_pattern(df_merged,'preferred_skills', regex_pattern)

In [None]:
# Column dtypes, non-null counts and memory usage of the keyword-filtered subset.
df_keywords.info()

In [None]:
# Per-column completeness table for the keyword-filtered subset:
# non-null count, null count, and the total number of rows (same value on every line).
display(
    pd.DataFrame({"non_nulls": df_keywords.count(), "nulls": df_keywords.isna().sum()})
    .assign(total=len(df_keywords))
)

In [None]:
# Write the three result frames to the CSV paths configured under 'output_data':
# file1 <- full cleaned frame, file2 <- title-filtered subset, file3 <- keyword-filtered subset.
for output_key, frame in (
    ('file1', df_merged),
    ('file2', df_data_analyst),
    ('file3', df_keywords),
):
    frame.to_csv(config['output_data'][output_key], index=False, sep=",", encoding="utf-8")