In [2]:
# <-- Import libraries, custom functions, and load configuration & datasets <--

import yaml

import pandas as pd
import numpy as np
import datetime as dt
import re

# <-- Imports custom preprocessing functions from 'functions.py' <--

from functions import (drop_duplicates,
                       concat_dataframes,
                       remove_all_punctuation,
                       drop_irrelevant_columns,
                       standardize_column_names,
                       filter_by_regex_pattern,
                       standardize_dates
                       )

# <-- Loads YAML configuration to dynamically reference CSV output files. <--

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")
config

job_nyc1 = pd.read_csv(config['input_data']['file1'])
job_nyc2 = pd.read_csv(config['input_data']['file2'])

In [3]:
df_merged = concat_dataframes(job_nyc1,job_nyc2)

In [4]:
df_merged = standardize_column_names(df_merged) 

In [5]:
df_merged = drop_duplicates(df_merged,'job_id')

In [6]:
columns = ['job_id', 'posting_type', 'civil_service_title', 'title_classification', 
'title_code_no', 'full-time/part-time_indicator', 'work_location', 'division/work_unit',
'job_description', 'minimum_qual_requirements', 'additional_information', 'to_apply', 
'hours/shift', 'work_location_1', 'recruitment_contact', 'residency_requirement', 'posting_updated',
'process_date']

df_merged = drop_irrelevant_columns(df_merged, columns)

In [7]:
cat_cols = list(df_merged.select_dtypes(include=['object']))
cat_cols

['agency',
 'business_title',
 'level',
 'job_category',
 'career_level',
 'salary_frequency',
 'preferred_skills',
 'posting_date',
 'post_until']

In [8]:
cat_cols = ['agency',
 'business_title',
 'level',
 'job_category',
 'career_level',
 'salary_frequency',
 'preferred_skills']
 
df_merged = remove_all_punctuation(df_merged,cat_cols)

In [9]:
date_cols = ['posting_date', 'post_until']
df_merged = standardize_dates(df_merged,date_cols)

  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)


In [10]:
df_merged[['preferred_skills','job_category','career_level']]

Unnamed: 0,preferred_skills,job_category,career_level
0,must have a valid pe license,engineering architecture planning,experienced nonmanager
1,student must currently be enrolled at a colleg...,engineering architecture planning,student
2,,communications intergovernmental affairs techn...,experienced nonmanager
3,a baccalaureate degree from an accredited coll...,engineering architecture planning,experienced nonmanager
4,preference will be given to candidates with at...,engineering architecture planning,manager
...,...,...,...
10906,,administration human resources,experienced nonmanager
10957,,engineering architecture planning public safet...,experienced nonmanager
10961,,communications intergovernmental affairs,manager
11037,,communications intergovernmental affairs,manager


In [11]:
nunique_df = pd.DataFrame({
    'column': cat_cols,
    'nunique': [df_merged[col].nunique() for col in cat_cols]
})

nunique_df

Unnamed: 0,column,nunique
0,agency,67
1,business_title,3180
2,level,20
3,job_category,254
4,career_level,5
5,salary_frequency,3
6,preferred_skills,2527


In [12]:
df_merged['agency'].unique()

array(['dept of environment protection', 'bronx district attorney',
       'office of criminal justice', 'dept of design construction',
       'department of transportation', 'nyc housing authority',
       'dept of healthmental hygiene', 'nyc employees retirement sys',
       'housing preservation dvlpmnt', 'hradept of social services',
       'consumer and worker protection', 'dept of parks recreation',
       'law department', 'district attorney kings county',
       'dept of homeless services', 'department of correction',
       'nyc police pension fund', 'office of the comptroller',
       'office of emergency management', 'department of finance',
       'manhattan community board 12', 'admin for childrens svcs',
       'office of management budget', 'campaign finance board',
       'district attorneymanhattan', 'police department',
       'department of city planning', 'taxi limousine commission',
       'off of payroll administration', 'borough presidentbronx',
       'human rig

In [20]:
sorted(df_merged['level'].unique())

['00',
 '01',
 '02',
 '03',
 '04',
 '1a',
 '1b',
 '3a',
 '3b',
 '4a',
 '4b',
 'm1',
 'm2',
 'm3',
 'm4',
 'm5',
 'm6',
 'm7',
 'm8',
 'my']

In [14]:
df_merged['career_level'].unique()

array(['experienced nonmanager', 'student', 'manager', 'entrylevel',
       'executive'], dtype=object)

In [15]:
df_merged['salary_frequency'].unique()

array(['annual', 'hourly', 'daily'], dtype=object)

In [16]:
# Remove negative salaries
df_merged[(df_merged['salary_range_from'] <= 0) | (df_merged['salary_range_to'] <= 0)]
df_merged['salary_range_to'].max()
# df_data_analyst = df_data_analyst[(df_data_analyst['salary_range_from'] >= 0) & (df_data_analyst['salary_range_to'] >= 0)]
# df_keywords = df_keywords[(df_keywords['salary_range_from'] >= 0) & (df_keywords['salary_range_to'] >= 0)]

np.float64(293038.0)

In [17]:
display(pd.DataFrame({
    "non_nulls": df_merged.count(),
    "nulls": df_merged.isnull().sum(),
    "total": len(df_merged)
}))

Unnamed: 0,non_nulls,nulls,total
agency,5271,0,5271
#_of_positions,5271,0,5271
business_title,5271,0,5271
level,5271,0,5271
job_category,5271,0,5271
career_level,5271,0,5271
salary_range_from,5271,0,5271
salary_range_to,5271,0,5271
salary_frequency,5271,0,5271
preferred_skills,3338,1933,5271


In [18]:
# Create a salary midpoint column and sector (job_category) normalization

df_merged['salary_midpoint'] = df_merged[['salary_range_from', 'salary_range_to']].mean(axis=1)
df_merged['posting_year'] = df_merged['posting_date'].dt.year
#df_keywords['posting_month'] = df_keywords['posting_date'].dt.to_period('M')

# Create a boolean mask for IT/Cyber/Data jobs using regex on job_category or business_title
# Adjust the regex pattern as needed for the job market context
it_pattern = r'(it|information technology|cyber|security|data|analyst|engineer|developer|software|bi|ml|ai|python|sql|cloud)'
df_merged['is_tech'] = df_merged['job_category'].str.contains(it_pattern, na=False, case=False) | df_merged['business_title'].str.contains(it_pattern, na=False, case=False)

# For skills analysis, create a list of keywords for technical and soft skills:
tech_skills = ['python', 'sql', 'tableau', 'bi', 'cyber', 'eda', 'security', 'machine learning', 'llm', 'ml', 'ai', 'pandas', 'numpy', 'cloud', 'agile', 'java', 'c++', 'linux']
soft_skills = ['communication', 'teamwork', 'leadership', 'collaboration', 'problem solving', 'organization', 'management', 'writing', 'presentation']

  df_merged['is_tech'] = df_merged['job_category'].str.contains(it_pattern, na=False, case=False) | df_merged['business_title'].str.contains(it_pattern, na=False, case=False)
