In [70]:
import yaml

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from functions import (drop_duplicates,
                       concat_dataframes,
                       remove_all_punctuation,
                       drop_irrelevant_columns,
                       standardize_column_names,
                       filter_by_regex_pattern,
                       standardize_dates
                       )

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")
config
if config is not None:
    job_nyc1 = pd.read_csv(config['input_data']['file1'])
    job_nyc2 = pd.read_csv(config['input_data']['file2'])
else:
    print("Cannot load data files because config is missing.")


In [None]:
display(job_nyc1)


In [2]:
job_nyc1["Preferred Skills"].isnull().sum()

np.int64(1375)

In [None]:
job_nyc2["Preferred Skills"].isnull().sum()

In [None]:
# job_nyc["Preferred Skills"].value_counts(dropna=False)

In [15]:
display(pd.DataFrame({
    "non_nulls": job_nyc1.count(),
    "nulls": job_nyc1.isnull().sum(),
    "total": len(job_nyc1)
}))

Unnamed: 0,non_nulls,nulls,total
Job ID,5069,0,5069
Agency,5069,0,5069
Posting Type,5069,0,5069
# Of Positions,5069,0,5069
Business Title,5069,0,5069
Civil Service Title,5069,0,5069
Title Classification,5069,0,5069
Title Code No,5069,0,5069
Level,5069,0,5069
Job Category,5069,0,5069


In [16]:
display(pd.DataFrame({
    "non_nulls": job_nyc2.count(),
    "nulls": job_nyc2.isnull().sum(),
    "total": len(job_nyc2)
}))

Unnamed: 0,non_nulls,nulls,total
Job ID,6030,0,6030
Agency,6030,0,6030
Posting Type,6030,0,6030
# Of Positions,6030,0,6030
Business Title,6030,0,6030
Civil Service Title,6030,0,6030
Title Classification,6030,0,6030
Title Code No,6030,0,6030
Level,6030,0,6030
Job Category,6030,0,6030


In [71]:
# merged = pd.concat([job_nyc1,job_nyc2], ignore_index = True)
df_merged = concat_dataframes(job_nyc1,job_nyc2)

In [72]:
# Standardize column names
df_merged = standardize_column_names(df_merged) 

In [73]:
display(df_merged.columns)

Index(['job_id', 'agency', 'posting_type', '#_of_positions', 'business_title',
       'civil_service_title', 'title_classification', 'title_code_no', 'level',
       'job_category', 'full-time/part-time_indicator', 'career_level',
       'salary_range_from', 'salary_range_to', 'salary_frequency',
       'work_location', 'division/work_unit', 'job_description',
       'minimum_qual_requirements', 'preferred_skills',
       'additional_information', 'to_apply', 'hours/shift', 'work_location_1',
       'recruitment_contact', 'residency_requirement', 'posting_date',
       'post_until', 'posting_updated', 'process_date'],
      dtype='object')

In [74]:
df_merged = drop_duplicates(df_merged,'job_id')

In [77]:
columns = ['job_id', 'posting_type', 'civil_service_title', 'title_classification', 
'title_code_no', 'full-time/part-time_indicator', 'work_location', 'division/work_unit',
'job_description', 'minimum_qual_requirements', 'additional_information', 'to_apply', 
'hours/shift', 'work_location_1', 'recruitment_contact', 'residency_requirement', 'posting_updated',
'process_date']

df_merged.drop(columns=columns, axis=1, inplace=True, errors="ignore")

# df_merged = drop_irrelevant_columns(df_merged, columns)

In [78]:
display(df_merged.columns)

Index(['agency', '#_of_positions', 'business_title', 'level', 'job_category',
       'career_level', 'salary_range_from', 'salary_range_to',
       'salary_frequency', 'preferred_skills', 'posting_date', 'post_until'],
      dtype='object')

In [47]:
display(pd.DataFrame({
    "non_nulls": df_merged.count(),
    "nulls": df_merged.isnull().sum(),
    "total": len(df_merged)
}))

Unnamed: 0,non_nulls,nulls,total
agency,5271,0,5271
#_of_positions,5271,0,5271
business_title,5271,0,5271
level,5271,0,5271
job_category,5271,0,5271
career_level,5271,0,5271
salary_range_from,5271,0,5271
salary_range_to,5271,0,5271
salary_frequency,5271,0,5271
preferred_skills,3338,1933,5271


In [79]:
cat_cols = list(df_merged.select_dtypes(include=['object']))
cat_cols

['agency',
 'business_title',
 'level',
 'job_category',
 'career_level',
 'salary_frequency',
 'preferred_skills',
 'posting_date',
 'post_until']

In [80]:
# cat_cols = list(df_merged.select_dtypes(include=['object']))
cat_cols = ['agency',
 'business_title',
 'level',
 'job_category',
 'career_level',
 'salary_frequency',
 'preferred_skills']
 
df_merged = remove_all_punctuation(df_merged,cat_cols)

In [81]:
df_merged['level'].nunique()

20

In [82]:
df_merged

Unnamed: 0,agency,#_of_positions,business_title,level,job_category,career_level,salary_range_from,salary_range_to,salary_frequency,preferred_skills,posting_date,post_until
0,DEPT OF ENVIRONMENT PROTECTION,2,Region Supervisor,00,Engineering Architecture Planning,Experienced nonmanager,53702.0,148745.0,Annual,Must have a valid PE License,06/08/2022,
1,DEPT OF ENVIRONMENT PROTECTION,1,2024BWS010Early Warning Remote Modeling Intern...,00,Engineering Architecture Planning,Student,16.0,16.0,Hourly,Student must currently be enrolled at a colle...,04/02/2024,11-JUN-2024
2,BRONX DISTRICT ATTORNEY,2,Intelligence Analyst,00,Communications Intergovernmental Affairs Tech...,Experienced nonmanager,60000.0,60000.0,Annual,,02/27/2024,26-FEB-2025
3,OFFICE OF CRIMINAL JUSTICE,1,Senior Coordinator Capital Projects,00,Engineering Architecture Planning,Experienced nonmanager,59116.0,80000.0,Annual,A baccalaureate degree from an accredited coll...,03/05/2024,29-MAY-2024
4,DEPT OF DESIGN CONSTRUCTION,1,Director,M2,Engineering Architecture Planning,Manager,64922.0,144066.0,Annual,Preference will be given to candidates with at...,11/06/2023,
...,...,...,...,...,...,...,...,...,...,...,...,...
10906,HRADEPT OF SOCIAL SERVICES,1,DEPUTY DIRECTOR OF ONBOARDING,01,Administration Human Resources,Experienced nonmanager,76301.0,85666.0,Annual,,10/18/2024,
10957,DEPT OF ENVIRONMENT PROTECTION,1,ENVIRONMENTAL SPECIALIST,03,Engineering Architecture Planning Public Safe...,Experienced nonmanager,95070.0,124935.0,Annual,,04/25/2025,
10961,OFFICE OF THE MAYOR,1,Deputy Communications Director,MY,Communications Intergovernmental Affairs,Manager,110000.0,145000.0,Annual,,05/29/2025,
11037,OFFICE OF THE MAYOR,1,Director for Digital Communications,MY,Communications Intergovernmental Affairs,Manager,130000.0,170000.0,Annual,,05/12/2025,


In [84]:
# df_merged_dates = standardize_dates(df_merged,'post_until')
# df_merged_dates

df_merged_dates = df_merged.copy()
# Convert 'post_until' to datetime (with your format)
df_merged_dates['post_until'] = pd.to_datetime(df_merged_dates['post_until'], format="%d-%b-%Y", errors='coerce')
# Format 'post_until' as string in 'dd-mm-YYYY'
df_merged_dates['post_until'] = df_merged_dates['post_until'].dt.strftime('%d-%m-%Y')

df_merged_dates

Unnamed: 0,agency,#_of_positions,business_title,level,job_category,career_level,salary_range_from,salary_range_to,salary_frequency,preferred_skills,posting_date,post_until
0,DEPT OF ENVIRONMENT PROTECTION,2,Region Supervisor,00,Engineering Architecture Planning,Experienced nonmanager,53702.0,148745.0,Annual,Must have a valid PE License,06/08/2022,
1,DEPT OF ENVIRONMENT PROTECTION,1,2024BWS010Early Warning Remote Modeling Intern...,00,Engineering Architecture Planning,Student,16.0,16.0,Hourly,Student must currently be enrolled at a colle...,04/02/2024,11-06-2024
2,BRONX DISTRICT ATTORNEY,2,Intelligence Analyst,00,Communications Intergovernmental Affairs Tech...,Experienced nonmanager,60000.0,60000.0,Annual,,02/27/2024,26-02-2025
3,OFFICE OF CRIMINAL JUSTICE,1,Senior Coordinator Capital Projects,00,Engineering Architecture Planning,Experienced nonmanager,59116.0,80000.0,Annual,A baccalaureate degree from an accredited coll...,03/05/2024,29-05-2024
4,DEPT OF DESIGN CONSTRUCTION,1,Director,M2,Engineering Architecture Planning,Manager,64922.0,144066.0,Annual,Preference will be given to candidates with at...,11/06/2023,
...,...,...,...,...,...,...,...,...,...,...,...,...
10906,HRADEPT OF SOCIAL SERVICES,1,DEPUTY DIRECTOR OF ONBOARDING,01,Administration Human Resources,Experienced nonmanager,76301.0,85666.0,Annual,,10/18/2024,
10957,DEPT OF ENVIRONMENT PROTECTION,1,ENVIRONMENTAL SPECIALIST,03,Engineering Architecture Planning Public Safe...,Experienced nonmanager,95070.0,124935.0,Annual,,04/25/2025,
10961,OFFICE OF THE MAYOR,1,Deputy Communications Director,MY,Communications Intergovernmental Affairs,Manager,110000.0,145000.0,Annual,,05/29/2025,
11037,OFFICE OF THE MAYOR,1,Director for Digital Communications,MY,Communications Intergovernmental Affairs,Manager,130000.0,170000.0,Annual,,05/12/2025,


In [63]:
regex_pattern = r"\bdata analyst\w*\b"
df_data_analyst = filter_by_regex_pattern(df_merged,'business_title', regex_pattern)

In [65]:
df_data_analyst.head(10)

Unnamed: 0,agency,#_of_positions,business_title,level,job_category,career_level,salary_range_from,salary_range_to,salary_frequency,preferred_skills,posting_date,post_until
0,DEPT OF HEALTHMENTAL HYGIENE,1,Cybersecurity Senior Data Analyst Audit Servic...,2,Health Technology Data Innovation,Experienced nonmanager,78795.0,110000.0,Annual,A baccalaureate BABS degree from an accredit...,1102024,09MAY2024
1,DEPT OF HEALTHMENTAL HYGIENE,1,Data Analyst Family and Youth Peer Support Pro...,2,Constituent Services Community Programs Healt...,Experienced nonmanager,82506.0,94882.0,Annual,Strong competence in data analysis and applica...,9202023,18MAY2024
2,DEPT OF HEALTHMENTAL HYGIENE,1,Data Analyst Bureau of Tuberculosis Control,1,Constituent Services Community Programs Healt...,Experienced nonmanager,70087.0,70087.0,Annual,,2152024,14JUN2024
3,DEPARTMENT OF INVESTIGATION,1,Associate Data Analyst,0,Communications Intergovernmental Affairs Poli...,Experienced nonmanager,85000.0,96000.0,Annual,Masters degree or advanced coursework in compu...,9152023,11JUL2024
4,DEPT OF HEALTHMENTAL HYGIENE,1,Data Analyst Bureau of Vital Statistics,2,Health Policy Research Analysis,Experienced nonmanager,82506.0,82506.0,Annual,A masters degree or doctorate from an accredi...,1112024,10MAY2024
5,OFFICE OF EMERGENCY MANAGEMENT,1,PROCESS AND DATA ANALYST HUMAN CAPITAL,1,Administration Human Resources Technology Dat...,Experienced nonmanager,65000.0,72500.0,Annual,Strong communication and writing skills and ab...,4152024,26MAY2024
6,DEPT OF HEALTHMENTAL HYGIENE,1,Data Analyst Bureau of Tuberculosis Control,2,Constituent Services Community Programs Healt...,Experienced nonmanager,68900.0,68900.0,Annual,,3062024,04JUL2024
7,NYC HOUSING AUTHORITY,1,Senior Business Intelligence Data Analyst,2,Technology Data Innovation,Experienced nonmanager,93288.0,120190.0,Annual,Experience developing implementing and maintai...,8022023,
8,DEPT OF HEALTHMENTAL HYGIENE,1,Data Analyst Bureau of Environmental Sciences ...,1,Health Policy Research Analysis,Experienced nonmanager,70087.0,70087.0,Annual,Experience with datamanagement systems explora...,3262024,24JUL2024
9,DEPT OF ENVIRONMENT PROTECTION,1,Data Analyst,0,Policy Research Analysis,Experienced nonmanager,84451.0,113550.0,Annual,,5122023,


In [None]:
regex_pattern = r"(sql|tableau|bi|phyton|eda|llm|ai|ml|pandas|NumPy|Agile)"
df_keywords = filter_by_regex_pattern(df_merged,'preferred_skills', regex_pattern)

df_keywords

# matches = merged[merged["Preferred_Skills"].str.contains(regex_pattern, case=False, na=False, regex=True)]

  mask = df[column].str.contains(regex_pattern, flags=re.IGNORECASE, na=False, regex=True)


In [None]:
merged.to_csv(config['output_data']['file1'], index=False, sep=";", encoding="utf-8")

In [None]:
df_keywords[['Agency', 'Business_Title', 'Job_Category', 'Career_Level', 'Preferred_Skills']] = df_keywords[['Agency', 'Business_Title', 'Job_Category', 'Career_Level', 'Preferred_Skills']].map(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x) if isinstance(x, str) else x)
df_keywords

In [None]:
regex_pattern = r"(sql|tableau|bi|phyton|eda|llm|ai|ml|pandas|NumPy|Agile)"
# Apply re.findall to each row in the 'Preferred_Skills' column
keyword_matches = merged['Preferred_Skills'].apply(lambda x: re.findall(regex_pattern, x, flags=re.IGNORECASE) if isinstance(x, str) else [])

# Filter rows where at least one keyword was found
df_keywords = merged[keyword_matches.apply(lambda matches: len(matches) > 0)].copy().reset_index(drop=True)