## Data Validator
Notebook that reads in old scraped job advertisement data and checks for missing information.

If there is missing informtion, this notebook has the functionality to fill it in.

In [15]:
import glob
import os
import requests
import re
import json

from bs4 import BeautifulSoup
from lxml import etree

import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# This information got a bit mangled in Cameron's original code, so we'll create a extra 
# variable to hold these strings in case they change in the future.
date_posted_str= "Date Posted"
job_url_str = "Job URL"

data_file_directory = "../data"

In [3]:
# TODO better to move all this stuff into its own module and import it.

In [4]:
def get_csv_files(data_directory):
    r"""Gets the path to all data files in directory
    

    Parameters
    ----------
    data_directory : str
        Path to the directory that contains the input .csv files.

    Returns
    -------
    List
        List of strings. Filepaths to the desired .csv files.
    """
    csv_file_path = os.path.join(data_directory, "*.csv")
    data_file_paths = glob.glob(csv_file_path)
    return data_file_paths

In [5]:
def get_urls_missing_date_posted(df):
    r"""Gets urls for jobs with missing date posted data.

    Reads in DataFrame and looks for missing date posted information, then returns a 
    DataFrame with all the urls to the jobs with missing date posted information.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing a column of data with date posted information and a column
        of information with original request url information.
    
    Returns
    -------
    Series
        Pandas Series (with ID) of urls to pages where posted date was missed.
    """
    no_dates = df[df[date_posted_str].isna()]
    urls = no_dates.loc[:,job_url_str]
    return urls


In [6]:
def has_missing_dates(df):
    """Checks if DataFrame has missing date posted information.
    
    Paramters
    ---------
    df : DataFrame
        DataFrame containing a column of data with date posted information and a column
        of information with original request url information.
    
    Returns
    -------
    bool
        True if there are missing date posted values. False if not.
    """
    return  df[date_posted_str].isnull().sum() > 0

In [7]:
def get_page_content(url):
    r"""Gets page content from a url
    
    Parameters
    ----------
    url : str
        Url of page to get content of.

    Returns
    -------
    bytes
        Page content of Response object in bytes. 

        Parsing the bytes of the page content is suppored by BeautifulSoup.
    """
    request_object = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    return request_object.content
    

In [50]:
def get_date_posted(page_content):
    r"""Parses date posted information from page content.
    
    
    Parameters
    ----------
    page_content : str
        String of page HTML dom.

    Returns
    -------
    str
        The date posted information from the page content.
    """
    soup = BeautifulSoup(page_content, "html.parser")
    dom = etree.HTML(str(soup))
    date_xpath_selector = '/html/head/script[6]'
    date_element = dom.xpath(date_xpath_selector)

    # GoogleTagManager variable inside of a script tag.
    unclean_json = date_element[0].text    
    unclean_json = unclean_json.strip()

    regex = re.compile(r'{.*};',re.DOTALL);
    cleaned_json = regex.search(unclean_json)
    cleaned_json = cleaned_json.group(0)

    cleaned_json = json.loads(cleaned_json)
    target_key = "JobDatePosted"
    return cleaned_json[target_key]


In [52]:
url = "https://jobs.chronicle.com/job/37310998/mellon-chair-in-political-science-/"
content = get_page_content(url)
get_date_posted(content)

AttributeError: 'NoneType' object has no attribute 'group'

In [9]:
def standardize_date_string(date_posted_str):
    r"""Takes date posted string from page content and converts it to desired format.
    
    Converts date string to the form yyyy-mm-dd. Date string format may change overtime.
    
    Parameters
    ----------
    date_posted_str : str
        String of the date posted from the website content.

    Returns
    -------
    Timestamp
        Standardized date posted information.
    """
    return pd.to_datetime(date_posted_str, infer_datetime_format=True, format="%Y/%m/%d")

In [10]:
def get_date_posted_from_url(url):
    r"""Get standardized date posted time from a job url
    
    Parameters
    ----------
    url : str
        Url of page to get content of.

    Returns
    -------
    Timestamp
        Standardized date posted information for a job listing.
    """
    page_content = get_page_content(url)
    date = get_date_posted(page_content)
    return standardize_date_string(date)

In [11]:
def fill_in_missing_posting_dates_form_csv(csv_data, output_file_name, output_path):
    r"""Fills in missing date information for csv file.

    Fills in missing date posted information from `csv_data`, and saves
    updated file of same name as input to specified directory.

    Parameters
    ----------
    csv_data : DataFrame
        DataFrame containing a column of data with date posted information and a column
        of information with original request url information.

    output_file_name : str
        Filename to save file under

    output_path : str
        Output directory path to write the csv files to.
    
    """
    output_path = os.path.join(output_path, output_file_name)

    missing_posted_date_urls = get_urls_missing_date_posted(csv_data)

    # Converting Series to DataFrame so we can see a tqdm progress bar.
    missing_posted_date_urls_frame = missing_posted_date_urls.to_frame()

    posted_dates = missing_posted_date_urls_frame[job_url_str].progress_apply(lambda url: get_date_posted_from_url(url))
    csv_data[date_posted_str] = posted_dates
    csv_data.to_csv(output_path)

In [None]:
files = get_csv_files(data_file_directory)

for file in files:
    filename = os.path.basename(file)
    print("Processing {}".format(filename))

    data = pd.read_csv(file, index_col=0)

    if has_missing_dates(data):
        fill_in_missing_posting_dates_form_csv(data, filename, "../data_fixed_dates")
    else: 
        print("Skipped {}".format(filename))