<a href="https://colab.research.google.com/github/Sidd-Shanmuhavel/Job-Search-Optimization-using-NLP/blob/master/cleanPage_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install justext
!pip install unidecode



In [None]:
from collections import Counter
import lxml
import os
import string
import re

import html
import justext
from unidecode import unidecode

In [None]:
import urllib

In [None]:
def parse_html(page):
    """ Extract the non-boilerplate text of an HTML page.

    The page is handed to jusText together with its English stoplist.
    Every paragraph jusText does not flag as boilerplate is kept, and the
    kept paragraphs are joined with a blank line between them.

    Returns '' (after printing a short notice) when lxml raises a
    ParserError or when a UnicodeDecodeError is raised during extraction.
    """
    try:
        segments = justext.justext(page, justext.get_stoplist('English'))
    except lxml.etree.ParserError:
        print('Page empty')
        return ''
    except UnicodeDecodeError:
        print("Can't decode utf-8")
        return ''

    # Keep only the paragraphs classified as real content.
    kept = [segment.text for segment in segments if not segment.is_boilerplate]
    return '\n\n'.join(kept)

In [None]:
def remove_non_alphanumeric(txt):
    """ Replace each run of disallowed characters with one space.

    ASCII letters, digits, the space and the period ('.') are kept as-is;
    any maximal run of other characters (punctuation, newlines, non-ASCII
    text, ...) becomes a single ' '.
    """
    disallowed_run = re.compile(r'[^a-zA-Z0-9 .]+')
    return disallowed_run.sub(' ', txt)


def remove_non_alpha(txt):
    """ Delete everything that is not an ASCII letter or a space.

    Unlike remove_non_alphanumeric, the removed characters are dropped
    outright rather than replaced by a space.
    """
    non_alpha_run = re.compile(r'[^a-zA-Z ]+')
    return non_alpha_run.sub('', txt)


def transliterate(txt):
    """ Return txt with non-Latin characters spelled out in Latin letters.

    This is a thin wrapper around unidecode. The original author's example
    is the two-character Chinese name of Beijing, which comes back as
    'Bei Jing' (exact spacing of the result not verified here).
    """
    latin_txt = unidecode(txt)
    return latin_txt

In [None]:
def collapse_white_spaces(txt):
    """Collapse every run of consecutive spaces into a single space.

    Only the ASCII space character (' ') is affected; tabs, newlines and
    any other whitespace are left untouched, and a lone space is kept.
    """
    # A single regex pass replaces the previous character-by-character
    # ``+=`` loop, whose repeated string concatenation can degrade to
    # quadratic time on long pages. The output is unchanged.
    return re.sub(r' {2,}', ' ', txt)

In [None]:
def connect_lines(txt, line_sep='\n'):
    """ Re-join lines that were broken mid-sentence by a crawler.

    The text is split on '\n' and every line is stripped. Consecutive
    non-empty lines are glued together, each one followed by a single
    space. Every empty line closes the paragraph collected so far (which
    is emitted followed by '\n') and then contributes one line_sep to the
    output. Whatever is still pending at the end is appended as-is, so a
    non-empty result always ends with a trailing space or a separator.
    """
    finished = []   # completed paragraphs and separators, in output order
    pending = []    # stripped lines of the paragraph being assembled

    for raw_line in txt.split('\n'):
        stripped = raw_line.strip()
        if stripped:
            pending.append(stripped)
            continue

        # Empty line: flush the current paragraph (if any), then record
        # the separator for this empty line.
        if pending:
            finished.append(''.join(piece + ' ' for piece in pending) + '\n')
            pending = []
        finished.append(line_sep)

    remainder = ''.join(piece + ' ' for piece in pending)
    return ''.join(finished) + remainder

In [None]:
def clean_page(page):
    """ Turn a downloaded page into plain, transliterated text.

    page may be raw bytes (as read from an HTTP response) or an already
    decoded str. Bytes are decoded as UTF-8; if that fails the error is
    printed and the undecoded bytes are passed on to parse_html unchanged.

    Returns '' when the page is empty or whitespace only. Otherwise
    returns the text extracted by parse_html, run through transliterate
    and then html.unescape.
    """
    # Only byte strings have .decode(); previously a str argument crashed
    # here with AttributeError before any cleaning could happen.
    if isinstance(page, (bytes, bytearray)):
        try:
            page = page.decode('utf-8')
        except UnicodeDecodeError as e:
            print("Can't decode", e)

    page = page.strip()
    if not page:
        return ''
    txt = parse_html(page)
    txt = transliterate(txt)
    # Resolve any HTML entities that are still present in the text.
    txt = html.unescape(txt)
    return txt

In [None]:
def find_unprintable(txt):
    """Count the characters of txt that are not in string.printable.

    Returns a Counter mapping each such character to its number of
    occurrences; the Counter is empty when every character is printable.
    """
    allowed = frozenset(string.printable)
    return Counter(ch for ch in txt if ch not in allowed)


def replace_unprintable(txt):
    """Replace non-printable characters with printable characters

    The replacement table is read from '<dir_path>/unprintable_chars.txt',
    one 'character:replacement' pair per line. Characters that are in
    string.printable are copied through unchanged.

    Raises KeyError when txt contains an unprintable character that has no
    entry in the table, and IndexError when a line of the table contains
    no ':'.

    NOTE(review): dir_path is not defined in this function; it is assumed
    to be a module-level name set elsewhere - confirm before use.
    """
    printable = set(string.printable)

    # Use a context manager so the mapping file is closed deterministically;
    # the previous open(...).readlines() never closed the handle.
    chars = {}
    with open(f'{dir_path}/unprintable_chars.txt', 'r') as mapping_file:
        for line in mapping_file:
            # Parse each line once instead of stripping/splitting it twice.
            fields = line.strip().split(':')
            chars[fields[0]] = fields[1]

    return ''.join([c if c in printable else chars[c] for c in txt])

In [None]:
def download_page(link, context=None, timeout=None):
    """
    Download link and return its raw content.

    link: URL to fetch.
    context: forwarded to urllib.request.urlopen as its ``context`` argument.
    timeout: timeout forwarded to urlopen; when None, urlopen's own default
        is used.

    On success the page body is returned on its own, as read from the
    response (the status code 0 is NOT included). This shape is kept
    because the caller in this file passes the result straight on as the
    page.

    On failure a tuple ``(code, '')`` is returned, where code is:
    1: bad_url (invalid URL, HTTP error, URL error, HTTP protocol error)
    2: unicode error (write to non_ascii_urls)
    3: bad_connection_urls (connection error or timeout)
    """
    # Imported here so the function is self-contained: the notebook only
    # does a bare ``import urllib`` and never imports http.client or
    # socket, so the except clauses below used to raise NameError as soon
    # as one of them had to be evaluated.
    import http.client
    import socket
    import urllib.error
    import urllib.request

    try:
        req = urllib.request.Request(link)
    except ValueError:
        print(link, "doesn't exist.")
        return 1, ''
    except ConnectionResetError:
        print('ConnectionResetError', link)
        return 3, ''

    # Only pass timeout when the caller supplied one, so urlopen keeps its
    # own default otherwise.
    open_kwargs = {'context': context}
    if timeout is not None:
        open_kwargs['timeout'] = timeout

    try:
        response = urllib.request.urlopen(req, **open_kwargs)
    except UnicodeError:
        print('UnicodeError for', link)
        return 2, ''
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be tested first.
        print('Error {} for {}'.format(e.code, link))
        return 1, ''
    except urllib.error.URLError:
        print('URLError for', link)
        return 1, ''
    except http.client.HTTPException:
        # Also covers http.client.RemoteDisconnected, which is a subclass
        # of HTTPException; its separate clause could never be reached.
        print('HTTPException', link)
        return 1, ''
    except (ConnectionError, socket.timeout):
        print('ConnectionError or Timeout', link)
        return 3, ''

    try:
        # Close the response even when reading fails part-way through.
        with response:
            page = response.read()
    except http.client.HTTPException:
        print('HTTPException', link)
        return 1, ''
    except (ConnectionError, socket.timeout):
        print('ConnectionError or Timeout', link)
        return 3, ''
    return page

In [None]:
def callAll(page):
    """ Run the full cleaning pipeline on a downloaded page.

    The page goes through clean_page and parse_html, and the resulting
    text is then passed, in order, through remove_non_alphanumeric,
    transliterate, collapse_white_spaces and connect_lines.
    """
    # NOTE(review): clean_page already calls parse_html internally, so the
    # extraction runs twice here - confirm that this is intended.
    txt = parse_html(clean_page(page))

    # remove_non_alpha is deliberately not part of the pipeline.
    text_steps = (
        remove_non_alphanumeric,
        transliterate,
        collapse_white_spaces,
        connect_lines,
    )
    for step in text_steps:
        txt = step(txt)
    return txt

In [None]:
# Example job posting used to exercise the pipeline end to end.
link = 'https://job-openings.monster.ie/credit-risk-analyst-dublin-dublin-south-dublin-ie-adecco-retail/219418270'
# NOTE(review): download_page returns the page alone on success but a
# (code, '') tuple on failure, so `page` is only usable as a page if the
# download succeeded.
page = download_page(link)

In [None]:
# Clean the downloaded page; the returned text is the cell's output below.
callAll(page)

'Credit Risk Analyst Dublin Description We are currently recruiting on behalf of our client for an Account Trading and Credit Analysis Lead. The successful candidate will be dealing directly with external clients be responsible for credit management and forecasting with previous manufacturing distributor commercial experience. Key Responsibilities Be a key point of contact for the customer and reseller base to ensure good flow of information for all stakeholders Be the owner of commercial detail oAnticipated revenues credit terms phasing and seasonality and work with insurers to secure requirements Have a good understanding the timing of specific deals is also essential Engaging directly with Resellers to confirm setups and gather additional information required Negotiating conditions of purchase or sale requesting gathering and interpreting financials building project Client interface to establish readiness to accept orders Reverting daily on queries to sales team Evaluate all availab