# unstructured/cleaners/extract.py

import datetime
import re
from typing import List, Optional

from unstructured.nlp.patterns import (
    EMAIL_ADDRESS_PATTERN,
    EMAIL_DATETIMETZ_PATTERN,
    IP_ADDRESS_NAME_PATTERN,
    IP_ADDRESS_PATTERN_RE,
    MAPI_ID_PATTERN,
    US_PHONE_NUMBERS_RE,
)


def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
    if not isinstance(index, int) or index < 0:
        raise ValueError(f"The index is {index}. Index must be a non-negative integer.")

    regex_match = None
    for i, result in enumerate(re.finditer(pattern, text)):
        if i == index:
            regex_match = result

    if regex_match is None:
        raise ValueError(f"Result with index {index} was not found. The largest index was {i}.")

    return regex_match


def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Returns the portion of text that precedes a match of pattern. The first match
    (index 0) is used unless a different occurrence is selected with the index kwarg.

    Input
    -----
    text: The string to search
    pattern: The regular expression that marks where the extracted text ends
    index: Zero-based occurrence of the pattern to use
    strip: If True, removes trailing whitespace from the extracted string

    Errors from _get_indexed_match propagate, e.g. a ValueError when index is not a
    non-negative integer or exceeds the last match's index.
    """
    match = _get_indexed_match(text, pattern, index)
    preceding = text[: match.start()]
    if strip:
        return preceding.rstrip()
    return preceding


def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    text: The string to search
    pattern: The regular expression that marks where the extracted text begins
    index: Zero-based occurrence of the pattern to use
    strip: If True, removes leading whitespace from the extracted string

    Errors from _get_indexed_match propagate, e.g. a ValueError when index is not a
    non-negative integer or exceeds the last match's index.
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Everything from the end of the match to the end of the string.
    after_text = text[regex_match.end() :]
    return after_text.lstrip() if strip else after_text


def extract_email_address(text: str) -> List[str]:
    """Returns the re.findall results for EMAIL_ADDRESS_PATTERN in the text.

    The text is lower-cased before searching, so the results are lower-case.
    """
    lowered = text.lower()
    return re.findall(EMAIL_ADDRESS_PATTERN, lowered)


def extract_ip_address(text: str) -> List[str]:
    """Returns the re.findall results for IP_ADDRESS_PATTERN_RE in the text.

    The list is empty when nothing matches.
    """
    ip_addresses = re.findall(IP_ADDRESS_PATTERN_RE, text)
    return ip_addresses


def extract_ip_address_name(text: str) -> List[str]:
    """Returns the re.findall results for IP_ADDRESS_NAME_PATTERN in the text.

    The list is empty when nothing matches.
    """
    ip_address_names = re.findall(IP_ADDRESS_NAME_PATTERN, text)
    return ip_address_names


def extract_mapi_id(text: str) -> List[str]:
    """Returns the matches of MAPI_ID_PATTERN in the text with every ";" character
    removed from each match.
    """
    return [found.replace(";", "") for found in re.findall(MAPI_ID_PATTERN, text)]


def extract_datetimetz(text: str) -> Optional[datetime.datetime]:
    """Parses the first match of EMAIL_DATETIMETZ_PATTERN in the text into a datetime.

    The match is parsed with the format "%a, %d %b %Y %H:%M:%S %z". Returns None when
    the pattern does not occur in the text.
    """
    matches = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
    if not matches:
        return None

    # Only the first occurrence is used; later matches are ignored.
    first_match = matches[0]
    return datetime.datetime.strptime(first_match, "%a, %d %b %Y %H:%M:%S %z")


def extract_us_phone_number(text: str) -> str:
    """Extracts a US phone number from a section of text that includes a phone number. If there
    is no phone number present, the result will be an empty string.

    Only the first match of US_PHONE_NUMBERS_RE is returned, with surrounding whitespace
    stripped.

    Example
    -------
    extract_us_phone_number("Phone Number: 215-867-5309") -> "215-867-5309"
    """
    regex_match = US_PHONE_NUMBERS_RE.search(text)
    if regex_match is None:
        return ""

    # group() is the full matched substring, i.e. text[start:end] of the match span.
    return regex_match.group().strip()


def extract_ordered_bullets(text: str) -> tuple:
    """Extracts the leading ordered bullet of a text section, accounting for numeric
    and alphanumeric types. Only the first whitespace-delimited token is inspected.

    Output
    -----
    tuple(section, sub_section, sub_sub_section): Each bullet partition
    is a string or None if not present. (None, None, None) is returned when the text is
    empty or whitespace-only, when the first token has no "." or contains "..", or when
    the part before the first "." is longer than two characters.

    Example
    -------
    This is a very important point -> (None, None, None)
    1.1 This is a very important point -> ("1", "1", None)
    a.1 This is a very important point -> ("a", "1", None)
    1.2.3 This is a very important point -> ("1", "2", "3")
    """
    no_bullet = (None, None, None)

    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only input has no first token to inspect.
        return no_bullet

    first_token = tokens[0]
    if "." not in first_token or ".." in first_token:
        return no_bullet

    parts = first_token.split(".")
    if not parts[-1]:
        # A trailing "." (as in "1.") leaves an empty final part; drop it.
        parts.pop()

    # Long leading parts are treated as ordinary words rather than bullet markers.
    if len(parts[0]) > 2:
        return no_bullet

    section, *rest = parts
    sub_section = rest[0] if rest else None
    # Levels deeper than sub_sub_section are ignored.
    sub_sub_section = (rest[1] or None) if len(rest) > 1 else None
    return section, sub_section, sub_sub_section