In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `common` package imported
# in the next cell resolve when the notebook runs from a sub-folder — confirm.
folder_path = '../'
sys.path.append(folder_path)

In [2]:
from common.extract_urls import ExtractMissingPersonsUrls

In [4]:
# Instantiate the URL extractor imported from common.extract_urls.
extract = ExtractMissingPersonsUrls()

In [5]:
# Collect the case URLs via the extractor.
# NOTE(review): the return type is not visible here — presumably an iterable
# of case-page URL strings; verify against ExtractMissingPersonsUrls.
urls = extract.extract_case_urls()

In [8]:
import re
import string
from abc import ABC
from datetime import date, datetime
from typing import Dict, List, Optional

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import Select, WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager



class BuildRawData:
    """
    Reads the raw key/value data of a case page out of a Selenium driver.

    The class never navigates: it only queries whatever page is currently
    loaded in the driver it was given.
    """

    def __init__(self, driver, timeout: float = 5):
        """
        Args:
            driver: Selenium driver (or any object exposing `find_elements`) used as the search root
            timeout: seconds to wait for each element to become visible (defaults to the original 5)
        """
        self.driver = driver
        self.timeout = timeout

    def _visible_elements(self, root, locator):
        """Wait until at least one element matching `locator` under `root` is visible and return them all."""
        return WebDriverWait(root, self.timeout).until(
            ec.visibility_of_any_elements_located(locator))

    def _first_visible(self, root, locator):
        """Return the first visible element matching `locator` under `root`."""
        return self._visible_elements(root, locator)[0]

    def _optional_text(self, root, locator) -> Optional[str]:
        """Return the lower-cased text of the first visible match, or None when it cannot be read."""
        try:
            return self._first_visible(root, locator).text.lower()
        except Exception:
            # Deliberately best-effort: these fields are optional on the page,
            # so any lookup failure (typically a wait timeout) maps to None.
            return None

    def build_raw_case_dict(self) -> Dict[str, Optional[str]]:
        """
        Builds a flat dictionary of the raw case data shown on the current page.
        Returns:
            Dictionary holding 'CASE_NUMBER', one item per case data row (the row key
            upper-cased with spaces replaced by underscores), then 'LOCATION_ROAD',
            'LOCATION_COUNTY', 'LOCATION_COUNTRY' and 'FINDING_PARTY'. The last four
            are lower-cased text, or None when the element could not be read.
        Raises:
            Whatever the Selenium wait raises (a timeout) when the page title, the case
            data container, its rows, a row's key/value or the map banner do not become
            visible in time.
        """
        case_data_dict = {}

        # Find the case number
        case_data_dict["CASE_NUMBER"] = self._first_visible(
            self.driver, (By.CLASS_NAME, 'PageTitle')).text

        # Find and extract case data
        case_data_raw = self._first_visible(self.driver, (By.CLASS_NAME, 'CaseData'))

        # All row entries are broken down into divs
        # These divs are split into further divs consisting of 'keys' and 'values'
        for entry in self._visible_elements(case_data_raw, (By.CLASS_NAME, 'Entry')):
            key_ = self._first_visible(entry, (By.CLASS_NAME, 'Key')).text
            value_ = self._first_visible(entry, (By.CLASS_NAME, 'Value')).text
            case_data_dict[key_.upper().replace(" ", "_")] = value_

        # Extract location information
        location_banner = self._first_visible(self.driver, (By.CLASS_NAME, 'CaseMap'))

        # Optional fields, looked up in the same order as they are stored.
        # The last one attempts to find who found them.
        optional_fields = (
            ('LOCATION_ROAD', (By.CLASS_NAME, 'Road')),
            ('LOCATION_COUNTY', (By.CLASS_NAME, 'County')),
            ('LOCATION_COUNTRY', (By.CLASS_NAME, 'Country')),
            ('FINDING_PARTY', (By.TAG_NAME, 'strong')),
        )
        for field_name, locator in optional_fields:
            case_data_dict[field_name] = self._optional_text(location_banner, locator)

        return case_data_dict
    


class EngineerRawData(ABC):

    """
    This class houses several methods to clean the extracted raw data.

    Every method is a static, side-effect-free helper that takes the raw
    string of one field and returns a cleaned value.
    """

    @staticmethod
    def clean_case_number(case_number_str: str) -> str:
        """
        Cleans the case number to return a serialised code.
        Args:
            case_number_str: string of the case number scraped from site
        Returns:
            Formatted case number string (only digits and punctuation are kept)
        """
        allowed_chars = string.digits + string.punctuation
        return ''.join(char for char in case_number_str if char in allowed_chars)

    @staticmethod
    def split_age_range(age_range_str: str) -> Dict[str, Optional[int]]:
        """
        Splits the age range into a minimum and maximum integer value.
        Args:
            age_range_str: String of the age range (for example '18 - 30')
        Returns:
            Dictionary of the min and maximum age values in the form of {'MIN_AGE': x, 'MAX_AGE': y}.
            Both values are None when the string contains no number.
        """
        # Compare as integers: comparing the digit strings would order '9' above '10'.
        ages = [int(number) for number in re.findall(r"\d+", age_range_str)]
        if not ages:
            return {"MIN_AGE": None, "MAX_AGE": None}
        return {"MIN_AGE": min(ages), "MAX_AGE": max(ages)}

    @staticmethod
    def extract_height_cm(height_str: str) -> Optional[int]:
        """
        Returns the height in cm.
        Args:
            height_str: String value of the height (typically containing cm and ft/inches)
        Returns:
            Integer of the height in cm (whole part of the number directly before 'cm'),
            or None when no centimetre value is present
        """
        match = re.search(r"(\d+)(?:\.\d+)?\s*cm", height_str, flags=re.IGNORECASE)
        if match is None:
            return None
        return int(match.group(1))

    @staticmethod
    def format_date_found(date_found: str) -> date:
        """
        Formats a string date into a date object
        Args:
            date_found: string for the date found, typically in the "%d %B %Y" format
        Returns:
            Date object
        Raises:
            ValueError: if the string does not match the expected format
        """
        date_format = "%d %B %Y"
        # Scraped text may carry surrounding whitespace, which strptime rejects.
        return datetime.strptime(date_found.strip(), date_format).date()

    @staticmethod
    def _rough_format_entries(raw_str: str, key_prefix: str, field_names) -> Dict[str, Dict[str, Optional[str]]]:
        """
        Shared splitter for the multi-line, '-'-separated description fields.
        Args:
            raw_str: raw field text, one entry per line
            key_prefix: prefix of the keys of the returned dictionary
            field_names: ordered names the '-'-separated parts are mapped onto
        Returns:
            Dictionary of {f'{key_prefix}_{n}': {field_name: part or None}}, where n is the
            1-based position of the entry among the lines longer than one character
        """
        formatted = {}

        # Split the entry string into a list by the new line character
        entries = [line for line in raw_str.split('\n') if len(line) > 1]

        for n, entry in enumerate(entries, start=1):
            # Attempt to split into the categories
            parts = [part.strip().upper() for part in entry.split('-') if len(part) > 1]

            # Free-text lines that do not follow the structure are skipped;
            # their position is still counted, so key numbers can have gaps.
            if len(parts) < 2:
                continue

            # Missing trailing parts stay None; parts beyond the field list are dropped.
            fields = dict.fromkeys(field_names)
            fields.update(zip(field_names, parts))

            formatted[f"{key_prefix}_{n}"] = fields

        return formatted

    @staticmethod
    def rough_format_distinguishing_features(dist_features_str: str) -> Dict[str, Dict[str, Optional[str]]]:
        """
        One of the more complex engineering tasks and requires a closer analysis.
        We separate this field into a list where possible.

        Typically, for distinguishing features these come as the following (split by a '-'):

            - CATEGORY: Represents the general category of the entry.
            - SUBCATEGORY: Provides additional details about the entry if available.
            - LOCATION: Indicates the specific body part or location mentioned in the entry.
            - DESCRIPTION: Describes any additional details or information related to the entry.

        Although this is not always the case, often the 'Description' field is missing.
        In some cases this structure is not adhered to whatsoever and it is just a free text input.

        Args:
            dist_features_str: raw distinguishing features text, one entry per line
        Returns:
            Dictionary of {'FEATURE_n': {'CATEGORY': ..., 'SUBCATEGORY': ..., 'LOCATION': ..., 'DESCRIPTION': ...}}
            with upper-cased values; lines with fewer than two parts are left out
        """
        return EngineerRawData._rough_format_entries(
            dist_features_str,
            "FEATURE",
            ("CATEGORY", "SUBCATEGORY", "LOCATION", "DESCRIPTION"),
        )

    @staticmethod
    def rough_format_clothing(clothing_str: str) -> Dict[str, Dict[str, Optional[str]]]:
        """
        One of the more complex engineering tasks and requires a closer analysis.
        We separate this field into a list where possible.

        Typically, for clothing these come as the following (split by a '-'):

            - CATEGORY: Represents the general category type of the entry (footwear etc).
            - SUBCATEGORY: Provides additional details about the entry if available (shoes if CATEGORY is footwear for example).
            - COLOUR: Colour of the CATEGORY/SUBCATEGORY.
            - PATTERN: Describes the pattern of the entry.
            - DESCRIPTION: Describes any additional details or information related to the entry.

        Although this is not always the case, often the 'Description' field is missing.
        In some cases this structure is not adhered to whatsoever and it is just a free text input.

        Args:
            clothing_str: raw clothing text, one entry per line
        Returns:
            Dictionary of {'CLOTHING_n': {'CATEGORY': ..., 'SUBCATEGORY': ..., 'COLOUR': ..., 'PATTERN': ..., 'DESCRIPTION': ...}}
            with upper-cased values; lines with fewer than two parts are left out
        """
        return EngineerRawData._rough_format_entries(
            clothing_str,
            "CLOTHING",
            ("CATEGORY", "SUBCATEGORY", "COLOUR", "PATTERN", "DESCRIPTION"),
        )

    @staticmethod
    def format_possessions(possessions_str: str) -> List[str]:
        """
        Splits the possessions text into a list of lower-cased items.
        Args:
            possessions_str: raw possessions text
        Returns:
            List of stripped, lower-cased items. The text is split on '(1)' when that
            marker occurs in it and on ',' otherwise.
        """
        split_char = ','
        if '(1)' in possessions_str:
            split_char = '(1)'

        return [i.strip().lower() for i in possessions_str.split(split_char) if len(i) > 1]