#### Get the vaccinate.js data from this link: https://drive.google.com/drive/folders/131FLoQgRzRz-jIEkfmHNgrvTjlNMUdty

In [1]:
import json
import re

In [2]:
# Function to clean HTML tags from a string and ensure proper spacing


def cleanHTML(html):
    """Strip HTML markup from ``html`` and normalise the remaining text.

    The steps run in this order: substitute ``<span class=`` fragments,
    turn ``<br>`` tags into newlines, drop every other tag, repair a few
    known mis-decoded character sequences, remove backslashes, collapse
    all whitespace runs (including the newlines produced from ``<br>``)
    to single spaces, and finally restore apostrophes in ``"s`` suffixes.

    Args:
        html: Text that may contain HTML tags and mis-decoded characters.

    Returns:
        The cleaned text on a single line, without leading or trailing
        whitespace.
    """
    # Every literal `<span class=` fragment is replaced by a fixed phrase.
    # NOTE(review): presumably this compensates for field values that were
    # cut off at the first double quote before reaching this function --
    # confirm against the source data.
    html = re.sub(r"<span class=", "school-based vaccination programme", html)

    # Replace <br> or <br/> with a newline so adjacent words stay separated
    html = re.sub(r"<br\s*/?>", "\n", html)

    # Remove remaining HTML tags
    html = re.sub(r"</?[^>]+>", "", html)

    # Replace multiple newlines with a single newline
    html = re.sub(r"\n+", "\n", html)

    # Repair mis-decoded punctuation. The bare "â€" prefix is handled last
    # because it is a prefix of the other sequences.
    html = html.replace("â€™", "'")  # apostrophe
    html = html.replace("â€“", "–")  # en-dash
    html = html.replace("â€œ", '"').replace("â€", '"')  # double quotes

    # Doubly mis-decoded apostrophe; add more sequences here as needed
    html = html.replace("Ã¢â‚¬â„¢", "'")

    # Drop backslashes left over from escaped characters
    html = re.sub(r"\\", "", html)
    # Collapse every whitespace run to one space and trim the ends
    html = re.sub(r"\s+", " ", html).strip()

    # Turn it"s back into it's. Only a quote that sits between a word
    # character and a final "s" is treated as an apostrophe, so an opening
    # quote in front of a word that starts with "s" is left alone.
    html = re.sub(r'(?<=\w)"s\b', "'s", html)

    return html

In [3]:
# Read the JavaScript file content.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes vaccinate.js is stored as UTF-8 -- confirm.
js_file_path = "vaccinate.js"  # Replace with your actual file path
with open(js_file_path, encoding="utf-8") as file:
    js_content = file.read()

In [4]:
# Pull the bodies of the 'ageData' object and the 'adultData' / 'childData'
# arrays out of the JavaScript source. Each pattern captures the text between
# the opening bracket and the first closing bracket followed by a semicolon.
age_data_pattern = re.compile(r"ageData\s*=\s*{(.*?)};", flags=re.DOTALL)
adult_data_pattern = re.compile(r"adultData\s*=\s*\[(.*?)\];", flags=re.DOTALL)
child_data_pattern = re.compile(r"childData\s*=\s*\[(.*?)\];", flags=re.DOTALL)

# A literal that cannot be found yields an empty string instead of an error.
age_data_match = age_data_pattern.search(js_content)
age_data_content = "" if age_data_match is None else age_data_match.group(1)

adult_data_match = adult_data_pattern.search(js_content)
adult_data_content = "" if adult_data_match is None else adult_data_match.group(1)

child_data_match = child_data_pattern.search(js_content)
child_data_content = "" if child_data_match is None else child_data_match.group(1)

In [5]:
# Helper function to map gender abbreviations to full form
def map_gender(gender):
    """Translate a gender code into its display label.

    Args:
        gender: One of the codes "MF", "F" or "M".

    Returns:
        The matching label, or "Unknown" for any other value.
    """
    known_codes = (
        ("MF", "Males and Females"),
        ("F", "Females"),
        ("M", "Males"),
    )
    for code, label in known_codes:
        if gender == code:
            return label
    return "Unknown"

In [6]:
# Maps the age-group codes found in the data to their display labels.
# The regular "<n> months" and "<n> - <n+1> years old" entries are generated;
# the insertion order of the keys is kept exactly as it was listed by hand.
age_data_dict = {
    "birth": "Birth",
    **{f"{months}m": f"{months} months" for months in (2, 4, 6, 12, 15, 18)},
    "59m": "Influenza: 6 months - 4 years old",
    **{f"{age}y": f"{age} - {age + 1} years old" for age in (10, 12, 13)},
    "2y": "Pneumococcal Disease: 2 years old - 17 years old",
    "5y": "Influenza: 5 years old - 17 years old",
    "18y": "18 - 26 years old",
    "27y": "27 - 64 years old",
    "65y": "65 years and above",
}

# Process 'adultData' and 'childData' content


def process_data(data_content):
    """Turn the text of a JavaScript array of objects into vaccine records.

    Every brace-delimited object without nested braces is examined. Single
    quotes in the object are converted to double quotes before the fields
    are searched for. An object is skipped unless ageGroup, diseaseName,
    impact, doses and gender are all found; recommendedGroup is optional.

    Args:
        data_content: Text containing the objects to parse.

    Returns:
        A list of dicts with the keys "ageGroup", "disease", "impact",
        "doses", "recommendedGroup" and "gender". The age-group code is
        translated through ``age_data_dict`` (unknown codes are kept as
        they are), text fields are passed through ``cleanHTML`` and the
        gender code through ``map_gender``.
    """
    records = []

    for raw_object in re.findall(r"\{[^{}]*\}", data_content, re.DOTALL):
        # Work on a copy with uniform quoting; the field patterns below only
        # look for double-quoted values.
        text = raw_object.replace("'", '"')

        age = re.search(r'ageGroup\s*:\s*"([^"]+)"', text)
        disease = re.search(r'diseaseName\s*:\s*"([^"]+)"', text)
        impact = re.search(r'impact\s*:\s*"((?:\\.|[^"\\])*)"', text, re.DOTALL)
        doses = re.search(r'doses\s*:\s*"(.*?)"', text)
        group = re.search(r'recommendedGroup\s*:\s*"(.*?)"', text, re.DOTALL)
        gender = re.search(r'gender\s*:\s*"([^"]+)"', text)

        # Incomplete objects are ignored; recommendedGroup may be absent.
        if not all((age, disease, impact, doses, gender)):
            continue

        if group is None:
            recommended = "Not Specified"
        else:
            recommended = cleanHTML(group.group(1))

        code = age.group(1)
        record = {
            "ageGroup": age_data_dict.get(code, code),
            "disease": cleanHTML(disease.group(1)),
            "impact": cleanHTML(impact.group(1)),
            "doses": cleanHTML(doses.group(1)),
            "recommendedGroup": recommended,
            "gender": map_gender(gender.group(1)),
        }
        records.append(record)

    return records

In [7]:
# Add compulsory vaccination info for certain age groups
def add_compulsory_vaccination(age_data_content):
    """Build compulsory-vaccination notices for age groups named in the text.

    Each known age-group label is looked up in ``age_data_content``. A label
    only counts as present when it is not directly preceded by a digit, so
    "2 months" is not found inside "12 months" and "5 months" is not found
    inside "15 months".

    Args:
        age_data_content: Text to search for the age-group labels.

    Returns:
        A list of dicts with the keys "ageGroup" and "info", one per label
        found, in the order of the table below.
    """
    info = "Measles and diphtheria vaccinations are compulsory by law."

    # "2 months" is deliberately listed last: it has always been appended
    # after the other age groups, and that output order is preserved.
    compulsory_ages = (
        "3 months",
        "4 months",
        "5 months",
        "12 months",
        "15 months",
        "18 months",
        "10 - 11 years old",
        "2 months",
    )

    return [
        {"ageGroup": age_group, "info": info}
        for age_group in compulsory_ages
        # The lookbehind rejects a match that is only the tail of a larger
        # number (e.g. the "2 months" inside "12 months").
        if re.search(rf"(?<!\d){re.escape(age_group)}", age_data_content)
    ]

In [8]:
# Parse the adult and child records and collect the compulsory-vaccination
# notices from the age data
adult_data = process_data(adult_data_content)
child_data = process_data(child_data_content)
compulsory_vaccinations = add_compulsory_vaccination(age_data_content)

# One list holding the adult entries first, followed by the child entries
combined_data = [*adult_data, *child_data]

In [9]:
# Function to map age group to the sorted order based on age_data_dict
def age_sort_key(entry):
    """Return the sort position of ``entry`` based on its age-group label.

    Args:
        entry: A record whose "ageGroup" value is a display label.

    Returns:
        The position of the first ``age_data_dict`` item whose value equals
        the label, or ``float("inf")`` when no value matches, which places
        such entries after all known age groups.
    """
    label = entry["ageGroup"]
    for position, known_label in enumerate(age_data_dict.values()):
        if known_label == label:
            return position
    return float("inf")


# Order a copy of the combined entries by age group alone (gender plays no
# part in the ordering); entries with equal keys keep their relative order
combined_data_sorted = list(combined_data)
combined_data_sorted.sort(key=age_sort_key)

In [10]:
# Format the combined vaccination entries into a single string


def format_data(data):
    """Render vaccine records as labelled plain-text paragraphs.

    Args:
        data: Iterable of records with the keys "ageGroup", "disease",
            "impact", "doses", "recommendedGroup" and "gender".

    Returns:
        One string with six labelled lines per record, each record followed
        by a blank line. An empty input gives an empty string.
    """
    paragraphs = (
        f"Age Group: {record['ageGroup']}\n"
        f"Disease: {record['disease']}\n"
        f"Impact: {record['impact']}\n"
        f"Doses: {record['doses']}\n"
        f"Recommended for: {record['recommendedGroup']}\n"
        f"Gender: {record['gender']}\n"
        "\n"
        for record in data
    )
    return "".join(paragraphs)


# Format the compulsory vaccination notices into a single string


def format_compulsory_vaccinations(data):
    """Render compulsory-vaccination notices as plain text.

    Args:
        data: Iterable of notices with the keys "ageGroup" and "info".

    Returns:
        One string with an "Age Group" line and an "Info" line per notice,
        each notice followed by a blank line. An empty input gives an
        empty string.
    """
    return "".join(
        f"Age Group: {notice['ageGroup']}\nInfo: {notice['info']}\n\n"
        for notice in data
    )


# Introductory paragraph placed ahead of the vaccination entries; it ends with
# a blank line so the first entry starts on its own paragraph
introductory_text = (
    "This article provides detailed information about the vaccinations available to people based on their age and gender. "
    "Vaccines are a crucial part of maintaining health, protecting individuals from vaccine-preventable diseases such as influenza and pneumococcal disease. "
    "The content below outlines recommended vaccines for different age groups and genders, as well as information on compulsory vaccinations required by law. "
    "Stay one step ahead in protecting yourself and your loved ones by staying up-to-date with vaccinations tailored to your needs.\n\n"
)

# Full article body: the introduction followed by the formatted, sorted entries
final_vaccination_content = "".join(
    (introductory_text, format_data(combined_data_sorted))
)

In [11]:
# Single-record payload describing the article; "content" carries the text
# assembled from the vaccination data.
data = [
    {
        "id": "1434610_content_js",
        "title": "STAY ONE STEP AHEAD WITH VACCINATIONS",
        "cover_image_url": "https://ch-api.healthhub.sg/api/public/content/41bb7a0fcb514904a3b0eae643495d45?v=8e3a9756",
        "full_url": "https://www.healthhub.sg/programmes/vaccinate",
        "content_category": "programs",
        "category_description": "Protect yourself and your loved ones from vaccine-preventable diseases like influenza and pneumococcal disease. There is no reason why anyone should suffer from the serious complications vaccine-preventable diseases can cause. Your best defence against such diseases is getting vaccinated.",
        "pr_name": "Health Promotion Board",
        "date_modified": None,
        "content": final_vaccination_content,
    }
]

# Both output files are opened with an explicit UTF-8 encoding so that
# writing non-ASCII characters (for example an en-dash) neither fails nor
# varies with the platform's default locale encoding.

# Write the data to a JSON file
with open("1434610_content_js.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4)

# Write the content to a .txt file
with open("vaccinate.txt", "w", encoding="utf-8") as txt_file:
    txt_file.write(final_vaccination_content)