### Setup

In [96]:
%%capture
%load_ext kedro.ipython

In [97]:
%%capture
import logging
import os
import pickle

import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
from neo4j.exceptions import DriverError, Neo4jError
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

load_dotenv(r"..\conf\local\.env")

In [98]:
# Parameters
MODEL_NAME: str = "all-mpnet-base-v2"
CONTRIBUTOR: str = "Health Promotion Board"

# specify content_category. input 'all' if running across all categories
CONTENT_CATEGORY: str = "live-healthy-articles"

# adjust accordingly
THRESHOLD: float = 0.7

In [99]:
INPUT_GROUNDTRUTH_PATH = os.path.join(
    "..",
    "data",
    "01_raw",
    "Synapxe Content Prioritisation - Live Healthy_020724.xlsx",
)

DATA_FOLDER_PATH = os.path.join(
    "..",
    "data",
    "07_model_output",
    f"{CONTENT_CATEGORY}",
)

INPUT_EMBEDDING_NEO4J_PATH = os.path.join(
    DATA_FOLDER_PATH,
    f"{CONTENT_CATEGORY}_{MODEL_NAME}_embeddings_neo4j.pkl",
)

OUTPUT_PREDICTED_CLUSTER_PATH = os.path.join(
    DATA_FOLDER_PATH,
    f"{CONTENT_CATEGORY}__{MODEL_NAME}_predicted_clusters.csv",
)

OUTPUT_CLUSTER_METRICS_PATH = os.path.join(
    DATA_FOLDER_PATH,
    f"{CONTENT_CATEGORY}_compiled_model_variation_metrics.csv",
)

NEO4J_FOLDER_PATH = os.path.join(
    DATA_FOLDER_PATH,
    "neo4j",
)

if not os.path.exists(NEO4J_FOLDER_PATH):
    os.makedirs(NEO4J_FOLDER_PATH)

## Load files

In [100]:
# Load merged_df data
merged_data_df = catalog.load("merged_data")  # noqa

# load ground truth data
ground_truth = pd.read_excel(INPUT_GROUNDTRUTH_PATH, sheet_name=2)
ground_truth = ground_truth[ground_truth["Owner"].str.contains(CONTRIBUTOR)]
ground_truth = ground_truth[["Page Title", "Combine Group ID", "URL"]]
ground_truth = ground_truth[ground_truth["Combine Group ID"].notna()]

# Extract id from merged_data_df to ground truth
ground_truth = pd.merge(
    ground_truth, merged_data_df, how="inner", left_on="URL", right_on="full_url"
)
ground_truth = ground_truth[["id", "Page Title", "URL", "Combine Group ID"]]
ground_truth.rename(columns={"Combine Group ID": "ground_truth_cluster"}, inplace=True)

print(ground_truth.shape)
ground_truth.head(2)

(184, 4)


Unnamed: 0,id,Page Title,URL,ground_truth_cluster
0,1442828,Getting ready for solids,https://www.healthhub.sg/live-healthy/baby-getting-ready-for-solids,1.0
1,1445136,Getting Your Baby Started on Solids,https://www.healthhub.sg/live-healthy/getting-baby-started-on-solids,1.0


In [101]:
# load embeddings file
with open(INPUT_EMBEDDING_NEO4J_PATH, "rb") as f:
    articles = pickle.load(f)

# merge with ground truth
articles_df = pd.merge(
    articles,
    ground_truth,
    how="inner",
    left_on="id",
    right_on="id",
)

vector_columns = [col for col in articles_df.columns if "vector" in col]
for col in vector_columns:
    articles_df[col] = articles_df[col].apply(lambda x: x.tolist())

print(articles_df.shape)
articles_df.head(2)

(176, 12)


Unnamed: 0,id,title,full_url,content,meta_description,vector_title,vector_article_category_names,vector_category_description,vector_extracted_content_body,Page Title,URL,ground_truth_cluster
0,1443987,All You Need to Know About Childhood Immunisations,https://www.healthhub.sg/live-healthy/all-you-need-to-know-about-vaccinations,"Every child in Singapore is vaccinated according to our National Childhood Immunisation Schedule.\nImmunisation is an important way to protect your child from certain diseases that can be avoided. The basis of vaccination is to make us immune to a disease before it has the chance to make us sick. Below are some answers to commonly asked questions about vaccines given to infants and toddlers.\nRelated: Information Is the Best Defence\n\nHow Do Vaccines Work?\nVaccines are made from the same viruses (or parts of them) that cause disease. However, in vaccines, the viruses are altered in such a way that they cannot cause illnesses. The vaccines are introduced into our bodies, usually via injection. Our immune system then reacts by making antibodies that give us immunity. While some vaccines give us immunity for life, others may require repeated booster shots to maintain immunity.\nRelated: How Healthy Are You?\n\nWhen Is It Not Suitable for My Baby to Be Vaccinated?\nMost babies are fit for immunisation, but if your baby is currently having a high fever or has had a previous serious reaction to certain vaccines, you may need to consult a doctor for further advice. Avoid live vaccines like MMR (measles, mumps and rubella) or polio if your childs immune system is weak as a result of illness or medical treatment.\n\nAre There Any Side Effects of Vaccines that I Should Be Aware Of?\nIts normal for your baby to cry a little after the injection, but this should not last longer than a few seconds. Your baby might feel a little sore, and there may be a slight redness and even a small lump over the injection site, which should all clear within a few days. Your baby might also have a slight fever for a day or two after the injection. Some parents are concerned about the link between the MMR vaccines and autism, however, there is no scientific evidence to support this belief. Do consult a doctor if you have any concerns.\nRelated: Your Baby Needs Soft Skills Too\n\nHow Can I Pay for These Vaccinations?\nFull subsidies* for vaccinations under the National Childhood Immunisation Schedule (NCIS) are available for Singaporean children at CHAS GP clinics and polyclinics.\n*Applicable for certain vaccine brands. Please refer to https://go.gov.sg/vaccine-list for the latest subsidised vaccine brands.\nRelated: Safe Home for Your Baby\n\nWhen Should My Baby Get Vaccinated?\nThe latest Child Vaccination Chart is available from the National Immunisation Registry. The Registry collects and maintains immunisation records of children and adults residing in Singapore. To view your child's immunisation records, log into HealthHub with your Singpass.\nThe immunisation programme protects children from 12 diseases such as tuberculosis (TB), Hepatitis B and Diphtheria. Learn all about these vaccines in 12 Essential Childhood Vaccinations.\nToday, vaccines are combined into a single injection. These 5-in-1 or 6-in-1 vaccines means fewer injections for children. To learn more about these, check out Immunisations for Your Baby.\nRead these next:\n- 12 Essential Childhood Vaccinations in Singapore\n- Important Nutrients: What Should You Eat More Of?\n- Give Your Child the Best Protection\n- Pre-pregnancy Vaccination\n- HealthHub.sg/vaccinate","To prevent diseases such as measles and mumps, babies are vaccinated according to the Childhood Immunisation Schedule from birth to 12-years-old.","[0.0075735547579824924, -0.047238513827323914, 0.016200540587306023, -0.004477768670767546, 0.044157832860946655, 0.002363123232498765, 0.02642083913087845, -0.03475649282336235, 0.021395983174443245, -0.012058322317898273, 0.009130816906690598, -0.018503854051232338, 0.05394560843706131, -0.02998342365026474, -0.032108522951602936, -0.0641198679804802, 0.01195488777011633, 0.013479501008987427, -0.04405410587787628, 0.02967112883925438, -0.017375821247696877, 0.021782878786325455, -0.025743769481778145, 0.025588199496269226, 0.027111580595374107, -0.02818635106086731, -0.0068747675977647305, 0.02173606865108013, 0.007656705565750599, -0.09411386400461197, 0.03204352781176567, -0.06299707293510437, -0.0040323007851839066, 0.026053383946418762, 1.6305181134157465e-06, 0.05474470555782318, 0.03761815279722214, -0.002179468981921673, 0.003135462524369359, -0.02245425432920456, 0.03758154436945915, -0.09797880053520203, 0.04811212047934532, -0.008881901390850544, 0.021311627700924873, -0.04419636353850365, 0.057162825018167496, 0.046982597559690475, 0.03732696920633316, -0.009155387058854103, 0.007170769385993481, 0.007974956184625626, 0.01987440511584282, 0.01137486007064581, 0.024339819326996803, -0.00857170857489109, -0.05596456304192543, 0.0397268570959568, 0.0252767875790596, -0.0006126003572717309, -0.03752797842025757, 0.006598940584808588, -0.026400623843073845, -0.07189838588237762, -0.0029449264984577894, 0.007910901680588722, 0.007659925147891045, -0.03261173516511917, -0.029812373220920563, 0.02931625209748745, -0.001121477340348065, -0.03198195621371269, 0.014721476472914219, 0.014762398786842823, -0.035157524049282074, -0.059335459023714066, 0.009203102439641953, -0.026125593110919, -0.015036684460937977, 0.010546933859586716, 0.03940609097480774, 0.012338291853666306, 0.00840751826763153, -0.007478365208953619, 0.020722730085253716, -0.014378671534359455, 0.009743528440594673, -0.010710615664720535, -0.05516098439693451, 0.02110018953680992, 0.05852193012833595, -0.03309554234147072, -0.01248470600694418, 0.057382192462682724, 0.0548514798283577, -0.02288966067135334, -0.0567592978477478, -0.0508909747004509, 0.020107805728912354, -0.015013305470347404, ...]","[0.06393319368362427, 0.023783838376402855, -0.012843099422752857, -0.02903273142874241, 0.047719500958919525, 0.03083522990345955, 0.04294506832957268, 0.045339737087488174, 0.019912876188755035, -0.006252221297472715, 0.07596531510353088, -0.01695534586906433, 0.03737439587712288, -0.0029328204691410065, -0.0033459439873695374, -0.035660285502672195, -0.022481147199869156, 0.05166501924395561, -0.007981554605066776, -0.013977766036987305, -0.04154433682560921, -0.006869387347251177, 0.007204792927950621, 0.02980751171708107, 0.008504471741616726, -0.04074965789914131, 0.04870033636689186, 0.001484990818426013, -0.0001798964658519253, -0.02906429022550583, 0.027214540168642998, -0.02647491917014122, -0.008955101482570171, -0.05178172513842583, 1.4724016637046589e-06, -0.0035780235193669796, -0.020601460710167885, -0.02518056519329548, -0.004840997979044914, 0.01804615929722786, 0.07830017805099487, -0.03408046439290047, 0.009715002030134201, 0.022802766412496567, -0.007093260530382395, -0.010881332680583, 0.04593123868107796, -0.007129086647182703, 0.0004513902822509408, 0.05479546636343002, -0.018989210948348045, 0.03879251703619957, -0.07968835532665253, 0.02442071959376335, 0.014148127287626266, -0.0078002228401601315, -0.005411621183156967, 0.002267872216179967, 0.050241466611623764, 0.0016266796737909317, -0.006876820232719183, -0.06711051613092422, -0.0469319149851799, -0.011363919824361801, 0.0412093847990036, 0.05925919488072395, 0.029094303026795387, -0.023355603218078613, 0.021213499829173088, -0.015037218108773232, -0.013236815109848976, 0.0420265793800354, 0.004154413938522339, 0.07778052985668182, -0.020887289196252823, -0.032809916883707047, -0.0477638803422451, -0.029639342799782753, -0.02181309647858143, 0.010083700530230999, -0.042869824916124344, 0.020483503118157387, 0.014315322041511536, 0.07441291213035583, 0.05296129733324051, 0.0024423946160823107, 0.003192251082509756, 0.019736934453248978, 0.0031552717555314302, -0.016156483441591263, 0.005872221197932959, -0.017335209995508194, -0.011460937559604645, 0.03957859426736832, 0.005881397519260645, -0.06047792732715607, 0.01496053021401167, -0.06195633113384247, 0.05851244181394577, 0.031027326360344887, ...]","[-0.018412871286273003, 0.008014391176402569, -0.0019386888016015291, -0.021190404891967773, 0.0017024849075824022, 0.050335656851530075, 0.019683483988046646, -0.025592198595404625, 0.012860356830060482, -0.05409735441207886, 0.06702644377946854, -0.016347361728549004, 0.04720110073685646, -0.03940638527274132, -0.07666042447090149, -0.07962194830179214, 0.014186290092766285, -0.029009444639086723, -0.03519979491829872, 0.02453727275133133, -0.03465275466442108, 0.05382109060883522, -0.045855358242988586, -0.022116661071777344, 0.0018598634051159024, 0.00608826195821166, -0.007958528585731983, -0.056997790932655334, 0.019052624702453613, -0.07768776267766953, 0.005090837832540274, -0.05530548840761185, 0.017457837238907814, -0.001463556196540594, 1.5174941836448852e-06, 0.024940352886915207, -0.0014664015034213662, 0.031272392719984055, 0.00020257674623280764, -0.04365257918834686, 0.016970086842775345, -0.06850635260343552, 0.029991943389177322, 0.014090163633227348, -0.015694985166192055, -0.02190464921295643, 0.04845279082655907, -0.01843254826962948, -0.0001620669791009277, 0.009533051401376724, 0.004301441367715597, 0.0033335420303046703, -0.015980428084731102, 0.02877289429306984, -0.03177167475223541, -0.06586357951164246, -0.041375573724508286, 0.039618100970983505, 0.02776084654033184, 0.0007588976295664907, -0.01276079285889864, 0.023394770920276642, -0.013625890016555786, -0.044897302985191345, -0.008736077696084976, -0.009780428372323513, 0.04383135959506035, -0.02170044742524624, -0.0234246626496315, 0.014112498611211777, 0.06201758608222008, -0.05284354090690613, 0.03560234233736992, -0.04149911180138588, -0.008631990291178226, -0.031181585043668747, -0.01290848571807146, 0.009514745324850082, -0.0033218618482351303, 0.028813663870096207, 0.07190336287021637, 0.04517741873860359, 0.04499831423163414, -0.0049418495036661625, 0.011638488620519638, 0.028892509639263153, 0.009047910571098328, -0.026692122220993042, -0.0341208279132843, 0.042151808738708496, 0.059169892221689224, 0.00357029284350574, -0.018880551680922508, 0.05037723481655121, 0.031631819903850555, 0.02681840769946575, -0.014027800410985947, -0.0740213617682457, 0.0041786436922848225, -0.032475005835294724, ...]","[0.02329820767045021, -0.026086032390594482, -0.006225403863936663, -0.04940876364707947, 0.05631823465228081, 0.04810352623462677, 0.026035819202661514, -0.0345376655459404, 0.0026250940281897783, 0.0019966112449765205, -0.05773895978927612, -0.03489990159869194, 0.032686278223991394, -0.015361296012997627, -0.04034901782870293, -0.04409102350473404, 0.03318726271390915, -0.020778276026248932, 0.015475211665034294, 0.015126626938581467, -0.005448642652481794, 0.02638457715511322, -0.002343709347769618, 0.03068138286471367, 0.009759734384715557, -0.015355472452938557, -0.040247850120067596, 0.03702850639820099, 0.008492395281791687, -0.12230827659368515, 0.026938442140817642, -0.018263984471559525, 0.03294067457318306, 0.012583766132593155, 2.15061891140067e-06, 0.04072536528110504, 0.013860374689102173, 0.020798491314053535, -0.0255030058324337, -0.01883597858250141, 0.028663866221904755, -0.06834675371646881, 0.033045727759599686, 0.02272946946322918, 0.02226768247783184, -0.07880565524101257, -0.015284501016139984, 0.024495389312505722, 0.00020313914865255356, -0.016202175989747047, -0.009930466301739216, -0.00285458005964756, -0.006979187019169331, 0.029186677187681198, -0.00023618247359991074, -0.013082267716526985, -0.02346470206975937, 0.018408508971333504, 0.04970741271972656, 0.018635213375091553, -0.04705410450696945, 0.014842972159385681, -0.0253843255341053, -0.033858057111501694, -0.00726576242595911, -0.026309890672564507, 0.0441305972635746, -0.015855878591537476, -0.01988207921385765, 0.019603848457336426, 0.060269325971603394, -0.030263738706707954, -0.004066340625286102, -0.01650175452232361, -0.02005591429769993, 0.0007007024250924587, 0.013365239836275578, 0.0032139639370143414, -0.016453947871923447, 0.00023886514827609062, 0.05722475051879883, 0.011765771545469761, -0.005938856862485409, -0.001542948535643518, 0.0327400267124176, -0.015105608850717545, 0.014621242880821228, -0.030876489356160164, -0.05647118389606476, -0.002320912666618824, 0.0722600668668747, -0.02748962864279747, -0.01688466966152191, 0.03361625596880913, -0.011912941001355648, 0.04332344979047775, -0.03954968601465225, -0.029836244881153107, -0.01776787079870701, -0.05184042826294899, ...]",All You Need to Know About Childhood Immunisations,https://www.healthhub.sg/live-healthy/all-you-need-to-know-about-vaccinations,16.0
1,1442828,Getting ready for solids,https://www.healthhub.sg/live-healthy/baby-getting-ready-for-solids,"Weaning Tips\nThe process of switching an infant from a milk-only diet to a mixed one that includes other solid food is called complementary feeding or weaning. Parents are recommended to introduce a good balance of solid food to their babies by 6 months of age.\n\nStarting on solids\nFrom 6 months of age, your baby is just about to learn how to swallow food. While milk should still be his staple, you can start by giving your child 3-5 baby spoonfuls of a single ingredient food.\nMost parents begin weaning their babies with iron-fortified rice cereals. These cereals are fortified with iron to help meet the babys increased need for dietary iron at this time.\nVegetables and fruit can also be included to provide vitamin C which enhances iron absorption.\nYou can also give him porridge blended with mashed or pureed vegetables like pumpkin, sweet potato and carrot. Introduce other cereals like wheat and mixed cereals when he is a little older. If your baby is eating well, gradually increase it to a meal. To see if your baby is eating well, look at his bowel movement, his weight and his height. By about 6 - 7 months, you can slowly introduce some protein food.\nIf your baby has a strong family history of allergy or has a personal history of other allergic problems, food allergy is more likely to occur. If you are concerned, consult your doctor.\nThe form and texture of each food should also vary with the age of your baby. Do not add sugar, salt and seasonings into the food. Salt cannot be added to baby's food till after 12 months as the kidneys may not be able to excrete the high salt load. Natural spices can be used in cooking to expose your child to a wide variety of tastes and flavours.\n\nVary the form and texture of food with your baby's age\n\nWhat and how much to feed\nStart your baby on solid food gradually. Use My Healthy Plate as a guide. There is no particular order for food introduction. However, most parents begin weaning their babies with plain iron-fortified rice cereal.\nIntroduce one new food every 3-4 days with the aim of giving your baby food from all the basic food groups eventually.\nThe table here shows the recommended number of servings per day from each food group for infants aged 6-12 months.\nStart with giving only 12 teaspoon of solids at first. Slowly increase the amount to 1-2 tablespoons of solids, 2-3 times a day. Prepare your child's food with no added salt or sugar. Oil may be recommended occasionally to ensure that the food has sufficient calorie density. Once your baby starts on solids, he may also need some extra fluids such as water.\n\nKnow the serving sizes\nThe table below will help you get familiar with serving sizes of the various food groups. This will help you in meal planning for your little one.\nNote:\n*All weights listed are for edible portions only.\n**Rice bowl ***250ml + 10-inch plate\n\nSample Daily Menus for your child\nThe sample daily menus below will give you an idea of the food you can prepare for your little one.\nWhen solids are first introduced, parents can feed your child just once a day. When eating of solids is more established, then work towards two meals per day, then three meals per day. Infants at 6-9 months may be taking only two meals of solids per day, rather than three meals.\n\n6 - 9 months old: smooth and lumpy food\n\n10 - 12 months old: mashed, chopped and cut food\n\nHow to introduce solids\n\nEstablishing a routine whilst allowing your baby to enjoy his food\nYour baby may be ready for solids but he may not want to eat as he is not used to it yet. You need to establish a routine for eating. Once he is used to it, the process becomes easier and enjoyable for your baby.\nHere are some tips on how to establish a routine:\n- Set a time for breakfast, lunch, dinner and snacks.\n- Seat your baby in the same place at mealtimes, preferably at the dining table. Put him in a high chair. Remember to pull the straps on the high chair firmly so that he cannot climb his way out.\n- Minimise distractions. Keep toys away and do not switch on the television.\n\nFeeding baby\n- Start your baby on a single-ingredient food. Give him only half a spoonful of the food. Bring the spoon towards his mouth and if he opens his mouth, place the food gently at the back of the tongue. Remove the spoon and see how he learns to swallow.\n- Give your baby sufficient time to finish his food. Do not force him if he is not hungry or is not interested. If he rejects the food, give him milk and try weaning again during the next meal. It may take your baby up to 8-10 times before he accepts a new food. Many babies have a tongue-thrusting reflex when trying new solids, but this does not mean that he does not like the food.\n- Use the above table on Recommended number of servings per day for infants 6-12 months to transit your baby towards eating regular meals and snacks from all the food groups in age-appropriate portions.\n- Teach him how to feed himself. Allow him to pick up food with his fingers or a spoon and to put it in his mouth. This will help develop his motor skills. Teach him to drink from a sipper cup. Do not be fussy about neatness during mealtimes.\n- Introduce one new food every 3-4 days.\n\nPrevent choking\n- Never leave your baby alone when he is eating. Always watch him to make sure he does not choke.\n- Ensure he sits upright and is not slouched over while eating.\n- Ensure that the food is properly pureed, mashed or scraped so that it is easy for your baby to swallow without choking. As he gets older, food should still be soft, but chunkier and textured to help him learn how to chew properly.\n\nAvoid:\n- Hard food such as nuts, raw carrots, apples. These should be grated or cooked until soft.\n- Food with small bones. Fish bones should be removed.\n- Small, round food such as grapes and berries. These should be cut into bite-size portions and served.\n\nFood preparation\n- Food hygiene in food preparation is very important.\n- Wash your hands before and after preparing your babys food.\n- Have a designated space in the kitchen for food preparation. Clean surfaces before and after preparing food.\n- Do not use cracked or chipped utensils (these have a higher chance of harbouring germs).\n- Do not mix raw food, especially meats, with cooked food. Meats, if used, must be cooked thoroughly.\n- If you re-heat food, make sure you bring it to full boil for a few minutes.\n- Discard unfinished food from your babys bowl.\n- Keep food storage areas pest-free.\n- Cover rubbish bins properly and empty them regularly.\nRead these next:\n- Baby's First Food Journey\n- Early Childhood Nutrition: Food Guide for Your One-Year- Old\n- Getting Your Baby Started on Solids",You have breastfed your baby for 6 months and now you want to start him on solids. Here is how you can do it.,"[0.035273078829050064, 0.03444071486592293, -0.004107040353119373, -0.05241933465003967, 0.02514740265905857, -0.0028136023320257664, -0.010584317147731781, 0.028238825500011444, -0.03407566249370575, 0.026174182072281837, 0.04946858063340187, -0.05051907151937485, 0.007633817382156849, 0.11561308801174164, 0.006623174529522657, -0.0037704859860241413, -0.0006488657090812922, 0.024336500093340874, 0.0044262344017624855, -0.018856868147850037, -0.017013484612107277, 0.0010487297549843788, -0.024845818057656288, 0.0003224400570616126, 0.026596704497933388, -0.02471878007054329, 0.0027932063676416874, 0.07262501120567322, -0.011392482556402683, -0.07863984256982803, 0.016799597069621086, 0.050070442259311676, 0.0359974205493927, -0.03481857851147652, 1.6278379462164594e-06, -0.004355960059911013, -0.01764957420527935, -0.017398282885551453, -0.08284863829612732, 0.05456199869513512, -0.01709236204624176, -0.14359314739704132, -0.005017195828258991, -0.008104631677269936, -0.02367379143834114, -0.05933917686343193, 0.013035982847213745, -0.003073664614930749, -0.021184034645557404, -0.028934556990861893, -0.02354445494711399, -0.009811832569539547, 0.0030338899232447147, 0.013706348836421967, 0.024611692875623703, 0.010562448762357235, -0.027084941044449806, -0.04544908180832863, 0.08159764111042023, 0.03699949011206627, -0.005449062678962946, 0.005677454639226198, -0.0011464629787951708, 0.025662677362561226, 0.08884356915950775, 0.05806223675608635, 0.04017069563269615, -0.010405318811535835, -0.021658720448613167, 0.02405519038438797, 0.02241959050297737, 0.031710073351860046, 0.007028349209576845, 0.030068110674619675, -0.044492099434137344, -0.030844222754240036, -0.01445007137954235, 0.015846239402890205, 0.010597462765872478, 0.010097598657011986, -0.04619058221578598, -0.04900491610169411, -0.015256871469318867, 0.03741253912448883, 0.0713505893945694, 0.03828456252813339, 0.001417880761437118, -0.04529271647334099, 0.05577321723103523, 0.018397873267531395, 0.009656536392867565, -0.061512529850006104, -0.007663557771593332, 0.03466236963868141, -0.013012352399528027, -0.0014413686003535986, 0.015284025110304356, 0.03577457740902901, 0.04600336775183678, -0.03344615548849106, ...]","[-0.0363435372710228, -0.0007167053408920765, -0.03663930669426918, -0.07951509207487106, 0.007119855377823114, 0.030288252979516983, 0.0235297791659832, 0.002700625918805599, 0.02432212233543396, -0.02302597649395466, 0.05539382994174957, -0.06925160437822342, 0.04654306545853615, -0.009864410385489464, -0.016926491633057594, -0.03738277032971382, -0.037570852786302567, 0.012569325976073742, -0.021213676780462265, 0.03974537178874016, -0.06658956408500671, 0.050493840128183365, 0.0012206496903672814, -0.019904186949133873, -0.008700870908796787, -0.019722020253539085, -0.03722471371293068, 0.025624891743063927, -0.003228622954338789, -0.09593093395233154, 0.057093385607004166, 0.03314949572086334, 0.01686735637485981, -0.010279804468154907, 1.3778969787381357e-06, 0.045457519590854645, -0.025443600490689278, 0.008764459751546383, -0.02983570657670498, -0.0007796916179358959, 0.03395501524209976, -0.07586159557104111, -0.03043554536998272, 0.0434628427028656, 0.001695354119874537, -0.04229481890797615, 0.04004506766796112, 0.053254760801792145, -0.04768328368663788, 0.0015650015557184815, 0.02117409184575081, 0.044081881642341614, -0.06694742292165756, 0.009925780817866325, 0.03815320506691933, -0.017937371507287025, -0.0296100452542305, 0.04758657142519951, 0.06105522811412811, -0.03083496168255806, 0.0001118947911891155, 0.03703480586409569, -0.011079752817749977, -0.014277066104114056, -0.038641057908535004, 0.009992770850658417, 0.04579129070043564, -0.10377702862024307, 0.01072043925523758, 0.05960890278220177, -0.025830866768956184, -0.02049741894006729, 0.02580018900334835, 0.07439697533845901, -0.05838361009955406, -0.005112602841109037, -0.022275032475590706, -0.010389362461864948, -0.02819308452308178, -0.005150799173861742, 0.03746248036623001, -0.007973569445312023, 0.017447059974074364, 0.049701761454343796, 0.06622952967882156, 0.014660038985311985, 0.03899793326854706, -0.03898819163441658, -0.01520939264446497, -0.001350674661807716, -0.008518070913851261, -0.00206900667399168, 0.0006015936378389597, 0.008716032840311527, -0.004110440146178007, 0.00601670378819108, -0.003794115036725998, -0.04446490854024887, 0.05383602902293205, -0.054278310388326645, ...]","[0.02195669896900654, 0.04195531830191612, 0.01300596073269844, -0.005547165405005217, 0.010729494504630566, -0.0012921529123559594, 0.008200700394809246, 0.010915378108620644, -0.00417732261121273, -0.04678603261709213, 0.012719279155135155, -0.068492092192173, 0.043439172208309174, 0.04413717985153198, -0.04951870068907738, 0.015977364033460617, 0.016983160749077797, 0.03925861418247223, 0.06364865601062775, -0.008135455660521984, 0.04121529683470726, 0.019279928877949715, 0.004238310735672712, -0.037855248898267746, -0.03157740458846092, 0.031699057668447495, -0.02114497497677803, 0.05202788859605789, -0.04877469688653946, -0.07863588631153107, 0.014786654151976109, 0.053766585886478424, 0.05638228729367256, -0.014240110293030739, 1.6491898122694693e-06, 0.04894480109214783, -0.021957358345389366, -0.01588253676891327, -0.00924544595181942, 0.012658847495913506, 0.016902456060051918, -0.03306760638952255, 0.030965622514486313, -0.03973357006907463, 0.018413208425045013, 0.05232078582048416, 0.008464123122394085, -0.010150451213121414, -2.6483903639018536e-05, -0.009053356014192104, -0.010280244052410126, -0.017664244398474693, -0.037632111459970474, 0.03803662210702896, -0.11121389269828796, 0.013220326974987984, -0.0017261381726711988, -0.01909825950860977, 0.0331607423722744, -0.018623413518071175, -0.06278743594884872, 0.004389330744743347, 0.018110038712620735, 0.06644058227539062, -0.016313087195158005, 0.0189821757376194, 0.03061869740486145, 0.01615777052938938, 0.02336811274290085, 0.017425252124667168, 0.02325556054711342, -0.017207050696015358, 0.018134132027626038, 0.01406487263739109, 0.01216551661491394, 0.08233971893787384, 0.052978210151195526, 0.01857842691242695, -0.031134676188230515, 0.004778330214321613, -0.01680063083767891, -0.08253888040781021, 0.004540837835520506, 0.0023034936748445034, 0.0344860665500164, 0.08654343336820602, 0.012402599677443504, -0.04407773166894913, 0.03181816264986992, 0.009521321393549442, 0.10441353917121887, 0.006465033628046513, -0.013731350190937519, 0.014742671512067318, -0.021778162568807602, 0.04799603298306465, -0.013252614066004753, 0.05812420696020126, 0.013403020799160004, -0.06263893097639084, ...]","[0.007204481866210699, 0.01930098980665207, -0.004990340210497379, -0.02450522780418396, -0.003940222319215536, -0.0005131206708028913, -0.03777999430894852, 0.023904502391815186, 0.02513831853866577, -0.025695204734802246, 0.04924542456865311, -0.0747402086853981, 0.05776812508702278, 0.03258844465017319, -0.06949673593044281, -0.023310698568820953, 0.026492321863770485, 0.038399990648031235, 0.009365088306367397, 0.03152960538864136, -0.021235311403870583, -0.01162682007998228, 0.02007363736629486, -0.008855091407895088, -0.04725335165858269, 0.01461466122418642, 0.0015920639270916581, 0.02320173569023609, -0.0008385960245504975, -0.10070858895778656, 0.026545101776719093, 0.04030848667025566, 0.0072418623603880405, 0.011929834261536598, 1.7502579794381745e-06, 0.03353495895862579, -0.0198894701898098, 0.012080991640686989, -0.04679163545370102, 0.03014647401869297, 0.007752752862870693, -0.09675462543964386, -0.003134415252134204, -0.003548020962625742, 0.027951836585998535, 0.0007206044974736869, 0.032855890691280365, 0.05097696930170059, -0.02745267190039158, -0.01743401773273945, 0.017615411430597305, -0.04101795703172684, -0.00855648797005415, 0.03544793277978897, -0.029188955202698708, 0.03626633435487747, -0.017194898799061775, 0.02570444345474243, 0.04900064319372177, -0.016917506232857704, -0.028049778193235397, 0.006486202124506235, 0.02427399903535843, 0.027132511138916016, -0.008034942671656609, 0.028201261535286903, -0.012508963234722614, -0.039964642375707626, 0.0046938275918364525, 0.04522009938955307, -0.010711025446653366, 0.007064377423375845, 0.0227164626121521, -0.013087153434753418, -0.058742303401231766, -0.002373982220888138, 0.03776001185178757, 0.015214385464787483, -0.03631720691919327, 0.00768375163897872, 0.02429092861711979, -0.0608411505818367, -0.002998577430844307, 0.036174703389406204, 0.08233819901943207, 0.10443001985549927, -0.011346359737217426, -0.03855200856924057, 0.0036716654431074858, -0.010882536880671978, 0.060681361705064774, 0.0003281179815530777, -0.02604178711771965, 0.048176880925893784, 0.007735955063253641, 0.035582996904850006, 0.016860388219356537, -0.00963517464697361, 0.012020505033433437, -0.04127650707960129, ...]",Getting ready for solids,https://www.healthhub.sg/live-healthy/baby-getting-ready-for-solids,1.0


## Connect to neo4j

In [102]:
uri = "neo4j://localhost:7687"
username = os.getenv("neo4j_username")
password = os.getenv("neo4j_password")
# driver = GraphDatabase.driver(uri, auth=(username, password))

NEO4J = {
    "uri": uri,
    "auth": (username, password),
    "database": CONTENT_CATEGORY,  # create this database in neo4j first
}

# Test connection
with GraphDatabase.driver(**NEO4J) as driver:
    try:
        driver.verify_connectivity()
        print("Connection estabilished.")
    except (DriverError, Neo4jError) as exception:
        print(exception)

Connection estabilished.


## Clustering

In [103]:
documents = articles_df.to_dict(orient="records")
documents[0].keys()

[1;35mdict_keys[0m[1m([0m[1m[[0m[32m'id'[0m, [32m'title'[0m, [32m'full_url'[0m, [32m'content'[0m, [32m'meta_description'[0m, [32m'vector_title'[0m, [32m'vector_article_category_names'[0m, [32m'vector_category_description'[0m, [32m'vector_extracted_content_body'[0m, [32m'Page Title'[0m, [32m'URL'[0m, [32m'ground_truth_cluster'[0m[1m][0m[1m)[0m

In [104]:
logging.basicConfig(level=logging.INFO)


def clear_db(tx):
    logging.info("Clearing database")
    tx.run("MATCH (n) DETACH DELETE n")


def create_graph_nodes(tx, doc):
    # logging.info("Create nodes")
    tx.run(
        """
    CREATE (d:Article {
        id: $id,
        title: $title,
        url: $url,
        content: $content,
        meta_desc: $meta_description,
        vector_body: $vector_body,
        vector_title: $vector_title,
        vector_category: $vector_category,
        vector_desc: $vector_desc,
        ground_truth: $ground_truth
    })""",
        id=doc["id"],
        title=doc["title"],
        url=doc["full_url"],
        content=doc["content"],
        meta_description=doc["meta_description"],
        vector_title=doc["vector_title"],
        vector_category=doc["vector_article_category_names"],
        vector_desc=doc["vector_category_description"],
        vector_body=doc["vector_extracted_content_body"],
        ground_truth=doc["ground_truth_cluster"],
    )


def create_sim_edges(tx, threshold):
    logging.info("Create edges")
    tx.run(
        """
    MATCH (a:Article), (b:Article)
    WHERE a.id < b.id
    WITH a, b, gds.similarity.cosine(a.vector_body, b.vector_body) AS similarity
    WHERE similarity > $threshold
    CREATE (a)-[:SIMILAR {similarity: similarity}]->(b)
    """,
        threshold=threshold,
    )


def drop_graph_projection(tx):
    result = tx.run(
        """
    CALL gds.graph.exists('articleGraph')
    YIELD exists
    RETURN exists
    """
    )
    if result.single()["exists"]:
        tx.run("CALL gds.graph.drop('articleGraph')")


def create_graph_proj(tx):
    # logging.info("Create projection")
    tx.run(
        """
           CALL gds.graph.project(
            'articleGraph',
            'Article',
            {
                SIMILAR: {
                    properties: 'similarity'
                }
            }
           )
    """
    )


def detect_community(tx):
    # logging.info("Detect community")
    tx.run(
        """
        CALL gds.louvain.write(
        'articleGraph',
        {
            writeProperty: 'community'
        }
        )
    """
    )


def return_community(tx):
    query = """
        MATCH (a:Article)
        RETURN a.community AS cluster, collect(a.title) AS articles
        ORDER BY cluster
        """
    result = tx.run(query)
    return [record for record in result]


def return_pred_cluster(tx):
    query = """
        MATCH (a:Article)
        RETURN a.id, a.title, a.url, a.community AS cluster
        ORDER BY a.community
        """
    result = tx.run(query)
    return [record for record in result]


def count_articles(tx):
    query = """
        MATCH (a:Article)
        RETURN a.community AS cluster, count(a) AS articleCount
        ORDER BY cluster
        """
    result = tx.run(query)
    return [record for record in result]


def return_by_cluster(tx):
    """Return only clusters with more than one article"""

    query = """
    MATCH (n)
    WITH n.community AS cluster, collect(n.title) AS titles, count(n) AS count
    WHERE count > 1
    RETURN cluster, titles
    ORDER BY cluster
        """
    result = tx.run(query)
    return [record for record in result]

In [105]:
with GraphDatabase.driver(**NEO4J) as driver:
    with driver.session() as session:
        session.execute_write(clear_db)  # Clear the database
        for doc in documents:
            session.execute_write(create_graph_nodes, doc)
        session.execute_write(create_sim_edges, THRESHOLD)
        session.execute_write(drop_graph_projection)
        session.execute_write(create_graph_proj)
        session.execute_write(detect_community)
        records = session.execute_read(return_community)
        pred_cluster = session.execute_read(return_pred_cluster)
        articles_count = session.execute_read(count_articles)
        cluster_articles = session.execute_read(return_by_cluster)

In [106]:
pred_cluster_df = pd.DataFrame(
    pred_cluster, columns=["id", "title", "url", "pred_cluster"]
)
pred_cluster_df.to_csv(OUTPUT_PREDICTED_CLUSTER_PATH)

cluster_article_count = pd.DataFrame(
    articles_count, columns=["pred_cluster_number", "article_count"]
)

In [107]:
results_df = pd.merge(
    articles_df, pred_cluster_df, how="inner", left_on="id", right_on="id"
)

results_df = results_df[
    ["id", "Page Title", "URL", "ground_truth_cluster", "pred_cluster"]
]
results_df["ground_truth_cluster"] = results_df["ground_truth_cluster"].astype(int)

results_df.head(2)

Unnamed: 0,id,Page Title,URL,ground_truth_cluster,pred_cluster
0,1443987,All You Need to Know About Childhood Immunisations,https://www.healthhub.sg/live-healthy/all-you-need-to-know-about-vaccinations,16,124
1,1442828,Getting ready for solids,https://www.healthhub.sg/live-healthy/baby-getting-ready-for-solids,1,67


## Cluster metrics

In [108]:
def get_exact_match(results_df):
    pred_cluster_labels = results_df.groupby("pred_cluster")["id"].apply(set).to_list()
    ground_cluster_labels = (
        results_df.groupby("ground_truth_cluster")["id"].apply(set).to_list()
    )
    complete_match = [s for s in pred_cluster_labels if s in ground_cluster_labels]

    return len(complete_match)


def fill_single(series):
    max_val = series.max()
    fill_in_val = max_val
    filled_series = series.copy()
    for idx in series[series.isna()].index:
        filled_series.at[idx] = fill_in_val + 1
        fill_in_val += 1
    return filled_series.to_list()


def compute_vmeasure(results_df):
    ground_truth_labels = fill_single(results_df["ground_truth_cluster"])
    predicted_labels = fill_single(results_df["pred_cluster"])
    homogeneity = homogeneity_score(ground_truth_labels, predicted_labels)
    completeness = completeness_score(ground_truth_labels, predicted_labels)
    v_measure = v_measure_score(ground_truth_labels, predicted_labels)

    return homogeneity, completeness, v_measure

In [109]:
min_count = cluster_article_count[cluster_article_count["article_count"] > 1][
    "article_count"
].min()
max_count = cluster_article_count["article_count"].max()
num_clusters = (cluster_article_count["article_count"] != 1).sum()
unclustered_count = (cluster_article_count["article_count"] == 1).sum()

exact_match = get_exact_match(results_df)
homogeneity, completeness, v_measure = compute_vmeasure(results_df)

data = pd.DataFrame(
    {
        "Model": [MODEL_NAME],
        "Threshold": [THRESHOLD],
        "Exact cluster match": [exact_match],
        "Homogeneity": [round(homogeneity, 4)],
        "Completeness": [round(completeness, 4)],
        "V-measure": [round(v_measure, 4)],
        "Number of clusters": [num_clusters],
        "Min cluster size": [min_count],
        "Max cluster size": [max_count],
        "Number of articles not clustered": [unclustered_count],
    }
)

In [110]:
if os.path.exists(OUTPUT_CLUSTER_METRICS_PATH):
    metrics_df = pd.read_csv(OUTPUT_CLUSTER_METRICS_PATH, index_col=0)
else:
    metrics_df = pd.DataFrame()

metrics_df = pd.concat([metrics_df, data], axis=0)
metrics_df.to_csv(OUTPUT_CLUSTER_METRICS_PATH)
metrics_df

Unnamed: 0,Model,Threshold,Exact cluster match,Homogeneity,Completeness,V-measure,Number of clusters,Min cluster size,Max cluster size,Number of articles not clustered
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7
0,all-mpnet-base-v2,0.7,13,0.7196,0.9554,0.8209,23,2,30,7


## Cluster Visualisation

In [111]:
query = """
MATCH (n)-[r]->(m)
RETURN n.title AS node_1, m.title AS node_2,
    r.similarity AS edge_weight, 
    n.ground_truth AS node_1_ground_truth, 
    m.ground_truth AS node_2_ground_truth, 
    n.community AS node_1_pred_cluster, 
    m.community AS node_2_pred_cluster,
    n.title AS node_1_title,
    m.title AS node_2_title
"""
# nodes with no relationship
query_2 = """MATCH (n)
WHERE NOT EXISTS ((n)--())
RETURN n.title AS node_title,
    n.ground_truth AS node_ground_truth,
    n.community AS node_community,
    n.meta_desc AS node_meta_desc
"""
with GraphDatabase.driver(**NEO4J) as driver:
    with driver.session() as session:
        results = session.run(query)
        data = pd.DataFrame(results.data())
        results_2 = session.run(query_2)
        data_2 = pd.DataFrame(results_2.data())

data["node_1"] = data["node_1"].astype(str)
data["node_2"] = data["node_2"].astype(str)
# data = data.dropna(subset=['node_2'])

data_2["node_community"] = ""

# save nodes and edges of clustered and unclustered (single nodes) data for visualisation
data.to_csv(os.path.join(NEO4J_FOLDER_PATH, f"{MODEL_NAME}_neo4j_clustered_data.csv"))
data_2.to_csv(
    os.path.join(NEO4J_FOLDER_PATH, f"{MODEL_NAME}_neo4j_unclustered_data.csv")
)

### Export for mongodb

In [112]:
# Only keep edges if 2 nodes are from same cluster
edges_in_same_cluster = data[data["node_1_pred_cluster"] == data["node_2_pred_cluster"]]
edges = edges_in_same_cluster[["node_1", "node_2", "edge_weight"]]
with open(
    os.path.join(NEO4J_FOLDER_PATH, f"{MODEL_NAME}_neo4j_edges.pkl"), "wb"
) as file:
    pickle.dump(edges, file)

In [113]:
pred_cluster_dict = [
    {"cluster": e["cluster"], "titles": e["titles"]} for e in cluster_articles
]
with open(
    os.path.join(NEO4J_FOLDER_PATH, f"{MODEL_NAME}_neo4j_pred_cluster.pkl"), "wb"
) as file:
    pickle.dump(pred_cluster_dict, file)

## End