In [1]:
import os
import pickle

import pandas as pd
import pyvis

In [46]:
root_path = r"C:\Users\Joycelyn\Documents\Synapxe\07 GenAI for healthhub\Github backup\cluster experiments\weighted emb vs sim (658)"
emb_path_th_median = os.path.join(root_path, "nomic_weighted_emb (0.7 body, 0.3 title)")
emb_path_th_09 = os.path.join(
    root_path, "nomic_weighted_emb (0.7 body, 0.3 title) th09"
)


with open(os.path.join(emb_path_th_median, "neo4j_predicted_clusters.pkl"), "rb") as f:
    emb_results_median = pickle.load(f)

with open(os.path.join(emb_path_th_09, "neo4j_predicted_clusters.pkl"), "rb") as f:
    emb_results_th09 = pickle.load(f)

## identify common and unique clusters

In [47]:
def find_common_clusters(output1, output2):
    clusters1 = {tuple(cluster["titles"]): cluster for cluster in output1}
    clusters2 = {tuple(cluster["titles"]): cluster for cluster in output2}

    common_titles = set(clusters1.keys()) & set(clusters2.keys())

    common_clusters = [clusters1[titles] for titles in common_titles]
    df_common_clusters = pd.DataFrame(common_clusters)

    df_common_clusters["num_articles"] = df_common_clusters["titles"].apply(
        lambda x: len(x)
    )

    return df_common_clusters


def find_different_clusters(output1, output2):
    clusters1 = {tuple(cluster["titles"]): cluster for cluster in output1}
    clusters2 = {tuple(cluster["titles"]): cluster for cluster in output2}

    unique_titles1 = set(clusters1.keys()) - set(clusters2.keys())
    unique_titles2 = set(clusters2.keys()) - set(clusters1.keys())

    unique_clusters1 = [clusters1[titles] for titles in unique_titles1]
    unique_clusters2 = [clusters2[titles] for titles in unique_titles2]

    df_unique_clusters1 = pd.DataFrame(unique_clusters1)
    df_unique_clusters2 = pd.DataFrame(unique_clusters2)

    df_unique_clusters1["num_articles"] = df_unique_clusters1["titles"].apply(
        lambda x: len(x)
    )
    df_unique_clusters2["num_articles"] = df_unique_clusters2["titles"].apply(
        lambda x: len(x)
    )

    return df_unique_clusters1, df_unique_clusters2

In [65]:
df_common_clusters = find_common_clusters(emb_results_median, emb_results_th09)
df_unique_median, df_unique_09 = find_different_clusters(
    emb_results_median, emb_results_th09
)

print(
    f"Number of exact same clusters: {len(df_common_clusters)} \
    \nNumber of unique clusters from th=median: {len(df_unique_emb)} \
    \nNumber of unique clusters from th=0.9: {len(df_unique_sim)}"
)

Number of exact same clusters: 10     
Number of unique clusters from th=median: 62     
Number of unique clusters from th=0.9: 51


## Match unique clusters

In [92]:
# match unique clusters
# df_unique_median_1 = df_unique_median.copy()
# df_unique_09_1 = df_unique_09.copy()


# Function to calculate overlap
def calculate_overlap(row1, row2):
    return len(set(row1) & set(row2))


# List to store the matches
matches = []

for i, row1 in df_unique_median.iterrows():
    for j, row2 in df_unique_09.iterrows():
        overlap = calculate_overlap(row1["titles"], row2["titles"])
        matches.append((i, j, overlap))

matches_df = pd.DataFrame(matches, columns=["embeddings_index", "sim_index", "overlap"])

# Identify the pairs with the highest overlap
max_matches_df = matches_df.loc[
    matches_df.groupby("embeddings_index")["overlap"].idxmax()
]
# Sort by sim_index and overlap and keep the sim_index with highest overlap
max_matches_df = max_matches_df[max_matches_df["overlap"] != 0].sort_values(
    ["sim_index", "overlap"], ascending=[True, False]
)
max_matches_df = max_matches_df.drop_duplicates(subset=["sim_index"], keep="first")
max_matches_df

# Prepare data for full outer join
df_unique_median["key"] = df_unique_median.index
df_unique_09["key"] = df_unique_09.index

# Merge the DataFrames using the identified pairs
merged_df_1to1 = (
    pd.merge(
        df_unique_median,
        max_matches_df[["embeddings_index", "sim_index", "overlap"]],
        left_index=True,
        right_on="embeddings_index",
        how="outer",
    )
    .merge(
        df_unique_09,
        left_on="sim_index",
        right_index=True,
        how="outer",
        suffixes=("_1", "_2"),
    )
    .drop(columns=["embeddings_index", "sim_index", "key_1", "key_2"])
)

## match unique clusters (1 to many)

In [90]:
# List to store the matches
matches = []

for i, row1 in df_unique_median.iterrows():
    for j, row2 in df_unique_09.iterrows():
        overlap = calculate_overlap(row1["titles"], row2["titles"])
        matches.append((i, j, overlap))

matches_df = pd.DataFrame(matches, columns=["embeddings_index", "sim_index", "overlap"])
matches_df = matches_df[matches_df["overlap"] != 0].sort_values(
    ["embeddings_index", "overlap"], ascending=[True, False]
)

# Merge the DataFrames using the identified pairs
merged_df_1tomany = (
    pd.merge(
        df_unique_median,
        matches_df[["embeddings_index", "sim_index", "overlap"]],
        left_index=True,
        right_on="embeddings_index",
        how="outer",
    )
    .merge(
        df_unique_09,
        left_on="sim_index",
        right_index=True,
        how="outer",
        suffixes=("_1", "_2"),
    )
    .drop(columns=["embeddings_index", "sim_index", "key_1", "key_2"])
)

In [212]:
grouped_df = (
    merged_df_1tomany.groupby("cluster_1")
    .agg(
        {
            "cluster_2": lambda x: x.count(),
            "num_articles_2": lambda x: x,
        }
    )
    .reset_index()
)

grouped_merge = pd.merge(
    grouped_df,
    merged_df_1tomany[["cluster_1", "cluster_keywords_1", "num_articles_1"]],
    left_on="cluster_1",
    right_on="cluster_1",
    how="left",
)
grouped_merge = grouped_merge.loc[
    :,
    [
        "cluster_1",
        "cluster_keywords_1",
        "num_articles_1",
        "cluster_2",
        "num_articles_2",
    ],
]
grouped_merge = grouped_merge.rename(
    columns={
        "cluster_1": "cluster",
        "cluster_keywords_1": "cluster_keywords",
        "num_articles_1": "num_articles",
        "cluster_2": "th0.9 - num_of_clusters",
        "num_articles_2": "th0.9 - articles size",
    }
)
grouped_merge = grouped_merge.drop_duplicates(subset=["cluster"])
print(grouped_merge.shape)
grouped_merge.head(2)

(62, 5)


Unnamed: 0,cluster,cluster_keywords,num_articles,th0.9 - num_of_clusters,th0.9 - articles size
0,45,"[diabetes, insulin, sugar, glucose, hcs]",23,1,2.0
1,51,"[water, glass, waste, drinking, ice]",3,1,2.0


## Export excel

In [210]:
# export dfs
save_path = os.path.join(root_path, "th09_eval.xlsx")
with pd.ExcelWriter(save_path) as writer:
    df_common_clusters.to_excel(writer, sheet_name="Common Clusters", index=False)
    df_unique_emb.to_excel(writer, sheet_name="th=median unique articles", index=False)
    df_unique_sim.to_excel(writer, sheet_name="th=0.9 sim unique articles", index=False)
    merged_df_1to1.to_excel(writer, sheet_name="match_1to1", index=False)
    merged_df_1tomany.to_excel(writer, sheet_name="match_1tomany", index=False)
    grouped_merge.to_excel(writer, sheet_name="cluster_change", index=False)

## visualise unique cluster

In [7]:
def visualize_result(clustered_df, method: str):
    visual_graph = pyvis.network.Network(select_menu=True, filter_menu=True)

    # Add nodes-nodes pair
    for _, row in clustered_df.iterrows():
        # Add nodes
        visual_graph.add_node(
            row["node_1_title"],
            label=row["node_1_title"],
            title=f"\nPredicted: {row['node_1_pred_cluster']}\nTitle: {row['node_1_title']}\nKeywords: {row['node_1_cluster_kws']}",
            group=row["node_1_cluster_kws"],
            cluster_num=row["node_1_pred_cluster"],
        )
        visual_graph.add_node(
            row["node_2_title"],
            label=row["node_2_title"],
            title=f"\nPredicted: {row['node_2_pred_cluster']}\nTitle: {row['node_2_title']}\nKeywords: {row['node_2_cluster_kws']}",
            group=row["node_2_cluster_kws"],
            cluster_num=row["node_2_pred_cluster"],
        )

        # Add edge
        visual_graph.add_edge(
            row["node_1_title"],
            row["node_2_title"],
            title=f"Edge Weight: {row['edge_weight']}",
        )

    visual_graph.show(f"neo4j_{method}.html", notebook=False)


def get_unique_inter_grp(df):
    df = df[df["node_1_pred_cluster"] != df["node_2_pred_cluster"]]
    unique_pairs = df[["node_1_pred_cluster", "node_2_pred_cluster"]].drop_duplicates()
    num_rows_with_unique_pairs = unique_pairs.shape[0]
    return num_rows_with_unique_pairs

In [43]:
clustered_nodes_emb = pd.read_csv(os.path.join(emb_path, "neo_4j_clustered_data.csv"))
clustered_nodes_sim = pd.read_csv(os.path.join(sim_path, "neo_4j_clustered_data.csv"))

In [44]:
print("Number of intergroup connections")
print(f"EMB method: {len(clustered_nodes_emb[clustered_nodes_emb["node_1_pred_cluster"] != clustered_nodes_emb["node_2_pred_cluster"]])}")
print(f"SIM method: {len(clustered_nodes_sim[clustered_nodes_sim["node_1_pred_cluster"] != clustered_nodes_sim["node_2_pred_cluster"]])}")

print("
Number of unique interconnected groups")
print(f"EMB method: {get_unique_inter_grp(clustered_nodes_emb)}")
print(f"SIM method: {get_unique_inter_grp(clustered_nodes_sim)}")

Number of intergroup connections
EMB method: 15
SIM method: 35

Number of unique interconnected groups
EMB method: 7
SIM method: 15


In [10]:
# visualisation for unique emb clusters
unique_emb_list = list(df_unique_emb["cluster"])
clustered_nodes_emb_filtered = clustered_nodes_emb[
    clustered_nodes_emb["node_1_pred_cluster"].isin(unique_emb_list)
    & clustered_nodes_emb["node_2_pred_cluster"].isin(unique_emb_list)
]
visualize_result(clustered_nodes_emb_filtered, "emb")

neo4j_emb.html


In [11]:
# visualisation for unique sim clusters
unique_sim_list = list(df_unique_sim["cluster"])
clustered_nodes_sim_filtered = clustered_nodes_sim[
    clustered_nodes_sim["node_1_pred_cluster"].isin(unique_sim_list)
    & clustered_nodes_sim["node_2_pred_cluster"].isin(unique_sim_list)
]
visualize_result(clustered_nodes_sim_filtered, "sim")

neo4j_sim.html


## Comparing unique clusters

In [22]:
emb_cluster_num = 532
sim_cluster_num = 592

emb_titles_list = df_unique_emb[df_unique_emb["cluster"] == emb_cluster_num][
    "titles"
].iloc[0]
sim_titles_list = df_unique_sim[df_unique_sim["cluster"] == sim_cluster_num][
    "titles"
].iloc[0]

emb_set = set(emb_titles_list)
sim_set = set(sim_titles_list)
common_articles = emb_set.intersection(sim_set)

print(
    f"Emb cluster size: {len(emb_titles_list)} \
    \nSim cluster size: {len(sim_titles_list)} \
    \nNumber of overlap articles: {len(common_articles)}"
)

print(
    f"\nUnique articles in EMB cluster: \
    \n{list(emb_set - sim_set)} \
    \n\nUnique articles in SIM cluster: \
    \n{list(sim_set - emb_set)}"
)

Emb cluster size: 41     
Sim cluster size: 36     
Number of overlap articles: 31

Unique articles in EMB cluster:     
['Know Your Alcohol Limit: Don’t Be a Party Pooper!', 'Drinking Myths Busted!', 'Responsible Drinking: Know Your Alcohol Limit', 'Staying Sober and Within the Alcohol Limit', 'Ditch Both that Cigarette and Drink!', 'Alcohol — More than Meets the Eye', "Life is Better When You're Sober", 'Drinking (or not) to a Healthy Chinese New Year', 'Why is Binge Drinking Bad for You?', 'Alcohol and Health—Set Your Drinking Limits']     

Unique articles in SIM cluster:     
['Are e-cigarettes harmful?', 'Effects of Secondhand Smoke on Your Child’s Health', '"Vaping is not smoking", and Other Tobacco Myths', 'Environmental Tobacco Smoke', 'Smoke-free Environment for a Healthier Family']


In [23]:
# For unique articles in Weighted SIM cluster
article_titles = []
clusters = []

list_2 = list(sim_set - emb_set)
for item in list_2:
    # print(item)
    filtered_df = df_unique_emb_1[df_unique_emb_1["titles"].apply(lambda x: item in x)]
    if filtered_df.empty:
        cluster = "single node"
    else:
        cluster = filtered_df["cluster_keywords"].iloc[0]

    article_titles.append(item)
    clusters.append(cluster)

data = {"article_title": article_titles, "cluster_keywords": clusters}
# sorted_df  = df.sort_values(by="cluster_keywords")

df = pd.DataFrame(data)
df["cluster_keywords"] = df["cluster_keywords"].apply(lambda x: ", ".join(map(str, x)))

grouped_df = df.groupby("cluster_keywords").apply(lambda x: x)
grouped_df

  grouped_df = df.groupby('cluster_keywords').apply(lambda x: x)


Unnamed: 0_level_0,Unnamed: 1_level_0,article_title,cluster_keywords
cluster_keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"cigarettes, tobacco, shisha, cigarette, vapourisers",0,Are e-cigarettes harmful?,"cigarettes, tobacco, shisha, cigarette, vapour..."
"cigarettes, tobacco, shisha, cigarette, vapourisers",2,"""Vaping is not smoking"", and Other Tobacco Myths","cigarettes, tobacco, shisha, cigarette, vapour..."
"smoke, ets, second, cigarette, third",1,Effects of Secondhand Smoke on Your Child’s He...,"smoke, ets, second, cigarette, third"
"smoke, ets, second, cigarette, third",3,Environmental Tobacco Smoke,"smoke, ets, second, cigarette, third"
"smoke, ets, second, cigarette, third",4,Smoke-free Environment for a Healthier Family,"smoke, ets, second, cigarette, third"


In [28]:
# For unique articles in Weighted EMD cluster
article_titles = []
clusters = []

list_1 = list(emb_set - sim_set)
for item in list_1:
    # print(item)
    filtered_df = df_unique_sim_1[df_unique_sim_1["titles"].apply(lambda x: item in x)]
    if filtered_df.empty:
        cluster = "single node"
    else:
        cluster = filtered_df["cluster_keywords"].iloc[0]

    article_titles.append(item)
    clusters.append(cluster)

data = {"article_title": article_titles, "cluster_keywords": clusters}
# sorted_df  = df.sort_values(by="cluster_keywords")

df = pd.DataFrame(data)
df["cluster_keywords"] = df["cluster_keywords"].apply(lambda x: ", ".join(map(str, x)))

grouped_df = df.groupby("cluster_keywords").apply(lambda x: x)
grouped_df

  grouped_df = df.groupby('cluster_keywords').apply(lambda x: x)


Unnamed: 0_level_0,Unnamed: 1_level_0,article_title,cluster_keywords
cluster_keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"alcohol, drinking, alcoholic, drink, binge",0,Know Your Alcohol Limit: Don’t Be a Party Pooper!,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",1,Drinking Myths Busted!,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",2,Responsible Drinking: Know Your Alcohol Limit,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",3,Staying Sober and Within the Alcohol Limit,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",4,Ditch Both that Cigarette and Drink!,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",5,Alcohol — More than Meets the Eye,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",6,Life is Better When You're Sober,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",7,Drinking (or not) to a Healthy Chinese New Year,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",8,Why is Binge Drinking Bad for You?,"alcohol, drinking, alcoholic, drink, binge"
"alcohol, drinking, alcoholic, drink, binge",9,Alcohol and Health—Set Your Drinking Limits,"alcohol, drinking, alcoholic, drink, binge"


In [29]:
filtered_df = grouped_df[
    grouped_df["cluster_keywords"] == "alcohol, drinking, alcoholic, drink, binge"
]
for index, row in filtered_df.iterrows():
    print(row["article_title"])

Know Your Alcohol Limit: Don’t Be a Party Pooper!
Drinking Myths Busted!
Responsible Drinking: Know Your Alcohol Limit
Staying Sober and Within the Alcohol Limit
Ditch Both that Cigarette and Drink!
Alcohol — More than Meets the Eye
Life is Better When You're Sober
Drinking (or not) to a Healthy Chinese New Year
Why is Binge Drinking Bad for You?
Alcohol and Health—Set Your Drinking Limits


## Evaluate single nodes

In [16]:
emb_path

'C:\\Users\\Joycelyn\\Documents\\Synapxe\\07 GenAI for healthhub\\Github backup\\cluster experiments\\weighted emb vs sim (658)\\nomic_weighted_emb (0.7 body, 0.3 title)'

In [17]:
emb_unclustered = pd.read_csv(os.path.join(emb_path, "neo_4j_unclustered_data.csv"))
sim_unclustered = pd.read_csv(os.path.join(sim_path, "neo_4j_unclustered_data.csv"))

emb_unclustered

Unnamed: 0,node_title,node_ground_truth,node_community,node_meta_desc
0,Molar Incisor Hypomineralisation (MIH),,3,Your Guide to Understanding Molar Incisor Hypo...
1,Asthma (Common Childhood Illnesses),,4,Asthma affects about one in five children in S...
2,Understanding Leong's Premolars (LP),,5,Your Guide to Understanding Leong's Premolars ...
3,Chlamydia,,8,"Known as a silent disease, early symptoms of c..."
4,"Gonorrhoea: Symptoms, Treatment and Prevention",,9,What is gonorrhoea and how is it transmitted? ...
...,...,...,...,...
180,Baby Friendly Hospital Initiative,,638,The Baby Friendly Hospital Initiative (BFHI) s...
181,"Hand, Foot, and Mouth Disease",,642,"Hand, Foot, and Mouth Disease is present all y..."
182,Breast cancer,,646,Breast cancer is the most common cancer among ...
183,Colorectal Cancer,,652,"<span data-contrast=""auto"" class=""TextRun SCXW..."


In [18]:
emb_unclustered_list = list(emb_unclustered["node_title"])
sim_unclustered_list = list(sim_unclustered["node_title"])

In [19]:
overlap = set(emb_unclustered_list).intersection(set(sim_unclustered_list))
emb_not_overlap = list(set(emb_unclustered_list) - set(sim_unclustered_list))
sim_not_overlap = list(set(sim_unclustered_list) - set(emb_unclustered_list))

len(overlap), len(emb_not_overlap), len(sim_not_overlap)

(153, 32, 31)

In [20]:
print(
    f"total single nodes (EMB): {len(emb_unclustered_list)} \
    \ntotal single nodes (SIM): {len(sim_unclustered_list)} \
    \nnumber of overlaps: {len(overlap)}"
)

total single nodes (EMB): 185     
total single nodes (SIM): 184     
number of overlaps: 153


In [21]:
for item in emb_not_overlap:
    print(item)

Why Is Sleep Important for Kids?
How to Get in Shape Before Your Beach Holiday
6 Blood Tests Every Mum-to-Be Undergoes
I’ve Been Giving My Child Dessert as a Reward. How Do I Undo It?
Tests for me and my baby
Gearing up for birth
Diabetes and High Blood Cholesterol
Daily Habits To Help Fight The Belly Bulge
Gonorrhoea: Symptoms, Treatment and Prevention
The Post-Partum Weight Loss Journey
Losing Weight When You Have Diabetes
How to Get Your Parents to Get Moving and Adopt an Active Lifestyle
Dance Your Way To Health
5 Ways to Get Your Dad Bod Back into Shape after ORD
Breast cancer
Exercise at Home: Burn Calories While Watching TV!
A Caregiver’s Guide: Planning Healthy Meals and Well-Balanced Diets
Older Adults Need More Protein
Q&A: Is My Baby's Spit-Up Normal?
Asthma (Common Childhood Illnesses)
10 Nutrition and Healthy Eating Myths
Say No to Drugs, Here and Abroad
Say No To Drugs
Why Protein Is More Important Than You May Think, And 5 Common Protein Myths
How to Care for Your Teeth 

## Check category tags

In [35]:
from collections import Counter

In [36]:
pred_cluster = pd.read_csv(os.path.join(emb_path, "predicted_cluster.csv"))
filtered_data = pd.read_parquet(
    "../data/03_primary/filtered_data_with_keywords.parquet"
)

# Remove cluster size = 1
cluster_counts = pred_cluster["cluster"].value_counts()
clusters_to_keep = cluster_counts[cluster_counts > 1].index
pred_cluster_keep = pred_cluster[pred_cluster["cluster"].isin(clusters_to_keep)]

article_cat_name = filtered_data[["id", "article_category_names"]]
article_cat_name["article_category_names"] = article_cat_name[
    "article_category_names"
].apply(lambda x: x.strip(",") if str(x) != "None" else x)
article_cat_name["article_category_names"] = article_cat_name[
    "article_category_names"
].apply(lambda x: x.split(",") if str(x) != "None" else [None])

pred_cluster_article_cat = pd.merge(
    pred_cluster_keep, article_cat_name, how="left", on=["id"]
)
groupby_article_cat = (
    pred_cluster_article_cat.groupby("cluster")
    .agg({"article_category_names": lambda x: sum(x, []), "id": "count"})
    .reset_index()
)

groupby_article_cat.columns = ["cluster", "article_category_names", "cluster_size"]


def process_category_list(category_list):
    count_dict = dict(Counter(category_list))
    none_count = count_dict.pop(None, 0)
    sorted_dict = dict(
        sorted(count_dict.items(), key=lambda item: item[1], reverse=True)
    )
    return sorted_dict, none_count


groupby_article_cat[["category_value_counts", "none_count"]] = groupby_article_cat[
    "article_category_names"
].apply(lambda x: pd.Series(process_category_list(x)))
groupby_article_cat["article_category_names unique"] = groupby_article_cat[
    "category_value_counts"
].apply(lambda x: sorted(x.keys()))
groupby_article_cat["No. of article_category_names unique"] = groupby_article_cat[
    "article_category_names unique"
].apply(len)
groupby_article_cat = groupby_article_cat.drop("article_category_names", axis=1)

groupby_article_cat.sort_values("cluster_size", ascending=False).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_cat_name['article_category_names'] = article_cat_name['article_category_names'].apply(lambda x: x.strip(',') if str(x)!='None' else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  article_cat_name['article_category_names'] = article_cat_name['article_category_names'].apply(lambda x: x.split(',') if str(x)!='None' else [None])


Unnamed: 0,cluster,cluster_size,category_value_counts,none_count,article_category_names unique,No. of article_category_names unique
64,620,130,"{'Exercise and Fitness': 102, 'Food and Nutrit...",0,"[Body Care, Conditions and Illnesses, Exercise...",6
8,123,106,"{'Pregnancy and Infant Health': 24, 'Food and ...",26,"[Body Care, Child & Teen Health, Child and Tee...",8
53,532,82,"{'Body Care': 46, 'Mind and Balance': 10, 'Int...",2,"[Body Care, Conditions and Illnesses, Food & N...",7
45,431,68,"{'Food and Nutrition': 50, 'Child and Teen Hea...",4,"[Body Care, Child and Teen Health, Exercise an...",5
60,595,62,"{'Food and Nutrition': 52, 'Conditions and Ill...",0,"[Child and Teen Health, Conditions and Illness...",4


In [37]:
save_path = os.path.join(emb_path, "groupby_article_cat.csv")
groupby_article_cat.to_csv(save_path)

## End