In [1]:
import pandas as pd
import os

In [2]:
# Load Kedro's IPython extension. Later cells rely on the `catalog` object and the
# `%reload_kedro` magic, which are expected to come from this extension.
%load_ext kedro.ipython

In [3]:
# Folders used throughout the notebook, relative to the notebook's working directory.
# Built with os.path.join so the separator matches the host OS: a literal backslash
# path is only a directory path on Windows.
export_folder_path = os.path.join("..", "data", "08_reporting")  # where reports are written
input_folder_path = os.path.join("..", "data", "01_raw", "annotated_contents")  # annotated Excel inputs

### Get IDs for optimisation
- Uses HH's user annotations from `Stage 1 user annotation for HPB_HHcomments_20Aug24.xlsx`
- Removes every article whose `HH's team's comments` value contains 'exclude-XXX'

In [4]:
# Read HH's Stage 1 annotations (sheet index 3 of the workbook).
user_annotation = pd.read_excel(
    input_folder_path+'/Stage 1 user annotation for HPB_HHcomments_20Aug24.xlsx',
    sheet_name=3,
)

# Keep the articles whose comment does not mention 'exclude'; blank comments count as kept.
hh_comments = user_annotation["HH's team's comments"].fillna('')
flagged_for_exclusion = hh_comments.str.contains('exclude')
for_optimisation = user_annotation[~flagged_for_exclusion]

for_optimisation_id = for_optimisation[['article_id']]
print('\nNumber of articles for optimisation: ', len(for_optimisation))

# Articles per content category, ordered by category name.
for_optimisation_value_counts = (
    for_optimisation['content category']
    .value_counts()
    .reset_index()
    .sort_values('content category')
)
print(for_optimisation_value_counts) # to update data_count.xlsx

# Export only the article ids.
export_path = os.path.join(export_folder_path,'ids_for_optimisation.csv')
for_optimisation_id.to_csv(export_path, index=False)


Number of articles for optimisation:  244
              content category  count
0           cost-and-financing      1
1      diseases-and-conditions     24
2        live-healthy-articles    218
3  medical-care-and-facilities      1


### Get updated blacklist

1. Blacklist the articles labelled 'exclude' in `Stage 1 user annotation for HPB_HHcomments_20Aug24.xlsx`
2. Blacklist the 'Nasi Liwet with Brown Rice' article, with 'Recipe' as the reason

In [None]:
# Build the updated blacklist: the entries currently in `params:blacklist`, plus the
# articles HH's team marked 'exclude-<reason>', plus one manually added recipe article.
# The result is written out as `<article_id>: "<reason>"` lines.

# Load current articles that are blacklisted
initial_blacklist = catalog.load('params:blacklist')
initial_blacklist_df = pd.DataFrame(list(initial_blacklist.items()), columns=['article_id', 'remove_type'])

# Articles flagged by HH's team. `.copy()` makes the column edits below act on an
# independent frame instead of a slice of `user_annotation` (avoids SettingWithCopy issues).
exclude_mask = user_annotation['HH\'s team\'s comments'].fillna('').str.contains('exclude')
to_exclude = user_annotation.loc[exclude_mask, ['article_id', 'HH\'s team\'s comments']].copy()
to_exclude.columns = ['article_id', 'remove_type']
print('No. of articles to add on for blacklist: ', len(to_exclude))

# Turn 'exclude-<reason>' into a title-cased reason, then restore the two labels whose
# established spelling is not plain title case. All replacements are literal, not regex.
to_exclude['remove_type'] = (
    to_exclude['remove_type']
    .str.replace('exclude-', '', regex=False)
    .str.title()
    .str.replace('No Relevant Content And Mainly Link', 'No relevant content and mainly links', regex=False)
    .str.replace('Table Of Contents', 'Table of Contents', regex=False)
)

# Concat initial blacklist with new blacklist
blacklist_update = pd.concat([initial_blacklist_df, to_exclude])

# Get article id of nasi liwet (printed so the hard-coded id below can be checked by eye)
merged_data = catalog.load('merged_data')
add_on_title_to_blacklist = ['Nasi Liwet with Brown Rice']
add_on_blacklist_lower = [title.lower() for title in add_on_title_to_blacklist]
# Missing titles are treated as empty strings so the substring test never sees a NaN.
title_matches = merged_data['title'].fillna('').str.lower().apply(
    lambda x: any(blacklist_title in x for blacklist_title in add_on_blacklist_lower)
)
add_on_blacklist_id_title = merged_data.loc[title_matches, ['id', 'title']]
print(add_on_blacklist_id_title)

# Label the blacklist reason
added_blacklist = {1445657: 'Recipe'}
added_blacklist_df = pd.DataFrame(list(added_blacklist.items()), columns=['article_id', 'remove_type'])

# Update blacklist. Rows are ordered by reason; when an article id appears more than
# once, only its first row in that order is kept.
blacklist_update_2 = (
    pd.concat([blacklist_update, added_blacklist_df])
    .sort_values('remove_type')
    .drop_duplicates('article_id')
)

# Write new blacklist into a txt file, next to the other exports.
# (The original referenced an undefined `folder_path` here.)
export_path = os.path.join(export_folder_path, 'blacklist_update.txt')
with open(export_path, 'w', encoding='utf-8') as file:
    for article_id, remove_type in zip(blacklist_update_2['article_id'], blacklist_update_2['remove_type']):
        file.write(f"{article_id}: \"{remove_type}\"\n")

print('\nNumber of articles in blacklist:', blacklist_update_2.shape[0])

No. of articles to add on for blacklist:  32


           id                       title
1505  1445657  Nasi Liwet with Brown Rice

Number of articles in blacklist: 40


### Before proceeding:

1. Paste the updated blacklist (`blacklist_update.txt`) into `parameters_data_processing.yml`
2. Run `kedro run --pipeline=data_processing` to obtain the updated `merged_data.parquet`

In [None]:
# Reload Kedro after the blacklist parameters were edited and the data_processing
# pipeline was re-run, so the cells below load the refreshed `merged_data`
# (presumably this also refreshes `catalog` -- confirm against the Kedro docs).
%reload_kedro

### Get Excluded Articles Tab and Data Count

In [None]:
def _counts_by_category(frame):
    """Count the rows of `frame` per `content_category`, ordered by category name."""
    return (
        frame['content_category']
        .value_counts()
        .to_frame()
        .sort_values('content_category')
        .reset_index()
    )


# Restrict the merged dataset to Health Promotion Board articles in the categories of interest.
merged_data = catalog.load('merged_data')
is_hpb = merged_data['pr_name'].fillna('').str.contains('Health Promotion Board')
merged_data_HPB = merged_data[is_hpb]
to_keep_content_category = [
    'cost-and-financing',
    'diseases-and-conditions',
    'live-healthy-articles',
    'medical-care-and-facilities',
    'support-group-and-others',
]
in_kept_category = merged_data_HPB['content_category'].isin(to_keep_content_category)
merged_data_HPB_cat = merged_data_HPB[in_kept_category]
print(merged_data_HPB_cat.shape)

print('\nValue Counts Before Filtering: \n') # data_count.xlsx - 'HPB raw data count in excel'
hpb_raw_value_counts = _counts_by_category(merged_data_HPB_cat)
print(hpb_raw_value_counts)

# Articles flagged for removal.
to_remove = merged_data_HPB_cat[merged_data_HPB_cat['to_remove'].eq(True)]
to_remove_value_counts = _counts_by_category(to_remove)

print('\nValue Counts of to_remove articles:',len(to_remove),'\n') # data_count.xlsx - 'to_remove articles (final)'
print(to_remove_value_counts)


## The exclude tab leaves out the duplicated-URL and duplicated-content articles
## (considered backend issues), identified by the ids below.
additional_exclusion_from_excluded_tab = [1444417, 1445629, 1445972]
exclude_tab_columns = ['id','title','full_url','content_category','page_views','article_category_names','remove_type']
not_duplicated_removed = ~to_remove['id'].isin(additional_exclusion_from_excluded_tab)
exclude_tab = to_remove.loc[not_duplicated_removed, exclude_tab_columns]
exclude_tab_value_counts = _counts_by_category(exclude_tab)

# The same raw counts as above, minus those duplicated articles.
not_duplicated_raw = ~merged_data_HPB_cat['id'].isin(additional_exclusion_from_excluded_tab)
hpb_raw_value_counts_exclude_duplicated = _counts_by_category(merged_data_HPB_cat[not_duplicated_raw])

print('\n\nValue Counts Before Filtering (Exclude Duplicated):\n', hpb_raw_value_counts_exclude_duplicated)
print('\n\nValue Counts of to_remove articles (Exclude Duplicated):')
print(exclude_tab_value_counts)

# NOTE(review): this CSV lands in the notebook's working directory, whereas the other
# exports in this notebook go to `export_folder_path` -- confirm that is intended.
exclude_tab.to_csv(f'excluded_content_{len(exclude_tab)}articles.csv',index=False)


(704, 39)

Value Counts Before Filtering: 

              content_category  count
0           cost-and-financing      1
1      diseases-and-conditions     42
2        live-healthy-articles    659
3  medical-care-and-facilities      2

Value Counts of to_remove articles: 75 

          content_category  count
0  diseases-and-conditions      2
1    live-healthy-articles     73


Value Counts Before Filtering (Exclude Duplicated):
               content_category  count
0           cost-and-financing      1
1      diseases-and-conditions     42
2        live-healthy-articles    656
3  medical-care-and-facilities      2


Value Counts of to_remove articles (Exclude Duplicated):
          content_category  count
0  diseases-and-conditions      2
1    live-healthy-articles     70


In [None]:
# To update in data_count.xlsx - 'HPB only to_remove articles Counts (final)'
# Number of to_remove articles per removal reason, ordered by reason.
remove_type_value_counts = (
    to_remove['remove_type']
    .value_counts()
    .to_frame()
    .sort_values('remove_type')
    .reset_index()
)

# Append a 'Total' row holding the sum of all counts.
total_sum = remove_type_value_counts['count'].sum()
total_row = pd.DataFrame({'remove_type': ['Total'], 'count': [total_sum]})
excluded_tab_article_counts_df = pd.concat(
    [remove_type_value_counts, total_row],
    ignore_index=True,
)
print(excluded_tab_article_counts_df)

                            remove_type  count
0                    Duplicated Content      1
1                        Duplicated URL      2
2                           Infographic     14
3  No relevant content and mainly links      6
4                                Recipe     45
5                    Services Directory      3
6                     Table of Contents      4
7                                 Total     75


### Sent for Clustering Data Count

In [None]:
# Articles that were sent for clustering, and how many fall in each content category.
filtered_data_with_keywords = catalog.load("filtered_data_with_keywords")
print('No. of articles sent for clustering:', len(filtered_data_with_keywords))

cluster_value_counts = (
    filtered_data_with_keywords['content_category']
    .value_counts()
    .reset_index()
    .sort_values('content_category')
)
print(cluster_value_counts)

No. of articles sent for clustering: 629
              content_category  count
0           cost-and-financing      1
1      diseases-and-conditions     40
2        live-healthy-articles    586
3  medical-care-and-facilities      2


### (To Combine) Data Count

In [None]:
# Annotations from the 25 Jul workbook (sheet index 1): rows whose action is
# 'Individual' and that also carry an algorithm remark.
annotation = pd.read_excel(input_folder_path+'/user_annotation_25jul.xlsx', sheet_name=1)
marked_individual = annotation['Action'].eq('Individual') & annotation['Algorithm remarks'].notna()
annotation_indiv = annotation[marked_individual]

# First-iteration clustering output: keep only rows that have group keywords.
first_cluster_result = pd.read_excel(input_folder_path+'/final_predicted_clusters (first iteration).xlsx')
has_group_keywords = first_cluster_result['group_keywords'].notna()
first_cluster_result_cluster = first_cluster_result[has_group_keywords]

# Drop the articles annotated by HH as individual articles and the blacklisted ones.
blacklisted_articles_id = blacklist_update_2['article_id'].to_list()
clustered_ids = first_cluster_result_cluster['id']
not_individual = ~clustered_ids.isin(annotation_indiv['article_id'])
not_blacklisted = ~clustered_ids.isin(blacklisted_articles_id)
first_cluster_result_cluster = first_cluster_result_cluster[not_individual & not_blacklisted]

# Attach each remaining article's content category and count per category.
first_cluster_result_cluster_w_cat = first_cluster_result_cluster.merge(
    merged_data_HPB_cat[['id','content_category']],
    how='left',
    on='id',
)
to_combine_value_counts = (
    first_cluster_result_cluster_w_cat['content_category']
    .value_counts()
    .reset_index()
    .sort_values('content_category')
)

### Consolidate Data Count

In [None]:
# Give every count table the same key column ('content_category') and a
# descriptive label for its count column.
count_table_labels = [
    (hpb_raw_value_counts, 'HPB raw data count in excel'),
    (hpb_raw_value_counts_exclude_duplicated, 'HPB raw data count in excel (Excludes Duplicated)'),
    (to_remove_value_counts, 'to_remove articles'),
    (exclude_tab_value_counts, 'to_remove articles (Exclude Duplicated)'),
    (cluster_value_counts, 'Sent for Clustering'),
    (to_combine_value_counts, 'Articles in clusters identified'),
    (for_optimisation_value_counts, 'Individual for optimisation'),
]
for counts_frame, count_label in count_table_labels:
    counts_frame.columns = ['content_category', count_label]

for_optimisation_value_counts


In [None]:
# Consolidate the per-category count tables side by side: one row per content
# category, one column per count. Outer joins keep categories that are missing
# from some tables (their counts show as NaN). A plain row-wise pd.concat would
# instead stack the tables vertically and repeat each category.
count_tables = [
    hpb_raw_value_counts,
    hpb_raw_value_counts_exclude_duplicated,
    to_remove_value_counts,
    exclude_tab_value_counts,
    cluster_value_counts,
    to_combine_value_counts,
    for_optimisation_value_counts,
]
data_count_summary = count_tables[0]
for count_table in count_tables[1:]:
    data_count_summary = data_count_summary.merge(count_table, how='outer', on='content_category')
data_count_summary