In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Clustering process for policies

## 1. Import the relevant packages

In [None]:
import pandas as pd
import random

from process import extract_items
import clusterize

## 2. Prepare data for clustering

In [None]:
domain = 'E'
input_path = f"../data/results_{domain}.csv"

In [None]:
# Import data
df = pd.read_csv(input_path)
df

Unnamed: 0,GEOGRAPHIC,ITEM,FACTOR,SECTOR,abstract,doi,openalex_id
0,Mashhad University of Medical Sciences,mass media,"{""awareness about the benefits of physical act...","[""E""]",Background and Aim: Health-promoting lifestyle...,https://doi.org/10.22038/jmrh.2014.2918,W2099730092
1,United States,exergames,"{""physical activity (PA) in the general popula...","[""E"", ""J""]",The number of informal caregivers for family m...,https://doi.org/10.48550/arxiv.1908.09984,W2970340995
2,Puget Sound,telecommunications,"{""trip generation rates"": {""CORRELATION"": ""dec...","[""E""]",Most trip generation models are insensitive to...,https://doi.org/10.3141/1682-09,W2129598736
3,US,activewear purchased to be worn as casualwear,"{""intent to purchase activewear"": {""CORRELATIO...","[""E""]","In recent years, activewear is no longer conta...",https://doi.org/10.1080/17543266.2018.1477995,W2805060567
4,Indonesia,"advanced, automated internet technology","{""robust connectivity"": {""CORRELATION"": ""incre...","[""E""]",Abstract The Indonesian government has a goal ...,https://doi.org/10.2118/209896-ms,W4290471442
...,...,...,...,...,...,...,...
6935,"Samarinda, East Kalimantan province, Indonesia",IoT-based air pollution monitoring system usin...,"{""air quality information accessibility"": {""CO...","[""E""]",The degradation of air quality in numerous Ind...,https://doi.org/10.51967/tanesa.v24i2.2946,W4399888694
6936,Democratic Republic of Congo (DRC),privately funded GSM network,"{""mobile telephone coverage"": {""CORRELATION"": ...","[""E""]",The Democratic Republic of Congo (DRC) faces p...,,W3121765686
6937,Thailand,watching television and playing computer,"{""obesity"": {""CORRELATION"": ""increasing""}}","[""E""]","Objective: To determine the dietary pattern, ...",,W2161121663
6938,Pakistan,social media,"{""biogas technology adoption"": {""CORRELATION"":...","[""E""]",Environmental degradation and rapid climate ch...,https://doi.org/10.3390/ijerph17072311,W3013389524


In [None]:
# Apply the extraction
extracted_df = extract_items(df)
extracted_df

"rainfall estimation"
"rainfall estimation"


Unnamed: 0,index,ITEM,FACTOR,doi
0,0,mass media,awareness about the benefits of physical activity,https://doi.org/10.22038/jmrh.2014.2918
1,1,exergames,physical activity (PA) in the general population,https://doi.org/10.48550/arxiv.1908.09984
2,1,exergames,PA barriers that AD caregivers face,https://doi.org/10.48550/arxiv.1908.09984
3,2,telecommunications,trip generation rates,https://doi.org/10.3141/1682-09
4,3,activewear purchased to be worn as casualwear,intent to purchase activewear,https://doi.org/10.1080/17543266.2018.1477995
...,...,...,...,...
10376,6935,IoT-based air pollution monitoring system usin...,air quality information accessibility,https://doi.org/10.51967/tanesa.v24i2.2946
10377,6936,privately funded GSM network,mobile telephone coverage,
10378,6937,watching television and playing computer,obesity,
10379,6938,social media,biogas technology adoption,https://doi.org/10.3390/ijerph17072311


## 3. Cluster with HDBSCAN

### a. ITEMS

In [None]:
preprocessed_items = extracted_df[['index', 'ITEM']].drop_duplicates(subset='index')['ITEM']
preprocessed_items

0                                               mass media
1                                                exergames
3                                       telecommunications
4            activewear purchased to be worn as casualwear
6                  advanced, automated internet technology
                               ...                        
10376    IoT-based air pollution monitoring system usin...
10377                         privately funded GSM network
10378             watching television and playing computer
10379                                         social media
10380                                         social media
Name: ITEM, Length: 6890, dtype: object

In [None]:
corpus_list_items, corpus_embeddings_items = clusterize.get_embeddings(preprocessed_items)

In [None]:
params = {
    'umap_n_neighbors': 30,
    'umap_min_dist': 0.1,
    'umap_n_components': 30,
    'hdbscan_min_cluster_size': 70,
    'hdbscan_min_samples': 10,
    'hdbscan_cluster_selection_epsilon': 0.0,
    'hdbscan_max_cluster_size': None
}

hdbscan_model, cluster_assignment, clustered_sentences, noise, reduced_embeddings_items = clusterize.clusterize(
                            corpus_list_items, corpus_embeddings_items,
                            params,
                            print_clusters=False)

  warn(


Number of clusters found: 33


In [None]:
results = {
    'items_total': len(cluster_assignment),
    'items_noise': len(noise),
    'noise_pct': round(len(noise)/len(cluster_assignment) * 100, 2)
}

print(f'total number of items: {results['items_total']}, \nitems classified as noise: {results['items_noise']}, \npercentage of noise: {results['noise_pct']}')

total number of items: 6890, 
items classified as noise: 1542, 
percentage of noise: 22.38


In [None]:
cluster_summary_df = clusterize.get_cluster_summary(clustered_sentences)
cluster_summary_df

Unnamed: 0,Cluster Number,Number of Sentences,Sample Sentences
0,1,58,"limit Facebook, Instagram and Snapchat use to ..."
1,2,23,number of Internet users; internet dependency;...
2,3,40,advanced information and communication technol...
3,4,56,Internet of Things cloud server; Internet of T...
4,5,23,Digitalization; digitalisation of data; digita...
5,6,27,social digital media; Chinese lifestyle migrat...
6,7,53,Big Data; digital town hall; open data sources...
7,8,91,state instrumental use of digital sovereignty;...
8,9,68,development of solar powered solutions for the...
9,10,47,energy-efficient architecture for cloud and fo...


In [None]:
noise_df = clusterize.get_noise_sample(noise)
noise_df

Unnamed: 0,samples
0,touch simulation
1,use of computers
2,automated computer shutdown program
3,app for recording data
4,digital language inclusivity
...,...
95,poetry
96,high usage times (>five hours per day)
97,travel service information
98,hedonic information technologies


In [None]:
clusterize.save_results(params, results, noise_df, cluster_summary_df, hdbscan_model, domain, 'items')

In [None]:
## Re-clsutering of subdivision of clusters
### List of clusters to subdivide
clusters_to_subdivide_with_params = {
    7: {'min_cluster_size': 15, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    9: {'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    16: {'min_cluster_size': 15, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    23: {'min_cluster_size': 15, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    24: {'min_cluster_size': 15, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    27: {'min_cluster_size': 17, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    30: {'min_cluster_size': 17, 'min_samples': 5, 'cluster_selection_epsilon': 0.1}
}

# Subdivide selected clusters and create a new dataframe
new_cluster_df, new_noise, cluster_assignment_to_change_indices, new_cluster_assignment = clusterize.subdivide_clusters(
    clustered_sentences, cluster_assignment, reduced_embeddings_items, preprocessed_items, clusters_to_subdivide_with_params
)

# Display the new dataframe
new_cluster_df

no noise generated by cluster 7
no noise generated by cluster 9
reclustering of cluster 16 adding 1 items to noise
reclustering of cluster 23 adding 11 items to noise
no noise generated by cluster 24
reclustering of cluster 27 adding 34 items to noise
reclustering of cluster 30 adding 1 items to noise


Unnamed: 0,cluster_id,sentences
0,1,"[social media usage, social media use, social ..."
1,2,"[internet usage, Internet use, internet use, i..."
2,3,[information and communication technologies (I...
3,4,"[Internet of Things utilization, technological..."
4,5,"[digitalisation of societies, technology and d..."
5,6,"[social media, social media, social media, web..."
6,8,"[digital technologies, digital technologies, d..."
7,10,"[data centres (DCs), data centers, data center..."
8,11,"[ICT utilization program, information technolo..."
9,12,"[social media collaboration, face-to-face coll..."


In [None]:
print(f'{new_cluster_df.shape[0] - cluster_summary_df.shape[0] + len(clusters_to_subdivide_with_params.keys())} new clusters were created out of {len(clusters_to_subdivide_with_params.keys())}')
print(f'{len(new_noise)} items from the previous clusters were classified as noise')

14 new clusters were created out of 7
47 items from the previous clusters were classified as noise


In [None]:
new_noise[:10]

['online pizza ordering system',
 'cloud radio access network (C-RAN) architecture',
 'optimization problem to allocate radio resources',
 'flexible service level agreements in Green IT',
 'Cloud Radio Access Network (C-RAN) architecture',
 'green communication technologies',
 'radio access network (RAN) energy consumption',
 'Green procurement',
 'Reverse Logistics in Green Procurement',
 'Cloud Radio Access Network (C-RAN) architecture']

In [None]:
data = {'noise_samples': random.sample(new_noise, min(100, len(new_noise)))}

# Creating DataFrame
new_noise_df = pd.DataFrame(data)

new_noise_df.to_csv(f'../outputs/noise_sample_recluster_{domain}_items.csv', index=False)

new_noise_df

Unnamed: 0,noise_samples
0,experimentation with novel strategies
1,persuasive messages
2,persuasive messages
3,persuasive strategies
4,digital leadership
5,Transformational Leadership
6,Reverse Logistics in Green Procurement
7,human-centered digital workplaces
8,Cloud Radio Access Network (C-RAN) architecture
9,positive leadership behaviour


In [None]:
sentences_df = pd.DataFrame(preprocessed_items.copy())

if domain == 'E': # TODO maintenant que doi dans extracted df, interagir directement avec cette colonne ?
    doi_col = df[(df['FACTOR'] != "\"None\"") & (df['FACTOR'] != "\"rainfall estimation\"")]['doi']

cluster_assignment_with_correct_index = [i + 1 if i != -1 else i for i in cluster_assignment]

final_df = clusterize.create_final_df(sentences_df, doi_col, cluster_assignment_with_correct_index, new_cluster_assignment, cluster_assignment_to_change_indices)
final_df

Unnamed: 0,ITEM,cluster,cluster_2,doi
0,mass media,22,22,https://doi.org/10.22038/jmrh.2014.2918
1,exergames,18,18,https://doi.org/10.48550/arxiv.1908.09984
3,telecommunications,24,24_0,https://doi.org/10.1080/17543266.2018.1477995
4,activewear purchased to be worn as casualwear,27,27_0,https://doi.org/10.2118/209896-ms
6,"advanced, automated internet technology",28,28,https://doi.org/10.2478/aup-2023-0012
...,...,...,...,...
10376,IoT-based air pollution monitoring system usin...,-1,-1,
10377,privately funded GSM network,24,24_0,
10378,watching television and playing computer,25,25,
10379,social media,6,6,


In [None]:
final_df.to_csv(f'../outputs/clusters_recluster_hdbscan_{domain}_items.csv', index=False)

In [None]:
recluster_params = pd.DataFrame.from_dict(clusters_to_subdivide_with_params, orient='index')
recluster_params.index.name = 'cluster_id'

recluster_params.to_csv(f'../outputs/params_recluster_{domain}_items.csv')

recluster_params

Unnamed: 0_level_0,min_cluster_size,min_samples,cluster_selection_epsilon
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,15,5,0.1
9,10,5,0.1
16,15,5,0.1
23,15,5,0.1
24,15,5,0.1
27,17,5,0.1
30,17,5,0.1


### c. FACTORS

In [None]:
preprocessed_factors = extracted_df['FACTOR']
preprocessed_factors

0        awareness about the benefits of physical activity
1         physical activity (PA) in the general population
2                      PA barriers that AD caregivers face
3                                    trip generation rates
4                            intent to purchase activewear
                               ...                        
10376                air quality information accessibility
10377                            mobile telephone coverage
10378                                              obesity
10379                           biogas technology adoption
10380                                       social support
Name: FACTOR, Length: 10381, dtype: object

In [None]:
corpus_list_factors, corpus_embeddings_factors = clusterize.get_embeddings(preprocessed_factors)

In [None]:
params = {
    'umap_n_neighbors': 10,
    'umap_min_dist': 0.1,
    'umap_n_components': 60,
    'hdbscan_min_cluster_size': 40,
    'hdbscan_min_samples': 5,
    'hdbscan_cluster_selection_epsilon': 0.2,
    'hdbscan_max_cluster_size': None #300
}

hdbscan_model, cluster_assignment, clustered_sentences, noise, reduced_embeddings_factors = clusterize.clusterize(
                            corpus_list_factors, corpus_embeddings_factors,
                            params,
                            print_clusters=False)

  warn(


Number of clusters found: 21


In [None]:
results = {
    'factors_total': len(cluster_assignment),
    'factors_noise': len(noise),
    'noise_pct': round(len(noise)/len(cluster_assignment) * 100, 2)
}

print(f'total number of factors: {results['factors_total']}, \nfactors classified as noise: {results['factors_noise']}, \npercentage of noise: {results['noise_pct']}')

total number of factors: 10381, 
factors classified as noise: 1000, 
percentage of noise: 9.63


In [None]:
cluster_summary_df = clusterize.get_cluster_summary(clustered_sentences)
cluster_summary_df

Unnamed: 0,Cluster Number,Number of Sentences,Sample Sentences
0,1,7,negative effects on well-being; personalised w...
1,2,3,freedom of expression; free expression; right ...
2,3,6,customer performance evaluation; internal cust...
3,4,6,sedentary life; sedentary lifestyle; carbon em...
4,5,12,Carbon dioxide emissions; per capita carbon em...
5,6,3,sleep quality; sleep outcomes; subjective well...
6,7,2,well-being at work; well-being
7,8,11,carbon footprint; carbon footprint of EDR; ene...
8,9,14,embodied greenhouse gas emissions; GHG emissio...
9,10,19,academic stress; occupational stress; symptoms...


In [None]:
noise_df = clusterize.get_noise_sample(noise)
noise_df

Unnamed: 0,samples
0,CSR message engagement
1,moderation effect of product types
2,lifestyle quality
3,HRM
4,living standards
...,...
95,urban-rural income gap
96,urban–rural income gap
97,lifestyle quality
98,awareness


In [None]:
clusterize.save_results(params, results, noise_df, cluster_summary_df, hdbscan_model, domain, 'factors')

In [None]:
clusters_to_subdivide_with_params = {
    4: {'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.1},
    #15: {'min_cluster_size': 8, 'min_samples': 5, 'cluster_selection_epsilon': 0.0},
    20: {'min_cluster_size': 10, 'min_samples': 5, 'cluster_selection_epsilon': 0.1}
}

# Subdivide selected clusters and create a new dataframe
new_cluster_df, new_noise, cluster_assignment_to_change_indices, new_cluster_assignment = clusterize.subdivide_clusters(
    clustered_sentences, cluster_assignment, reduced_embeddings_factors, preprocessed_factors, clusters_to_subdivide_with_params
)

# Display the new dataframe
new_cluster_df

no noise generated by cluster 4
reclustering of cluster 20 adding 33 items to noise


Unnamed: 0,cluster_id,sentences
0,1,"[subjective well-being, subjective well-being,..."
1,2,"[freedom of expression, freedom of expression,..."
2,3,"[customer satisfaction, customer satisfaction,..."
3,5,"[CO2 emissions, CO2 emissions, carbon dioxide ..."
4,6,"[sleep quality, sleep quality, sleep quality, ..."
5,7,"[well-being, well-being, well-being, well-bein..."
6,8,"[environmental footprint, carbon footprint, ca..."
7,9,"[embodied greenhouse gas emissions, embodied g..."
8,10,"[stress management, stress resistance, stress ..."
9,11,"[Impulse Buying, Impulse Buying, Impulse Buyin..."


In [None]:
print(f'{new_cluster_df.shape[0] - cluster_summary_df.shape[0] + len(clusters_to_subdivide_with_params.keys())} new clusters were created out of {len(clusters_to_subdivide_with_params.keys())}')
print(f'{len(new_noise)} items from the previous clusters were classified as noise')

36 new clusters were created out of 2
33 items from the previous clusters were classified as noise


In [None]:
sentences_df = pd.DataFrame(preprocessed_factors.copy())

doi_col = extracted_df['doi']

cluster_assignment_with_correct_index = [i + 1 if i != -1 else i for i in cluster_assignment]

final_df = clusterize.create_final_df(sentences_df, doi_col, cluster_assignment_with_correct_index, new_cluster_assignment, cluster_assignment_to_change_indices)
final_df

Unnamed: 0,FACTOR,cluster,cluster_2,doi
0,awareness about the benefits of physical activity,20,20_16,https://doi.org/10.22038/jmrh.2014.2918
1,physical activity (PA) in the general population,20,20_16,https://doi.org/10.48550/arxiv.1908.09984
2,PA barriers that AD caregivers face,20,20_32,https://doi.org/10.48550/arxiv.1908.09984
3,trip generation rates,20,20_32,https://doi.org/10.3141/1682-09
4,intent to purchase activewear,20,20_32,https://doi.org/10.1080/17543266.2018.1477995
...,...,...,...,...
10376,air quality information accessibility,20,20_32,https://doi.org/10.51967/tanesa.v24i2.2946
10377,mobile telephone coverage,20,20_32,
10378,obesity,20,20_32,
10379,biogas technology adoption,-1,-1,https://doi.org/10.3390/ijerph17072311


In [None]:
data = {'noise_samples': random.sample(new_noise, min(100, len(new_noise)))}

# Creating DataFrame
new_noise_df = pd.DataFrame(data)

new_noise_df.to_csv(f'../outputs/noise_sample_recluster_{domain}_factors.csv', index=False)

new_noise_df

Unnamed: 0,noise_samples
0,use of strategies to obtain these indicators
1,positive leadership behaviour
2,use of diversity technique
3,persuasive strategies
4,use of strategies to obtain these indicators
5,green communication technologies
6,competence in English
7,persuasive messages
8,Transformational Leadership
9,alcohol marketing


In [None]:
recluster_params = pd.DataFrame.from_dict(clusters_to_subdivide_with_params, orient='index')
recluster_params.index.name = 'cluster_id'

recluster_params.to_csv(f'../outputs/params_recluster_{domain}_factors.csv')

recluster_params

Unnamed: 0_level_0,min_cluster_size,min_samples,cluster_selection_epsilon
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,10,5,0.1
20,10,5,0.1
