In [378]:
import pickle
import pandas as pd
import nltk
nltk.download('punkt')  # Ensure NLTK's punkt package is downloaded for sentence tokenization
import json
import re


[nltk_data] Downloading package punkt to
[nltk_data]     /home/pratimathapa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [379]:
from bertopic import BERTopic
import csv
import matplotlib.pyplot as plt
from bertopic.representation import KeyBERTInspired


In [380]:
# Location of the pickled table of scanned English-language documents
file_path = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/notebooks/scanned_eng.pkl'

# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# that come from a trusted source.
with open(file_path, mode='rb') as pickle_handle:
    data = pickle.load(pickle_handle)

# Show the loaded object as the cell output
data


Unnamed: 0,Country,Language,Text
0,Eswatini,English,piae jn\nPi SAT OR face\nSUPPLEMENT TO\nTHE\nS...
1,Jamaica,English,16.\n\n17.\n\nDISASTER RISK MANAGEMENT\n\nTHE ...
2,Turks and Caicos Islands,English,Page | of 70\n\n \n\n*\n\nTURKS AND CAICOS IS...
3,The Gambia,English,"\nNATIONAL DISASTER MANAGEMENT ACT, 2008\n\n..."
4,Cook Islands,English,\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\nPWNS\n...
5,Tanzania,English,"THE DISASTER MANAGEMENT ACT, 2015\n\nARRANGEME..."
6,Federated States of Micronesia,English,PRESIDENTIAL COMm. NUs_@=L22eey\nFSM CONGRESS\...


In [375]:
# Wrap the loaded data in a DataFrame
df = pd.DataFrame(data)

# Coerce 'Text' to str, then swap every newline and tab character for a single space
cleaned_text = df['Text'].astype(str)
for whitespace_char in ('\n', '\t'):
    cleaned_text = cleaned_text.str.replace(whitespace_char, ' ')
df['Text'] = cleaned_text

# Preview the first rows of the updated DataFrame
print(df.head())


                    Country Language  \
0                  Eswatini  English   
1                   Jamaica  English   
2  Turks and Caicos Islands  English   
3                The Gambia  English   
4              Cook Islands  English   

                                                Text  
0  piae jn Pi SAT OR face SUPPLEMENT TO THE SWAZI...  
1  16.  17.  DISASTER RISK MANAGEMENT  THE DISAST...  
2  Page | of 70      *  TURKS AND CAICOS ISLANDS ...  
3    NATIONAL DISASTER MANAGEMENT ACT, 2008  ARR...  
4                          PWNS  15. 16.  118  Th...  


In [376]:
# Pull out the row labelled 6 as a one-column frame, then flip it so the
# result is a one-row DataFrame with the original column names.
n_df = df.loc[6].to_frame()
i_data = n_df.T
i_data


Unnamed: 0,Country,Language,Text
6,Federated States of Micronesia,English,PRESIDENTIAL COMm. NUs_@=L22eey FSM CONGRESS ...


In [377]:
# Split every document's text into sentences exactly once. The per-document
# sentence counts are reused below, so the text is not re-tokenized for each
# metadata column.
sentences_per_doc = [nltk.sent_tokenize(text) for text in i_data['Text']]
sentence_counts = [len(doc_sentences) for doc_sentences in sentences_per_doc]
sentences = [sentence for doc_sentences in sentences_per_doc for sentence in doc_sentences]

# One row per sentence; each document's language and country are repeated
# once for every sentence that document produced.
sentences_df = pd.DataFrame({
    'Sentence': sentences,
    'Language': i_data['Language'].repeat(sentence_counts),
    'Country': i_data['Country'].repeat(sentence_counts)
})

# Save the DataFrame to a CSV file (the index is not written)
csv_output_path = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/csv_files/Federated States of Micronesia.csv'  # Replace with your desired CSV output path
sentences_df.to_csv(csv_output_path, index=False)

print(f"Sentences extracted from the paragraphs and saved to '{csv_output_path}'.")


Sentences extracted from the paragraphs and saved to '/home/pratimathapa/code/PratimaThapa/DisLex_Project/csv_files/Federated States of Micronesia.csv'.


In [353]:
# Cleaning the Sentence column in the dataframe

# A "sentence" made up only of digits, optionally followed by a full stop
number_only_pattern = r'^\d+(\.)?$'
# Leading run of digits, whitespace, em dashes, parentheses or hyphens
leading_noise_pattern = r'^[\d\s—()-]+'

is_number_only = sentences_df['Sentence'].str.match(number_only_pattern)

# Build the cleaned frame in one chain. `.assign` returns a new DataFrame, so
# the column is never written onto a filtered slice of the original (which
# would trigger pandas' SettingWithCopyWarning).
sentences_df = (
    sentences_df.loc[~is_number_only]
    .assign(Sentence=lambda frame: frame['Sentence'].str.replace(leading_noise_pattern, '', regex=True))
    .reset_index(drop=True)
)

# Display the updated DataFrame
sentences_df


Unnamed: 0,Sentence,Language,Country
0,"THE DISASTER MANAGEMENT ACT, 2005 __________...",English,India
1,"Short title, extent and commencement.",English,India
2,Definitions.,English,India
3,CHAPTER II THE NATIONAL DISASTER MANAGEMEN...,English,India
4,Establishment of National Disaster Management ...,English,India
...,...,...,...
326,"In particular, and without prejudice to the ge...",English,India
327,Every rule made by the State Government under ...,English,India
328,Power to remove difficulties.,English,India
329,If any difficulty arises in gi ving effect to ...,English,India


In [354]:
# Keyword lexicon: category name -> keywords and phrases belonging to that category

keyword_lexicon = {
    'disaster_origin': [
        'natural', 'earthquake', 'floods', 'weather', 'drought', 'landslides', 'cyclones', 'tsunami', 'climate', 'volcanoes',
        'technological disaster', 'pollution', 'pandemic', 'epidemic', 'disease', 'conflict', 'war', 'civil conflict',
        'multi hazards', 'cross sectoral',
    ],
    'actors': [
        'national', 'national authorities', 'president', 'prime minister', 'centralized', 'federal', 'regional government',
        'local government', 'provincial government', 'decentralized', 'chief ministers', 'cabinet ministers', 'district',
        'director general', 'parliament', 'parliamentary', 'members of parliament', 'executive', 'legal actors',
        'supreme court', 'high court', 'tribunal', 'administrative', 'administration', 'bureaucratic capacity',
        'private actors', 'companies', 'multinational', 'United Nations', 'European Union', 'international', 'African Union',
        'International development agencies', 'financial institutions', 'military', 'army', 'soldiers', 'military asset',
    ],
    'disaster_cycle': [
        'disaster risk reduction', 'disaster risk', 'DRR', 'mitigation', 'disaster preparedness', 'preparedness', 'planning',
        'response', 'disaster response', 'emergency', 'relief', 'early warning', 'recovery', 'reconstruction',
        'rehabilitation', 'capacity development',
    ],
    'governance_modalities': [
        'state of emergency declaration', 'state of exception', 'state of alarm', 'courts', 'judges', 'legal review',
        'participatory', 'participation', 'decision', 'decision making', 'responsibility', 'implementation', 'implementing',
        'accountability', 'liability', 'reporting', 'coordination', 'cooperation', 'network', 'collaboration', 'partnership',
        'multiorganizational', 'disaster management agency', 'disaster management platform', 'personnel', 'migrants',
        'gender', 'minority', 'women', 'young people', 'elderly', 'old people', 'disabled people', 'vulnerable communities',
        'vulnerable people', 'children', 'financially unstable', 'no housing', 'lgbtqi+', 'fund', 'funding', 'financing',
        'resource management',
    ],
    'overarching_principles': [
        'humanitarian principles', 'neutrality', 'independence', 'impartiality', 'humanity', 'Sustainable development goals',
        'Sendai Framework', 'international humanitarian law', 'IHL', 'Geneva Convention',
        'International Community of the Red Cross', 'international guidelines', 'international frameworks', 'human rights',
        'right-based approach', 'democratic resilience', 'democratic continuity', 'democratic decision making',
        'quality standards', 'anti corruption', 'corruption', 'financial transparency', 'sovereignty', 'sovereign',
        'access to information', 'compensation', 'compensatory measures', 'legal protection', 'civil protection',
    ],
}

# Parallel lists of category names and keyword lists, in lexicon order
name = list(keyword_lexicon)
words = list(keyword_lexicon.values())

# One {"name": ..., "words": ...} record per category
result_list = [
    {"name": category, "words": terms} for category, terms in keyword_lexicon.items()
]

# Print each record on its own line
for item in result_list:
    print(item)

# Saving the dictionary as a json file

# Define the file path where you want to save the JSON file
#file_path = '/home/pratimathapa/code/PratimaThapa/DisLex_Project/notebooks/keywords.json'

# Save the list of dictionaries as a JSON file
#with open(file_path, 'w') as json_file:
    #json.dump(result_list, json_file, indent=4)

#print(f"JSON file saved to '{file_path}'")


{'name': 'disaster_origin', 'words': ['natural', 'earthquake', 'floods', 'weather', 'drought', 'landslides', 'cyclones', 'tsunami', 'climate', 'volcanoes', 'technological disaster', 'pollution', 'pandemic', 'epidemic', 'disease', 'conflict', 'war', 'civil conflict', 'multi hazards', 'cross sectoral']}
{'name': 'actors', 'words': ['national', 'national authorities', 'president', 'prime minister', 'centralized', 'federal', 'regional government', 'local government', 'provincial government', 'decentralized', 'chief ministers', 'cabinet ministers', 'district', 'director general', 'parliament', 'parliamentary', 'members of parliament', 'executive', 'legal actors', 'supreme court', 'high court', 'tribunal', 'administrative', 'administration', 'bureaucratic capacity', 'private actors', 'companies', 'multinational', 'United Nations', 'European Union', 'international', 'African Union', 'International development agencies', 'financial institutions', 'military', 'army', 'soldiers', 'military asset

# Using BERTopic for topic modelling

In [355]:
# Build one seed-word list per topic from the keyword dictionary
def get_seed_lists(dictionary, ngram_size):
    """Return, for every topic, the keywords with at most ``ngram_size`` words.

    Args:
        dictionary: iterable of topic entries; each entry is indexed with
            ``"words"`` to obtain its list of keyword strings.
        ngram_size: maximum number of whitespace-separated tokens a keyword
            may contain to be kept.

    Returns:
        A list holding one list of retained keywords per topic, in input order.
    """
    def fits(keyword):
        # Token count is taken from a plain whitespace split
        return len(keyword.split()) <= ngram_size

    return [list(filter(fits, topic["words"])) for topic in dictionary]


In [356]:
# Single-word (unigram) seeds only, printed for inspection
seeds = get_seed_lists(dictionary=result_list, ngram_size=1)
print(seeds)


[['natural', 'earthquake', 'floods', 'weather', 'drought', 'landslides', 'cyclones', 'tsunami', 'climate', 'volcanoes', 'pollution', 'pandemic', 'epidemic', 'disease', 'conflict', 'war'], ['national', 'president', 'centralized', 'federal', 'decentralized', 'district', 'parliament', 'parliamentary', 'executive', 'tribunal', 'administrative', 'administration', 'companies', 'multinational', 'international', 'military', 'army', 'soldiers'], ['DRR', 'mitigation', 'preparedness', 'planning', 'response', 'emergency', 'relief', 'recovery', 'reconstruction', 'rehabilitation'], ['courts', 'judges', 'participatory', 'participation', 'decision', 'responsibility', 'implementation', 'implementing', 'accountability', 'liability', 'reporting', 'coordination', 'cooperation', 'network', 'collaboration', 'partnership', 'multiorganizational', 'personnel', 'migrants', 'gender', 'minority', 'women', 'elderly', 'children', 'lgbtqi+', 'fund', 'funding', 'financing'], ['neutrality', 'independence', 'impartiali

In [357]:
# Prepare embeddings
from sentence_transformers import SentenceTransformer
docs = sentences_df.Sentence
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Use KeyBERTInspired representation model.
# This gives better names to the topics.
representation_model = KeyBERTInspired()

# Encode a plain list rather than the pandas Series: integer indexing on a
# Series is label-based, so passing the Series only works while its index is
# exactly 0..n-1. `docs` itself stays a Series for the cells that follow.
embeddings = sentence_model.encode(docs.tolist(), show_progress_bar=True)


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [358]:
# Sanity check: the number of documents and the number of embedding rows
# (first dimension of each shape) should be the same.
display(docs.shape)
embeddings.shape


(331,)

(331, 384)

In [363]:
# Guided topic modelling with BERTopic on the precomputed sentence embeddings.
# embedding_model names the same model that produced `embeddings` (all-MiniLM-L6-v2).
# Current settings: min_topic_size = 50 and n-grams from 1 to 3.
# We need to explore these parameters. Other parameters:
# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
# guided topic modeling: https://maartengr.github.io/BERTopic/getting_started/guided/guided.html
# Shape of a seed topic list, for reference:
# seed_topic_list = [["corruption"],
#                   ["elections", "election", "assembly"],
#                  ["freedom", "liberty"]]
# NOTE(review): no random seed is set in this cell, so the resulting topics
# may differ between runs — confirm whether reproducibility is required.

# Seeds of up to three words, matching the upper bound of n_gram_range
seed_topic_list = get_seed_lists(result_list, 3)
topic_model = BERTopic(representation_model=representation_model,
                       seed_topic_list=seed_topic_list,
                       verbose=True,
                       embedding_model='all-MiniLM-L6-v2',
                       min_topic_size = 50,
                       n_gram_range=(1, 3)
                      ).fit(docs.tolist(), embeddings)  # pass a list of str, not a Series


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-12-04 09:10:04,470 - BERTopic - Reduced dimensionality
2023-12-04 09:10:04,497 - BERTopic - Clustered reduced embeddings


In [364]:
# Plot the documents using the precomputed embeddings. The documents are
# passed as a plain list of strings so the call does not depend on the
# Series index.
topic_model.visualize_documents(docs.tolist(), embeddings=embeddings)


In [365]:
# Visualize topic words/n-grams as bar charts (top_n_topics is set to 10),
# then render the returned figure.
fig = topic_model.visualize_barchart(top_n_topics=10)
fig.show()
