In [23]:
# SDG Topic Modeling and Trend Analysis - Jupyter Notebook Template

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import os
import re

# Load CSVs from directory
folder_path = "C:/Users/WELCOME/Desktop/17 SDGs/Sustainable_Goals"
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]


In [24]:
topic_model.visualize_topics()


In [25]:
import pandas as pd
import glob
import os

# Path to your folder containing all SDG CSV files
folder_path = r"C:\Users\WELCOME\Desktop\17 SDGs\Sustainable_Goals\*"

# Load all CSV files
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

df_list = []
for file in all_files:
    df = pd.read_csv(file)
    df['sdg'] = os.path.basename(file).replace(".csv", "")
    df_list.append(df)

# Combine all into a single DataFrame
data = pd.concat(df_list, ignore_index=True)


In [26]:
# Clean the Year column
data = data[pd.to_numeric(data['Year'], errors='coerce').notnull()]
data['Year'] = data['Year'].astype(int)


In [27]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


In [28]:
docs = data["Abstract"].dropna().tolist()


In [29]:
model = BERTopic()
topics, _ = model.fit_transform(docs)


In [30]:
model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,932,-1_in_and_of_to,"[in, and, of, to, the, nepal, for, was, from, ...",[Background: The World Health Organization (WH...
1,0,214,0_health_insurance_nepal_tobacco,"[health, insurance, nepal, tobacco, facilities...",[Nepal has one of the highest proportions of o...
2,1,118,1_poverty_multidimensional_reduction_nepal,"[poverty, multidimensional, reduction, nepal, ...",[… The FGT poverty index (index proposed by Fo...
3,2,111,2_gender_women_equality_womens,"[gender, women, equality, womens, violence, em...",[This research investigates the impact of cult...
4,3,100,3_climate_change_adaptation_farmers,"[climate, change, adaptation, farmers, agricul...",[Nepal is one of the four most vulnerable coun...
...,...,...,...,...,...
218,217,11,217_bhabar_gravel_siwalik_zone,"[bhabar, gravel, siwalik, zone, plain, recharg...",[The Siwalik foothill is bounded between the S...
219,218,11,218_springs_groundwater_mountainous_spring,"[springs, groundwater, mountainous, spring, te...",[Groundwater is the lifeline for the people re...
220,219,11,219_khola_erosion_bank_godavari,"[khola, erosion, bank, godavari, malekhu, stre...",[The fifth order Godavari Khola is flowing fro...
221,220,11,220_cleft_lip_palate_clp,"[cleft, lip, palate, clp, andor, anomalies, or...",[Introduction: Cleft lip and/or palate is a co...


In [31]:
model.get_topic(0)  # Replace 0 with any topic number


[('health', np.float64(0.05057329705882662)),
 ('insurance', np.float64(0.01715453991724435)),
 ('nepal', np.float64(0.012533815283378404)),
 ('tobacco', np.float64(0.0115049585956731)),
 ('facilities', np.float64(0.011192734055793215)),
 ('scheme', np.float64(0.009029017557415113)),
 ('medicines', np.float64(0.008709290150385006)),
 ('services', np.float64(0.008619419190805184)),
 ('care', np.float64(0.008490017073078766)),
 ('survey', np.float64(0.007973306536115415))]

In [32]:
# Assuming `topic_model` is your BERTopic model and `docs` is your list of documents

# Get topic info as a dataframe
topics_info = topic_model.get_topic_info()
print(topics_info)

# Loop to print topic keywords and counts
for topic_num in topics_info['Topic']:
    if topic_num == -1:
        continue  # Skip outlier topic
    keywords = topic_model.get_topic(topic_num)
    print(f"Topic {topic_num}: {[word for word, _ in keywords]}")


     Topic  Count                                             Name  \
0       -1    962                              -1_health_and_in_of   
1        0    179  0_poverty_reduction_inequality_multidimensional   
2        1    115                 1_education_school_schools_nepal   
3        2    107                   2_production_seed_farmers_cost   
4        3    106                 3_water_samples_drinking_quality   
..     ...    ...                                              ...   
219    218     11       218_springs_groundwater_mountainous_spring   
220    219     11                   219_bhabar_gravel_siwalik_zone   
221    220     11                         220_cleft_lip_palate_clp   
222    221     11             221_urban_fgc_farming_municipalities   
223    222     11                  222_nov_species_syn_butterflies   

                                        Representation  \
0    [health, and, in, of, to, the, on, nepal, wate...   
1    [poverty, reduction, inequality, multi

In [33]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fit BERTopic model
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)  # 'docs' is your list of abstracts or full texts


In [34]:
# Get topic info
topics_info = topic_model.get_topic_info()

# Show topic keywords
for topic_num in topics_info['Topic']:
    if topic_num == -1:
        continue
    keywords = topic_model.get_topic(topic_num)
    print(f"Topic {topic_num}: {[word for word, _ in keywords]}")


Topic 0: ['poverty', 'aids', 'hiv', 'inequality', 'households', 'economic', 'reduction', 'nepal', 'multidimensional', 'rural']
Topic 1: ['education', 'school', 'nepal', 'schools', 'inclusive', 'formal', 'policy', 'basic', 'quality', 'ece']
Topic 2: ['production', 'seed', 'farmers', 'cost', 'marketing', 'potato', 'nrs', 'vegetable', 'cultivation', 'benefit']
Topic 3: ['health', 'facilities', 'nepal', 'institutes', 'services', 'system', 'public', 'suggestions', 'facility', 'sector']
Topic 4: ['water', 'samples', 'drinking', 'coliform', 'quality', 'ph', 'conductivity', 'copper', 'mg', 'nitrate']
Topic 5: ['jats', '', '', '', '', '', '', '', '', '']
Topic 6: ['stunting', 'children', 'adolescents', 'motor', 'malnutrition', 'function', 'obesity', 'nutritional', 'girls', 'underweight']
Topic 7: ['text', 'full', 'available', 'stem', 'cells', 'in', 'unsolicited', 'solicited', 'tra', 'publishes']
Topic 8: ['remittance', 'foreign', 'employment', 'migration', 'remittances', 'migrants', 'consumptio