# Topic modelling using BERTopic

## Libraries/data required

In [2]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
# Read the data and perform preprocessing

# Load the cleaned article summaries; the "date" column is parsed into datetimes.
df = pd.read_csv(
    "data/articles_summary_cleaned.csv",
    parse_dates=["date"],
)
print(df.shape)  # (rows, columns) of the loaded frame

# Plain Python list of every article summary; this is the corpus passed to the topic models.
docs = df["summary"].to_list()

# Preview the first five rows.
df.head(5)

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [4]:
# Baseline model: fit and persist it only when no saved copy exists on disk,
# otherwise reuse the saved copy.
if not os.path.exists('southsudan_model'):
    bertopic = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
    )
    bertopic.fit_transform(docs)      # fit on the list of article summaries
    bertopic.save("southsudan_model")  # persist for later runs
else:
    bertopic = BERTopic.load('southsudan_model')

In [4]:
# BERTopic is modular and several of its steps involve randomness, which hinders reproducibility.
# To counter this you can, for example, fix the random state of the dimensionality-reduction step
# with the following lines, or explore a different approach.

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

# changing bertopic

## standard

In [30]:
from bertopic import BERTopic
from umap import UMAP

# UMAP with a fixed seed; this instance is reused by the later cells that pass umap_model=umap_model.
umap_model = UMAP(random_state=42)

# Fit and save as "southsudan_model2" only when that path does not exist yet;
# otherwise load the saved model.
if not os.path.exists('southsudan_model2'):
    bertopic_standard = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopic_standard.fit_transform(docs)
    bertopic_standard.save("southsudan_model2")
else:
    bertopic_standard = BERTopic.load('southsudan_model2')

In [31]:
# Display the topic overview table (Topic, Count, Name, Representation, Representative_Docs).
bertopic_standard.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6559,-1_the_and_of_to,"[the, and, of, to, in, south, sudan, article, ...",[The article discusses the United States' deep...
1,0,377,0_food_wfp_million_hunger,"[food, wfp, million, hunger, people, famine, a...",[The article discusses how USAID has provided ...
2,1,355,1_kiir_his_author_president,"[kiir, his, author, president, salva, kiirs, c...",[The article discusses the political crisis in...
3,2,331,2_abyei_referendum_ngok_area,"[abyei, referendum, ngok, area, misseriya, din...",[The article discusses the African Union Peace...
4,3,227,3_million_aid_humanitarian_funding,"[million, aid, humanitarian, funding, people, ...",[The article discusses the US President author...
...,...,...,...,...,...
241,240,10,240_kalaazar_disease_azar_kala,"[kalaazar, disease, azar, kala, flies, cases, ...",[The article discusses the outbreak of Kala-az...
242,241,10,241_refugees_batil_hygiene_respiratory,"[refugees, batil, hygiene, respiratory, nile, ...",[The article discusses the alarming health sit...
243,242,10,242_comic_defections_conflict_salva,"[comic, defections, conflict, salva, political...",[The article discusses the ongoing conflict be...
244,243,10,243_children_unicef_families_app,"[children, unicef, families, app, million, str...",[The article discusses how violence and insecu...


## PCA dimensionality reduction

In [None]:
from sklearn.decomposition import PCA
# PCA (300 components, fixed seed) is passed in through the umap_model argument.
pca_model = PCA(n_components=300, random_state=42)

# NOTE: unlike the other model cells, this one has no load-if-saved guard:
# it refits and overwrites "southsudan_model3" every time it runs.
bertopic_PCA = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    umap_model=pca_model,
)
bertopic_PCA.fit_transform(docs)
bertopic_PCA.save("southsudan_model3")

bertopic_PCA.get_topic_info()

## KMeans clustering

In [None]:
from sklearn.cluster import KMeans
# K-Means (250 clusters, fixed seed) is passed in through the hdbscan_model argument.
kmeans_model = KMeans(random_state=42, n_clusters=250)

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model4'):
    bertopic_KM = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        hdbscan_model=kmeans_model,
    )
    bertopic_KM.fit_transform(docs)
    bertopic_KM.save("southsudan_model4")
else:
    bertopic_KM = BERTopic.load('southsudan_model4')

bertopic_KM.get_topic_info()

## PCA with KMeans

In [None]:
# PCA (50 components) for the reduction step and K-Means (50 clusters) for clustering, both seeded.
pca_model = PCA(n_components=50, random_state=42)
kmeans_model = KMeans(random_state=42, n_clusters=50)

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model5'):
    bertopic_PCA_KM = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        hdbscan_model=kmeans_model,
    )
    bertopic_PCA_KM.fit_transform(docs)
    bertopic_PCA_KM.save("southsudan_model5")
else:
    bertopic_PCA_KM = BERTopic.load('southsudan_model5')

bertopic_PCA_KM.get_topic_info()

## change the range of n-grams

In [None]:
# Standard (seeded-UMAP) pipeline, with topic terms drawn from 1- to 3-grams.
# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model6'):
    bertopic_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        n_gram_range=(1, 3),
    )
    bertopic_NG.fit_transform(docs)
    bertopic_NG.save("southsudan_model6")
else:
    bertopic_NG = BERTopic.load('southsudan_model6')

bertopic_NG.get_topic_info()

## PCA and n-gram

In [None]:
# PCA (150 components, fixed seed) is passed in through the umap_model argument.
pca_model = PCA(n_components=150, random_state=42)

# Fit with a 1- to 3-gram range and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model7'):
    bertopic_PCA_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        n_gram_range=(1, 3),
    )
    bertopic_PCA_NG.fit_transform(docs)
    bertopic_PCA_NG.save("southsudan_model7")
else:
    bertopic_PCA_NG = BERTopic.load('southsudan_model7')

bertopic_PCA_NG.get_topic_info()

## KMeans and n-gram

In [None]:
# K-Means (150 clusters, fixed seed) is passed in through the hdbscan_model argument.
kmeans_model = KMeans(random_state=42, n_clusters=150)

# Fit with a 1- to 3-gram range and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model8'):
    bertopic_KM_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        hdbscan_model=kmeans_model,
        n_gram_range=(1, 3),
    )
    bertopic_KM_NG.fit_transform(docs)
    bertopic_KM_NG.save("southsudan_model8")
else:
    bertopic_KM_NG = BERTopic.load('southsudan_model8')

bertopic_KM_NG.get_topic_info()

## PCA and KMeans and n-gram

In [None]:
# PCA (50 components) for the reduction step and K-Means (50 clusters) for clustering, both seeded.
pca_model = PCA(n_components=50, random_state=42)
kmeans_model = KMeans(random_state=42, n_clusters=50)

# Fit with a 1- to 3-gram range and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model9'):
    bertopic_PCA_KM_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        hdbscan_model=kmeans_model,
        n_gram_range=(1, 3),
    )
    bertopic_PCA_KM_NG.fit_transform(docs)
    bertopic_PCA_KM_NG.save("southsudan_model9")
else:
    bertopic_PCA_KM_NG = BERTopic.load('southsudan_model9')

bertopic_PCA_KM_NG.get_topic_info()

# remove stopwords

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count vectorizer that drops scikit-learn's built-in English stop words from the topic terms.
vectorizer_model = CountVectorizer(stop_words="english")

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model10'):
    bertopic_VM = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
    )
    bertopic_VM.fit_transform(docs)
    bertopic_VM.save("southsudan_model10")
else:
    bertopic_VM = BERTopic.load('southsudan_model10')

bertopic_VM.get_topic_info()

## remove stopwords and KM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop-word-removing vectorizer plus K-Means (150 clusters, fixed seed) as the clustering model.
kmeans_model = KMeans(random_state=42, n_clusters=150)
vectorizer_model = CountVectorizer(stop_words="english")

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model11'):
    bertopic_VM_KM = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        hdbscan_model=kmeans_model,
        vectorizer_model=vectorizer_model,
    )
    bertopic_VM_KM.fit_transform(docs)
    bertopic_VM_KM.save("southsudan_model11")
else:
    bertopic_VM_KM = BERTopic.load('southsudan_model11')

bertopic_VM_KM.get_topic_info()

## remove stopwords PCA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop-word-removing vectorizer plus PCA (150 components, fixed seed) as the reduction model.
vectorizer_model = CountVectorizer(stop_words="english")
pca_model = PCA(n_components=150, random_state=42)

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model18'):
    bertopic_VM_PCA = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        vectorizer_model=vectorizer_model,
    )
    bertopic_VM_PCA.fit_transform(docs)
    bertopic_VM_PCA.save("southsudan_model18")
else:
    bertopic_VM_PCA = BERTopic.load('southsudan_model18')

bertopic_VM_PCA.get_topic_info()

## remove stopwords and PCA and KM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Stop-word-removing vectorizer, PCA (50 components) and K-Means (50 clusters), all seeded where applicable.
vectorizer_model = CountVectorizer(stop_words="english")
pca_model = PCA(n_components=50, random_state=42)
kmeans_model = KMeans(random_state=42, n_clusters=50)

# Fit and persist only when no saved copy exists; otherwise load it.
if not os.path.exists('southsudan_model12'):
    bertopic_VM_KM_PCA = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        hdbscan_model=kmeans_model,
        vectorizer_model=vectorizer_model,
    )
    bertopic_VM_KM_PCA.fit_transform(docs)
    bertopic_VM_KM_PCA.save("southsudan_model12")
else:
    bertopic_VM_KM_PCA = BERTopic.load('southsudan_model12')

bertopic_VM_KM_PCA.get_topic_info()

## remove stopwords and KM and n-gram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
kmeans_model = KMeans(n_clusters=150, random_state=42)

if os.path.exists('southsudan_model16'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_KM_NG = BERTopic.load('southsudan_model16')
else:
    bertopic_VM_KM_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        hdbscan_model=kmeans_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
    )
    bertopic_VM_KM_NG.fit_transform(docs)
    bertopic_VM_KM_NG.save("southsudan_model16")

bertopic_VM_KM_NG.get_topic_info()

## remove stopwords and PCA and n-gram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
pca_model = PCA(random_state=42, n_components=150)

# NOTE: unlike most model cells, this one has no load-if-saved guard:
# it refits and overwrites "southsudan_model19" every time it runs.
bertopic_VM_PCA_NG = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    vectorizer_model=vectorizer_model,
    umap_model=pca_model,
    n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
)
bertopic_VM_PCA_NG.fit_transform(docs)
bertopic_VM_PCA_NG.save("southsudan_model19")

bertopic_VM_PCA_NG.get_topic_info()

## remove stopwords and PCA and KM and n-gram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
pca_model = PCA(random_state=42, n_components=50)
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
kmeans_model = KMeans(n_clusters=50, random_state=42)

if os.path.exists('southsudan_model13'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_KM_PCA_NG = BERTopic.load('southsudan_model13')
else:
    bertopic_VM_KM_PCA_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=pca_model,
        vectorizer_model=vectorizer_model,
        hdbscan_model=kmeans_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
    )
    bertopic_VM_KM_PCA_NG.fit_transform(docs)
    bertopic_VM_KM_PCA_NG.save("southsudan_model13")

bertopic_VM_KM_PCA_NG.get_topic_info()

## standard model, remove stopwords and n-gram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))

if os.path.exists('southsudan_model14'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_NG = BERTopic.load('southsudan_model14')
else:
    bertopic_VM_NG = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
    )
    bertopic_VM_NG.fit_transform(docs)
    bertopic_VM_NG.save("southsudan_model14")

bertopic_VM_NG.get_topic_info()

## standard model, remove stopwords and n-gram, diversify

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
# Maximal Marginal Relevance representation with diversity 0.7.
representation_model = MaximalMarginalRelevance(diversity=0.7)

if os.path.exists('southsudan_model15'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_NG_D = BERTopic.load('southsudan_model15')
else:
    bertopic_VM_NG_D = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
        representation_model=representation_model,
    )
    bertopic_VM_NG_D.fit_transform(docs)
    bertopic_VM_NG_D.save("southsudan_model15")

bertopic_VM_NG_D.get_topic_info()

## remove stopwords and KM and n-gram, diversify

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
kmeans_model = KMeans(n_clusters=50, random_state=42)
# Maximal Marginal Relevance representation with diversity 0.7.
representation_model = MaximalMarginalRelevance(diversity=0.7)

if os.path.exists('southsudan_model17'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_KM_NG_D = BERTopic.load('southsudan_model17')
else:
    bertopic_VM_KM_NG_D = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        hdbscan_model=kmeans_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
        representation_model=representation_model,
    )
    bertopic_VM_KM_NG_D.fit_transform(docs)
    bertopic_VM_KM_NG_D.save("southsudan_model17")

bertopic_VM_KM_NG_D.get_topic_info()

## remove stopwords and PCA and n-gram, diversify

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# BERTopic ignores its own n_gram_range argument when a custom vectorizer_model is
# supplied, so the 1- to 3-gram range has to be set on the CountVectorizer itself.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
pca_model = PCA(random_state=42, n_components=150)
# NOTE(review): diversity is 1 here but 0.7 in the other "diversify" cells; confirm this is intended.
representation_model = MaximalMarginalRelevance(diversity=1)

if os.path.exists('southsudan_model20'):
    # NOTE: a copy saved before the ngram_range fix was fitted on unigrams only;
    # delete the saved model to force a refit with n-grams.
    bertopic_VM_PCA_NG_D = BERTopic.load('southsudan_model20')
else:
    bertopic_VM_PCA_NG_D = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        vectorizer_model=vectorizer_model,
        umap_model=pca_model,
        n_gram_range=(1, 3),  # kept in sync with the vectorizer's ngram_range
        representation_model=representation_model,
    )
    bertopic_VM_PCA_NG_D.fit_transform(docs)
    bertopic_VM_PCA_NG_D.save("southsudan_model20")

bertopic_VM_PCA_NG_D.get_topic_info()

## wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    """Draw a word cloud for a single topic of a topic model.

    Parameters:
        model: object whose ``get_topic(topic)`` yields (word, value) pairs
        topic: identifier of the topic to draw

    The cloud is shown with matplotlib (white background, axes hidden);
    nothing is returned.
    """
    # Word -> value mapping, used as the frequencies that size the words.
    frequencies = dict(model.get_topic(topic))

    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Show the word cloud for topic 3 of the bertopic_VM_NG_D model
create_wordcloud(bertopic_VM_NG_D, topic=3)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [None]:
# Uses the baseline `bertopic` model and the full list of article summaries (`docs`).
bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [None]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve the top_n topics most relevant to the provided keyword(s).

    Parameters:
        bertopic_model: a (fitted) BERTopic model object; only its
                        find_topics(keyword, top_n=...) method is used.

        keywords:   a string containing one or multiple keywords to match against,
                    or a list of such strings: ['keyword(s)', 'keyword(s)', ...]

                    For a list, up to top_n topics are looked up per list element;
                    the combined result is de-duplicated and cut to the top_n
                    most relevant topics.

                    !!!
                    Relevancy is only evaluated per list element, never against
                    the combined list of keywords.

                    A topic in the output may therefore be strongly related to one
                    element and unrelated to all others, while a topic that is
                    moderately related to the list as a whole (but not strongly to
                    any single element) may be missing.
                    !!!

        top_n: an integer, the number of relevant topics to retrieve

    Return: a list of at most top_n (topic_id, relevancy) tuples, sorted by
            descending relevancy. A topic matched by several keywords appears
            once, with its highest relevancy.
    '''

    # Accept a bare string as a one-element keyword list.
    if isinstance(keywords, str):
        keywords = [keywords]

    candidates = []  # (topic_id, relevancy) pairs collected over all keywords
    for keyword in keywords:
        # find_topics returns (topic_ids, relevancies) for the current keyword(s)
        topic_ids, relevancies = bertopic_model.find_topics(keyword, top_n=top_n)
        candidates.extend(zip(topic_ids, relevancies))

    # Sort ASCENDING first: dict() keeps the last value seen per key, so every
    # topic ends up with its greatest relevancy.
    candidates.sort(key=lambda pair: pair[1])
    unique_topics = dict(candidates)

    ranked = sorted(unique_topics.items(), key=lambda pair: pair[1], reverse=True)

    # Honour top_n (previously hard-coded to 10 regardless of the argument).
    return ranked[:top_n]

## create dfs for the different models

In [None]:
def topics_related_to_keywords(df, model, keywords_list, end_keyword, top_n=10):
    """Flag every document whose assigned topic is relevant to the given keywords.

    Parameters:
        df: DataFrame that receives the new column; it is modified in place.
            Assumed to have one row per entry of model.topics_, in the same
            order -- TODO confirm against the caller.
        model: topic model passed on to get_relevant_topics; its topics_
               attribute supplies the per-document topic ids
        keywords_list: keyword string or list of keyword strings
        end_keyword: name of the boolean column to add to df
        top_n: number of relevant topics to consider (default 10)

    Return: the same df object, with the boolean column end_keyword added.
    """
    relevant_topics = get_relevant_topics(bertopic_model=model, keywords=keywords_list, top_n=top_n)

    # Only the topic ids are needed for the membership test.
    topic_ids = {topic_id for topic_id, _ in relevant_topics}

    # True where the document's topic is one of the relevant topics.
    df[end_keyword] = [topic in topic_ids for topic in model.topics_]

    # The topic-info lookup that used to follow was removed: its result was
    # discarded, so inside a function it displayed nothing.
    return df

# Registry of every fitted model variant, keyed by the variable name it was
# created under; the key is later used as the name of the exported CSV file.
models_dict = dict(
    bertopic_standard=bertopic_standard,
    bertopic_PCA=bertopic_PCA,
    bertopic_KM=bertopic_KM,
    bertopic_PCA_KM=bertopic_PCA_KM,
    bertopic_NG=bertopic_NG,
    bertopic_PCA_NG=bertopic_PCA_NG,
    bertopic_KM_NG=bertopic_KM_NG,
    bertopic_PCA_KM_NG=bertopic_PCA_KM_NG,
    bertopic_VM=bertopic_VM,
    bertopic_VM_KM=bertopic_VM_KM,
    bertopic_VM_KM_PCA=bertopic_VM_KM_PCA,
    bertopic_VM_KM_NG=bertopic_VM_KM_NG,
    bertopic_VM_KM_PCA_NG=bertopic_VM_KM_PCA_NG,
    bertopic_VM_NG=bertopic_VM_NG,
    bertopic_VM_NG_D=bertopic_VM_NG_D,
    bertopic_VM_KM_NG_D=bertopic_VM_KM_NG_D,
    bertopic_VM_PCA=bertopic_VM_PCA,
    bertopic_VM_PCA_NG=bertopic_VM_PCA_NG,
    bertopic_VM_PCA_NG_D=bertopic_VM_PCA_NG_D,
)

# Output column name -> keywords used to find the topics relevant to it.
keyw = {"hunger": ['hunger', 'food insecurity'],
        "refugees": ['refugees', 'displaced'],
        "humanitarian": ["humanitarian"],
        "conflict": ['conflict', 'fighting', 'murder']}
keys_list = list(keyw.keys())

# Read the articles once; every model gets its own copy to label.
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])

for name, mod in models_dict.items():
    # Work on a copy so that neither the source frame nor the notebook-level
    # `df` is overwritten by this loop.
    df_merge = original_df.copy()

    # Add one boolean column per keyword group.
    for keyword, keywlist in keyw.items():
        df_merge = topics_related_to_keywords(df_merge, mod, keywlist, keyword)

    # The labelled copy already holds all original columns followed by the
    # keyword columns. The former merge on "summary" is dropped because it
    # multiplies rows whenever summaries are duplicated or missing.
    df_merge.to_csv("data/"+str(name)+".csv", index=False)

In [None]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
# Top 10 topics of the standard model related to 'hunger' / 'food insecurity'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic_standard,
    keywords=['hunger', 'food insecurity'],
    top_n=10,
)

# Collect the topic ids while printing each (topic_id, relevancy) pair
topic_ids = []
for topic_id, relevancy in relevant_topics:
    topic_ids.append(topic_id)
    print(topic_id, relevancy)

# Boolean column: True where the document's topic is among the relevant topics
df["hunger"] = [topic in topic_ids for topic in bertopic_standard.topics_]

# Count, Name, Representation and Representative_Docs of the relevant topics
(
    bertopic_standard
    .get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

In [None]:
# Get the top 10 topics related to the keywords 'refugees' and 'displaced'
# Top 10 topics of the standard model related to 'refugees' / 'displaced'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic_standard,
    keywords=['refugees', 'displaced'],
    top_n=10,
)

# Collect the topic ids while printing each (topic_id, relevancy) pair
topic_ids = []
for topic_id, relevancy in relevant_topics:
    topic_ids.append(topic_id)
    print(topic_id, relevancy)

# Boolean column: True where the document's topic is among the relevant topics
df["refugees"] = [topic in topic_ids for topic in bertopic_standard.topics_]

# Count, Name, Representation and Representative_Docs of the relevant topics
(
    bertopic_standard
    .get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

In [None]:
# Get the top 10 topics related to the keyword 'humanitarian'
# Top 10 topics of the standard model related to 'humanitarian'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic_standard,
    keywords=['humanitarian'],
    top_n=10,
)

# Collect the topic ids while printing each (topic_id, relevancy) pair
topic_ids = []
for topic_id, relevancy in relevant_topics:
    topic_ids.append(topic_id)
    print(topic_id, relevancy)

# Boolean column: True where the document's topic is among the relevant topics
df["humanitarian"] = [topic in topic_ids for topic in bertopic_standard.topics_]

# Count, Name, Representation and Representative_Docs of the relevant topics
(
    bertopic_standard
    .get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

In [None]:
# Get the top 10 topics related to the keywords 'conflict', 'fighting', and 'murder'
# Top 10 topics of the standard model related to 'conflict' / 'fighting' / 'murder'
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic_standard,
    keywords=['conflict', 'fighting', 'murder'],
    top_n=10,
)

# Collect the topic ids while printing each (topic_id, relevancy) pair
topic_ids = []
for topic_id, relevancy in relevant_topics:
    topic_ids.append(topic_id)
    print(topic_id, relevancy)

# Boolean column: True where the document's topic is among the relevant topics
df["conflict"] = [topic in topic_ids for topic in bertopic_standard.topics_]

# Count, Name, Representation and Representative_Docs of the relevant topics
(
    bertopic_standard
    .get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)