In [25]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
%run -i "../util/lang_utils.ipynb"

In [3]:
# English stop-word list (`stopwords` is expected to come from the %run of
# lang_utils above), extended with two extra tokens to filter out of the articles.
stop_words = stopwords.words('english')
stop_words.extend(["said", "mr"])

# Load the BBC articles; the frame has a `category` and a `text` column.
bbc_df = pd.read_csv("../data/bbc-text.csv")

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


In [4]:
# Tokenize each article, drop stop words, and re-join the remaining tokens with
# single spaces, in one pass over the column.
# `stop_words` is a list, so `w not in stop_words` scans it linearly for every
# token; a set gives the same membership answer with a hash lookup.
stop_word_set = set(stop_words)
bbc_df["text"] = bbc_df["text"].apply(
    lambda text: " ".join(
        token for token in word_tokenize(text) if token not in stop_word_set
    )
)

In [5]:
# Hold out 10% of the articles for evaluation.
# The split is seeded so the same documents land in train/test on every run;
# without a seed each Restart & Run All evaluates on a different sample.
# NOTE(review): the BERTopic fit further down is not seeded here, so topic ids
# can presumably still change between runs -- re-check `topic_mapping` after refitting.
SPLIT_SEED = 42
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1, random_state=SPLIT_SEED)
print(len(bbc_train))
print(len(bbc_test))

2002
223


In [6]:
# Cleaned training texts as a NumPy array of strings; this is the corpus the
# topic model is fitted on.
docs = bbc_train["text"].values

In [7]:
# Fit BERTopic on the training corpus, asking it to reduce the result to 6 topics.
# In the run shown below that is the outlier topic -1 plus topics 0-4.
# `topics` holds the topic assigned to each training document and `probs` the
# accompanying probabilities (neither is used again before `topics` is reassigned).
# NOTE(review): no seed is passed to the model, so topic ids may differ between runs.
topic_model = BERTopic(nr_topics=6)
topics, probs = topic_model.fit_transform(docs)

In [8]:
# Overview table: one row per topic with its document count, generated name,
# top terms and representative documents.
print(topic_model.get_topic_info())

   Topic  Count                                 Name  \
0     -1    222             -1_also_company_china_us   
1      0    463             0_england_game_win_first   
2      1    393      1_would_labour_government_blair   
3      2    321             2_film_best_music_awards   
4      3    309  3_people_mobile_technology_software   
5      4    294             4_us_year_growth_economy   

                                      Representation  \
0  [also, company, china, us, would, year, new, p...   
1  [england, game, win, first, club, world, playe...   
2  [would, labour, government, blair, election, p...   
3  [film, best, music, awards, show, year, band, ...   
4  [people, mobile, technology, software, digital...   
5  [us, year, growth, economy, economic, company,...   

                                 Representative_Docs  
0  [us retail sales surge december us retail sale...  
1  [ireland win eclipses refereeing errors intern...  
2  [lib dems unveil election slogan liberal democ

In [9]:
# Top (term, weight) pairs for topic 0.
print(topic_model.get_topic(0))

[('england', 0.023923609275000306), ('game', 0.023874910540888444), ('win', 0.02089139078895572), ('first', 0.019051267767033135), ('club', 0.017470682428724963), ('world', 0.017352529044283988), ('players', 0.016899062892940703), ('cup', 0.016893627391621816), ('last', 0.01665891773908297), ('two', 0.016416826185772164)]


In [10]:
# Top (term, weight) pairs for topic 1.
print(topic_model.get_topic(1))

[('would', 0.035433209961128954), ('labour', 0.03291644460816451), ('government', 0.02936700096809325), ('blair', 0.02702878364855685), ('election', 0.026944432307687366), ('party', 0.02566497689534048), ('people', 0.023100026902541076), ('brown', 0.02021005312555692), ('minister', 0.020034108662757368), ('also', 0.01606611985845893)]


In [11]:
# Top (term, weight) pairs for topic 2.
print(topic_model.get_topic(2))

[('film', 0.04996542271015715), ('best', 0.03954338090393644), ('music', 0.02608204595923062), ('awards', 0.022660883747149464), ('show', 0.019911582970118826), ('year', 0.019569114192597062), ('band', 0.01947269693504342), ('also', 0.01936668159798611), ('award', 0.019271695654981706), ('one', 0.018568435901997235)]


In [12]:
# Top (term, weight) pairs for topic 3.
print(topic_model.get_topic(3))

[('people', 0.029907130205424383), ('mobile', 0.024988214218395956), ('technology', 0.02246648997447284), ('software', 0.01907466920671406), ('digital', 0.01846667229083264), ('music', 0.018089961948297602), ('users', 0.01779304168540416), ('one', 0.017635710770530333), ('also', 0.017408837514157897), ('new', 0.01738824388413855)]


In [13]:
# Top (term, weight) pairs for topic 4.
print(topic_model.get_topic(4))

[('us', 0.03584149650622307), ('year', 0.02528335130307409), ('growth', 0.024390583399937417), ('economy', 0.021929580756366116), ('economic', 0.019792138393691337), ('company', 0.019580005023360762), ('yukos', 0.018906513989042615), ('market', 0.01801654052491897), ('oil', 0.017884696942644544), ('firm', 0.01642415825482552)]


In [31]:
# Build one label per topic: the topic id followed by its top 5 terms,
# joined with underscores (see the list displayed below).
topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, separator='_')

['-1_also_company_china_us_would',
 '0_england_game_win_first_club',
 '1_would_labour_government_blair_election',
 '2_film_best_music_awards_show',
 '3_people_mobile_technology_software_digital',
 '4_us_year_growth_economy_economic']

In [16]:
def get_prediction(input_text, model):
    """Return the first topic that `model` predicts for `input_text`.

    `model.transform(input_text)` is expected to return an indexable whose
    first element is the sequence of predicted topics; the first entry of
    that sequence is returned.
    """
    predicted_topics = model.transform(input_text)[0]
    return predicted_topics[0]

In [17]:
# Predict a topic for every held-out article with a single transform() call.
# BERTopic.transform accepts a list of documents, so the whole test split is
# processed as one batch instead of one separate model call per row.
predicted_topics = topic_model.transform(bbc_test["text"].tolist())[0]
bbc_test["prediction"] = list(predicted_topics)

# Hand-written mapping from topic id to BBC category, read off the topic terms
# printed above; -1 is BERTopic's outlier topic and is tagged "discard".
# The ids are only valid for the fit they were read from -- re-check after refitting.
topic_mapping = {0:"sport", 1:"politics", 2:"entertainment", 3:"tech", 4:"business", -1:"discard"}

In [26]:
# Translate numeric topic ids into category names (raises KeyError for an id
# that is missing from topic_mapping).
bbc_test["pred_category"] = bbc_test["prediction"].map(
    lambda topic_id: topic_mapping[topic_id]
)

# Score only the articles that received a real topic: rows predicted as the
# outlier topic (-1) are left out, so the report covers a subset of the test split.
test_data = bbc_test[bbc_test["prediction"].ne(-1)]
print(classification_report(test_data["category"], test_data["pred_category"]))

               precision    recall  f1-score   support

     business       0.95      0.86      0.90        21
entertainment       0.97      1.00      0.98        30
     politics       0.94      1.00      0.97        46
        sport       1.00      1.00      1.00        62
         tech       0.96      0.88      0.92        25

     accuracy                           0.97       184
    macro avg       0.96      0.95      0.95       184
 weighted avg       0.97      0.97      0.97       184



In [27]:
# Take the first article of the test split (already stop-word filtered) as a
# single example document and show its text.
new_input = bbc_test["text"].iloc[0]
print(new_input)

howard dismisses tory tax fears michael howard dismissed fears conservatives plans £4bn tax cuts modest . defended package saying plan tories first budget hoped able go . tories monday highlighted £35bn wasteful spending would stop allow tax cuts reduced borrowing spending key services . labour liberal democrats say party sums add claim would cut frontline services . tory tax plan follows complaints party mps howard shadow chancellor oliver letwin taken long unveil proposals . promised figure yet reveal taxes would targeted . tory backbencher edward leigh proposals step right direction told financial times : would come sooner much greater tax cuts . interviewed bbc radio 2 jeremy vine show howard : perfectly true attacked one side people think ought promising much much bigger tax cuts spending cuts . side people say able achieve tax cuts . think got right . howard voters faced clear choice next election waste tax labour tory value money lower taxes . added : would like able time sure a

In [28]:
# Run the fitted model on the single example; the printed pair is the predicted
# topic list and the accompanying probability array.
print(topic_model.transform(new_input))

([1], array([1.]))


In [14]:
# Rank the topics by similarity to the search term "sports" and print them
# as (topic id, similarity) pairs.
topics, similarity = topic_model.find_topics("sports", top_n=5)
sim_topics = [(topic_id, score) for topic_id, score in zip(topics, similarity)]
print(sim_topics)

[(0, 0.29033981040460977), (3, 0.049293092462828376), (-1, -0.0047265937178774895), (2, -0.02074380026102955), (4, -0.03699168959416969)]


In [32]:
# Same search as for "sports", this time with a multi-word query.
topics, similarity = topic_model.find_topics("business and economics", top_n=5)
sim_topics = [(topic_id, score) for topic_id, score in zip(topics, similarity)]
print(sim_topics)

[(4, 0.29003573983158404), (-1, 0.26259758927249205), (3, 0.15627005753581313), (1, 0.05491237184012845), (0, 0.010567363445904386)]


In [33]:
# Use a full two-line passage as the query and list the topics ranked by
# similarity to it, as (topic id, similarity) pairs.
input_text = """YouTube removed a snippet of code that publicly disclosed whether a channel receives ad payouts, 
obscuring which creators benefit most from the platform."""
topics, similarity = topic_model.find_topics(input_text, top_n=5)
sim_topics = [(topic_id, score) for topic_id, score in zip(topics, similarity)]
print(sim_topics)

[(3, 0.2540850599909866), (-1, 0.172097560474608), (2, 0.1367798346494483), (4, 0.10243553209139492), (1, 0.06954579004136925)]
