In [1]:
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

In [2]:
%run -i "../util/lang_utils.ipynb"

In [3]:
bbc_df = pd.read_csv("../data/bbc-text.csv")
print(bbc_df)

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


In [4]:
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1)
print(len(bbc_train))
print(len(bbc_test))

2002
223


In [5]:
documents = bbc_train['text'].values
#model = SentenceTransformer('distilbert-base-nli-mean-tokens')
#'all-MiniLM-L6-v2'
model = SentenceTransformer('all-MiniLM-L6-v2')
encoded_data = model.encode(documents)
km = KMeans(n_clusters=5, n_init='auto', init='k-means++')
km.fit(encoded_data)

In [6]:
print_most_common_words_by_cluster(documents, km, 5)

0
['said', 'people', 'new', 'also', 'mr', 'technology', 'would', 'one', 'mobile', 'could', 'users', 'music', 'use', 'software', 'us', 'net', 'digital', 'many', 'games', 'year', 'phone', 'uk', 'make', 'like', 'computer', 'service', 'get', 'world', 'time', 'online', 'internet', 'used', 'way', 'microsoft', 'broadband', 'tv', 'information', 'video', 'data', 'million', 'first', 'using', 'system', 'services', 'phones', 'security', 'two', 'number', 'says', 'work', 'firms', 'search', 'market', 'companies', 'industry', 'firm', 'content', 'game', 'according', 'last', 'media', 'much', 'networks', 'next', 'access', 'news', 'take', 'apple', 'research', 'network', 'around', 'web', 'pc', 'want', 'well', 'years', 'home', 'even', 'already', 'help', 'sony', 'made', 'company', 'going', 'site', 'sites', 'e-mail', 'see', 'players', 'devices', 'website', 'set', 'bbc', 'bt', 'different', 'radio', 'need', 'go', 'customers', 'consumers', 'found', 'able', 'europe', 'show', 'part', 'end', 'gaming', 'months', 'av

In [7]:
bbc_test["prediction"] = bbc_test["text"].apply(lambda x: km.predict(model.encode([x]))[0])
print(bbc_test)

           category                                               text  \
578           sport  isinbayeva heads for birmingham olympic pole v...   
2012  entertainment  boogeyman takes box office lead the low-budget...   
208   entertainment  public show for reynolds portrait sir joshua r...   
1078       politics  stalemate in pension strike talks talks aimed ...   
1229          sport  newcastle 2-1 bolton kieron dyer smashed home ...   
...             ...                                                ...   
1977       politics  row over  police  power for csos the police fe...   
1218       politics  whitehall cuts  ahead of target  thousands of ...   
880   entertainment  roundabout continues nostalgia trip the new bi...   
1552           tech  attack prompts bush site block the official re...   
278        business  soaring oil  hits world economy  the soaring c...   

      prediction  
578            1  
2012           2  
208            2  
1078           3  
1229           1

In [16]:
topic_mapping = {0:"tech", 1:"sport", 2:"entertainment", 3:"politics", 4:"business"}
bbc_test["pred_category"] = bbc_test["prediction"].apply(lambda x: topic_mapping[x])
print(classification_report(bbc_test["category"], bbc_test["pred_category"]))

               precision    recall  f1-score   support

     business       0.74      0.60      0.66        47
entertainment       0.43      0.57      0.49        46
     politics       0.73      0.77      0.75        31
        sport       0.37      0.41      0.39        49
         tech       0.87      0.66      0.75        50

     accuracy                           0.59       223
    macro avg       0.63      0.60      0.61       223
 weighted avg       0.62      0.59      0.60       223



In [17]:
new_example = """Manchester United players slumped to the turf 
at full-time in Germany on Tuesday in acknowledgement of what their 
latest pedestrian first-half display had cost them. The 3-2 loss at 
RB Leipzig means United will not be one of the 16 teams in the draw 
for the knockout stages of the Champions League. And this is not the 
only price for failure. The damage will be felt in the accounts, in 
the dealings they have with current and potentially future players 
and in the faith the fans have placed in manager Ole Gunnar Solskjaer. 
With Paul Pogba's agent angling for a move for his client and ex-United 
defender Phil Neville speaking of a "witchhunt" against his former team-mate 
Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United."""

In [18]:
predictions = km.predict(model.encode([new_example]))
print(predictions[0])

3
