In [1]:
%run -i "../util/util_simple_classifier.ipynb"
%run -i "../util/lang_utils.ipynb"

In [13]:
from nltk import word_tokenize
from sklearn.cluster import KMeans
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from joblib import dump, load

In [3]:
# Load dataset
train_dataset = load_dataset("SetFit/bbc-news", split="train")
test_dataset = load_dataset("SetFit/bbc-news", split="test")
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
print(train_df)
print(test_df)

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


                                                   text  label     label_text
0     wales want rugby league training wales could f...      2          sport
1     china aviation seeks rescue deal scandal-hit j...      1       business
2     rock band u2 break ticket record u2 have smash...      3  entertainment
3     markets signal brazilian recovery the brazilia...      1       business
4     tough rules for ringtone sellers firms that fl...      0           tech
...                                                 ...    ...            ...
1220  us economy shows solid gdp growth the us econo...      1       business
1221  microsoft releases bumper patches microsoft ha...      0           tech
1222  stuart joins norwich from addicks norwich have...      2          sport
1223  why few targets are better than many the econo...      1       business
1224  boothroyd calls for lords speaker betty boothr...      4       politics

[1225 rows x 3 columns]
                                       

In [4]:
# See the distribution of classes
print(train_df.groupby('label_text').count())
print(test_df.groupby('label_text').count())

               text  label
label_text                
business        286    286
entertainment   210    210
politics        242    242
sport           275    275
tech            212    212
               text  label
label_text                
business        224    224
entertainment   176    176
politics        175    175
sport           236    236
tech            189    189


In [5]:
# Combine train and test dataframes and create a better train/test split
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(combined_df["text"], combined_df["label"]))
train_df = combined_df[combined_df.index.isin(train_index)].copy()
test_df = combined_df[combined_df.index.isin(test_index)].copy()
print(train_df.groupby('label_text').count())
print(test_df.groupby('label_text').count())

               text  label
label_text                
business        408    408
entertainment   309    309
politics        333    333
sport           409    409
tech            321    321
               text  label
label_text                
business        102    102
entertainment    77     77
politics         84     84
sport           102    102
tech             80     80


In [6]:
# Preprocess the data
train_df = tokenize(train_df, "text")
train_df = remove_stopword_punct(train_df, "text_tokenized")
test_df = tokenize(test_df, "text")
test_df = remove_stopword_punct(test_df, "text_tokenized")
print(train_df)
print(test_df)

                                                   text  label     label_text  \
0     wales want rugby league training wales could f...      2          sport   
1     china aviation seeks rescue deal scandal-hit j...      1       business   
2     rock band u2 break ticket record u2 have smash...      3  entertainment   
3     markets signal brazilian recovery the brazilia...      1       business   
4     tough rules for ringtone sellers firms that fl...      0           tech   
...                                                 ...    ...            ...   
2217  soros group warns of kazakh close the open soc...      1       business   
2218  election  could be terror target  terrorists m...      4       politics   
2219  lifestyle  governs mobile choice  faster  bett...      0           tech   
2220  mobile multimedia slow to catch on there is no...      0           tech   
2221  owen determined to stay in madrid england forw...      2          sport   

                           

In [7]:
# Get the training data and create the vectorizer
train_df["text_clean"] = train_df["text_tokenized"].apply(lambda x: " ".join(list(x)))
test_df["text_clean"] = test_df["text_tokenized"].apply(lambda x: " ".join(list(x)))
train_df.to_json("../data/bbc_train.json")
test_df.to_json("../data/bbc_test.json")
vec = TfidfVectorizer(ngram_range=(1,3))
matrix = vec.fit_transform(train_df["text_clean"])

In [8]:
# Cluster the data
km = KMeans(n_clusters=5, n_init=10)
km.fit(matrix)

In [9]:
def get_most_frequent_words(text, num_words):
    word_list = word_tokenize(text)
    freq_dist = FreqDist(word_list)
    top_words = freq_dist.most_common(num_words)
    top_words = [word[0] for word in top_words]
    return top_words

In [10]:
def print_most_common_words_by_cluster(input_df, km, num_clusters):
    clusters = km.labels_.tolist()
    input_df["cluster"] = clusters
    for cluster in range(0, num_clusters):
        this_cluster_text = input_df[input_df['cluster'] == cluster]
        all_text = " ".join(this_cluster_text['text_clean'].astype(str))
        top_200 = get_most_frequent_words(all_text, 200)
        print(cluster)
        print(top_200)
    return input_df

In [11]:
print_most_common_words_by_cluster(train_df, km, 5)

0
['said', 'mr', 'would', 'us', 'year', 'government', 'also', 'new', 'people', 'could', 'last', 'one', 'uk', 'market', 'years', 'growth', 'company', '000', 'economy', 'two', 'told', 'world', 'bank', 'may', 'economic', 'sales', 'time', 'however', 'first', 'minister', 'country', 'european', 'made', 'firm', 'chief', 'law', 'public', 'many', 'deal', '2004', 'expected', 'eu', 'china', 'three', 'since', 'business', 'next', 'prices', 'says', 'work', 'say', 'group', 'make', 'months', 'london', 'added', 'shares', 'plans', 'still', 'party', 'countries', 'executive', 'foreign', 'report', 'take', 'rise', 'lord', 'set', 'bbc', 'companies', 'much', 'news', 'back', 'week', 'oil', 'state', 'financial', 'figures', 'europe', 'firms', 'general', 'british', 'december', 'well', 'secretary', 'police', 'home', 'jobs', 'president', 'part', 'get', 'spending', 'spokesman', 'increase', 'trade', 'rate', 'stock', 'interest', 'blair', 'analysts', 'move', 'budget', 'rights', 'way', 'money', 'dollar', 'decision', 'ho

Unnamed: 0,text,label,label_text,text_tokenized,text_clean,cluster
0,wales want rugby league training wales could f...,2,sport,"[wales, want, rugby, league, training, wales, ...",wales want rugby league training wales could f...,1
1,china aviation seeks rescue deal scandal-hit j...,1,business,"[china, aviation, seeks, rescue, deal, scandal...",china aviation seeks rescue deal scandal-hit j...,0
2,rock band u2 break ticket record u2 have smash...,3,entertainment,"[rock, band, u2, break, ticket, record, u2, sm...",rock band u2 break ticket record u2 smashed ir...,2
3,markets signal brazilian recovery the brazilia...,1,business,"[markets, signal, brazilian, recovery, brazili...",markets signal brazilian recovery brazilian st...,0
4,tough rules for ringtone sellers firms that fl...,0,tech,"[tough, rules, ringtone, sellers, firms, flout...",tough rules ringtone sellers firms flout rules...,4
...,...,...,...,...,...,...
2217,soros group warns of kazakh close the open soc...,1,business,"[soros, group, warns, kazakh, close, open, soc...",soros group warns kazakh close open society in...,0
2218,election could be terror target terrorists m...,4,politics,"[election, could, terror, target, terrorists, ...",election could terror target terrorists might ...,3
2219,lifestyle governs mobile choice faster bett...,0,tech,"[lifestyle, governs, mobile, choice, faster, b...",lifestyle governs mobile choice faster better ...,4
2220,mobile multimedia slow to catch on there is no...,0,tech,"[mobile, multimedia, slow, catch, doubt, mobil...",mobile multimedia slow catch doubt mobile phon...,4


In [12]:
test_example = test_df.iloc[1, test_df.columns.get_loc('text')]
print(test_example)
vectorized = vec.transform([test_example])
prediction = km.predict(vectorized)
print(prediction)

lib dems  new election pr chief the lib dems have appointed a senior figure from bt to be the party s new communications chief for their next general election effort.  sandy walkington will now work with senior figures such as matthew taylor on completing the party manifesto. party chief executive lord rennard said the appointment was a  significant strengthening of the lib dem team . mr walkington said he wanted the party to be ready for any  mischief  rivals or the media tried to throw at it.   my role will be to ensure this new public profile is effectively communicated at all levels   he said.  i also know the party will be put under scrutiny in the media and from the other parties as never before - and we will need to show ourselves ready and prepared to counter the mischief and misrepresentation that all too often comes from the party s opponents.  the party is already demonstrating on every issue that it is the effective opposition.  mr walkington s new job title is director of 

In [17]:
dump(km, '../data/kmeans.joblib')
km_ = load('../data/kmeans.joblib')
prediction = km_.predict(vectorized)
print(prediction)

[3]
