In [4]:
import pandas as pd 
import joblib
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier



In [5]:
# loading the X, y, vectoriser

X = joblib.load("data/X.pkl")
y = joblib.load("data/y.pkl")



df = pd.read_csv("data/cleaned_df.csv")

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# stop words

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectoriser = TfidfVectorizer(lowercase=True, stop_words=list(stop_words)) 

In [3]:
X.text_cleaned

0       War in Afghanistan  2001 2021    2021 Afghanis...
1       Russo Ukrainian War   2022 Russian invasion of...
2       In  India   a new  income tax  law comes into ...
3       Israel Hamas war   World Central Kitchen drone...
4       Middle Eastern crisis   Gaza war   Israeli inv...
                              ...                        
1689    Afghanistan conflict   2021 evacuation from Af...
1690    Russo Ukrainian War   2022 Russian invasion of...
1691    Russian invasion of Ukraine   Zaporizhzhia Nuc...
1692    Russian invasion of Ukraine   Eastern Ukraine ...
1693    Middle Eastern crisis   Gaza war   2025 Gaza C...
Name: text_cleaned, Length: 1694, dtype: object

In [8]:
topic_hashmap = joblib.load("data/topic_hashmap.pkl")

In [9]:
Y = pd.DataFrame(y, columns=topic_hashmap.keys())

In [10]:
X

Unnamed: 0,text_cleaned
0,War in Afghanistan 2001 2021 2021 Afghanis...
1,Russo Ukrainian War 2022 Russian invasion of...
2,In India a new income tax law comes into ...
3,Israel Hamas war World Central Kitchen drone...
4,Middle Eastern crisis Gaza war Israeli inv...
...,...
1689,Afghanistan conflict 2021 evacuation from Af...
1690,Russo Ukrainian War 2022 Russian invasion of...
1691,Russian invasion of Ukraine Zaporizhzhia Nuc...
1692,Russian invasion of Ukraine Eastern Ukraine ...


In [11]:
df

Unnamed: 0.1,Unnamed: 0,date,text_cleaned,topic_split
0,0,"April 1, 2021",War in Afghanistan 2001 2021 2021 Afghanis...,"armed conflicts and attacks,arts and culture,b..."
1,1,"April 1, 2022",Russo Ukrainian War 2022 Russian invasion of...,"armed conflicts and attacks,disasters and acci..."
2,2,"April 1, 2023",In India a new income tax law comes into ...,"business and economics,disasters and accidents..."
3,3,"April 1, 2024",Israel Hamas war World Central Kitchen drone...,"armed conflicts and attacks,disasters and acci..."
4,4,"April 1, 2025",Middle Eastern crisis Gaza war Israeli inv...,"armed conflicts and attacks,disasters and acci..."
...,...,...,...,...
1689,1689,"September 9, 2021",Afghanistan conflict 2021 evacuation from Af...,"armed conflicts and attacks,arts and culture,b..."
1690,1690,"September 9, 2022",Russo Ukrainian War 2022 Russian invasion of...,"armed conflicts and attacks,disasters and acci..."
1691,1691,"September 9, 2023",Russian invasion of Ukraine Zaporizhzhia Nuc...,"armed conflicts and attacks,arts and culture,l..."
1692,1692,"September 9, 2024",Russian invasion of Ukraine Eastern Ukraine ...,"armed conflicts and attacks,business and econo..."


In [12]:
y

array([[1, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [13]:
# splitting




# splitting
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)


X_train_vec = vectoriser.fit_transform(X_train.text_cleaned) 
X_test_vec = vectoriser.transform(X_test.text_cleaned) 




In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from collections import Counter

def mlsmote(X, y, k=5):
    X = np.array(X)
    y = np.array(y)

    # minority labels
    label_counts = y.sum(axis=0)
    minority_labels = np.where(label_counts < np.median(label_counts))[0]

    synthetic_X = []
    synthetic_y = []

    for label in minority_labels:
        idx = np.where(y[:, label] == 1)[0]
        if len(idx) <= 1: continue

        nbrs = NearestNeighbors(n_neighbors=min(k, len(idx))).fit(X[idx])
        distances, indices = nbrs.kneighbors(X[idx])

        for i, row_indices in enumerate(indices):
            # pick a random neighbor
            nn = np.random.choice(row_indices[1:])
            diff = X[idx][nn] - X[idx][i]
            gap = np.random.rand()
            new_sample = X[idx][i] + gap * diff

            # label synthesis
            neighbor_labels = y[idx][row_indices]
            new_labels = (neighbor_labels.sum(axis=0) >= (len(row_indices)/2)).astype(int)

            synthetic_X.append(new_sample)
            synthetic_y.append(new_labels)

    return np.vstack([X, synthetic_X]), np.vstack([y, synthetic_y])


In [42]:
X_train_vec

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 375469 stored elements and shape (1355, 31962)>

In [44]:
X_train_vec_res, Y_train_vec_res = mlsmote(X_train_vec.toarray(), Y_train, k=10)

In [45]:
X_train_vec_res

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.00435089, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01883385, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00176268, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [124]:
from sklearn.tree import DecisionTreeClassifier

clf2_res = DecisionTreeClassifier(max_depth=100, class_weight='balanced') 
clf2_res = BinaryRelevance(clf2_res) 
clf2_res.fit(X_train_vec_res, Y_train_vec_res)

0,1,2
,classifier,DecisionTreeC...max_depth=100)
,require_dense,"[True, True]"

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [113]:
from sklearn.tree import DecisionTreeClassifier

clf2 = DecisionTreeClassifier(max_depth=100, class_weight='balanced') 
clf2 = BinaryRelevance(clf2) 
clf2.fit(X_train_vec, Y_train)

0,1,2
,classifier,DecisionTreeC...max_depth=100)
,require_dense,"[True, True]"

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [119]:
from sklearn.ensemble import RandomForestClassifier
clf_res = RandomForestClassifier(max_depth=100, class_weight='balanced') 
clf_res = BinaryRelevance(clf_res) 
clf_res.fit(X_train_vec_res, Y_train_vec_res) 


0,1,2
,classifier,RandomForestC...max_depth=100)
,require_dense,"[True, True]"

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=100, class_weight='balanced') 
clf = BinaryRelevance(clf) 
clf.fit(X_train_vec, Y_train) 


0,1,2
,classifier,RandomForestC...max_depth=100)
,require_dense,"[True, True]"

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# METRICS

All of the models use binary relevance

In [15]:
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score

In [16]:
def metrics(Y_test, predictions):
    print (f"accuracy: {accuracy_score(Y_test, predictions)}")
    print ("---------------------------")
    print (f"macro f1 score: {f1_score(Y_test, predictions, average='macro')}")
    print (f"macro recall score: {recall_score(Y_test, predictions, average='macro')}")
    print (f"macro precision score: {precision_score(Y_test, predictions, average='macro')}")
    print ("---------------------------")
    print (f"weighted f1 score: {f1_score(Y_test, predictions, average='weighted')}")
    print (f"Weighted recall score: {recall_score(Y_test, predictions, average='weighted')}")
    print (f"weighted precision score: {precision_score(Y_test, predictions, average='weighted')}")
    print ("--------------------")
    print (f"Sampled f1 score: {f1_score(Y_test, predictions, average='samples')}")
    

## without resampling

In [123]:
decision_tree_predictions = clf2.predict(X_test_vec) 
metrics(Y_test, decision_tree_predictions)

accuracy: 0.0058997050147492625
---------------------------
macro f1 score: 0.3934591860347155
macro recall score: 0.3941394227666702
macro precision score: 0.3934691595096404
---------------------------
weighted f1 score: 0.6551546939821111
Weighted recall score: 0.6532091097308489
weighted precision score: 0.657740607338854
--------------------
Sampled f1 score: 0.6444585958510998


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
random_forest_predictions_res = clf.predict(X_test_vec) 
metrics(Y_test, random_forest_predictions_res)

accuracy: 0.038348082595870206
---------------------------
macro f1 score: 0.36692915013112465
macro recall score: 0.3948162985899924
macro precision score: 0.4799470312813157
---------------------------
weighted f1 score: 0.6637033887084243
Weighted recall score: 0.7243159525038719
weighted precision score: 0.7318735458683183
--------------------
Sampled f1 score: 0.7394761749134473


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [18]:
joblib.dump(clf, 'data/news_topic_classifier.pkl')
joblib.dump(vectoriser, 'data/tfidf_vectoriser.pkl')

['data/tfidf_vectoriser.pkl']


accuracy: 0.02654867256637168
---------------------------
macro f1 score: 0.37271947709232095
macro recall score: 0.40884176854288806
macro precision score: 0.38780175058395355
---------------------------
weighted f1 score: 0.6796543880116974
Weighted recall score: 0.7494824016563147
weighted precision score: 0.6658341333683897
--------------------
Sampled f1 score: 0.7486325703139862

In [2]:

joblib.dump(vectoriser, 'data/tfidf_vectoriser.pkl')

NameError: name 'joblib' is not defined

In [127]:
joblib.dump(clf, 'data/news_topic_classifier.pkl')

['data/news_topic_classifier.pkl']

## with resampling...

In [125]:
decision_tree_predictions_res = clf2_res.predict(X_test_vec) 
metrics(Y_test, decision_tree_predictions_res)

accuracy: 0.017699115044247787
---------------------------
macro f1 score: 0.3979157114044642
macro recall score: 0.40085435773624195
macro precision score: 0.3957516345007832
---------------------------
weighted f1 score: 0.659145216084794
Weighted recall score: 0.6589026915113871
weighted precision score: 0.6604566190321148
--------------------
Sampled f1 score: 0.642781358268084


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [122]:
# this is for xgboost, depth=100, eta=0.1
random_forest_predictions_res = clf_res.predict(X_test_vec) 
metrics(Y_test, random_forest_predictions_res)

accuracy: 0.02654867256637168
---------------------------
macro f1 score: 0.37271947709232095
macro recall score: 0.40884176854288806
macro precision score: 0.38780175058395355
---------------------------
weighted f1 score: 0.6796543880116974
Weighted recall score: 0.7494824016563147
weighted precision score: 0.6658341333683897
--------------------
Sampled f1 score: 0.7486325703139862


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
y

array([[1, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [20]:
X

Unnamed: 0,text_cleaned
0,War in Afghanistan 2001 2021 2021 Afghanis...
1,Russo Ukrainian War 2022 Russian invasion of...
2,In India a new income tax law comes into ...
3,Israel Hamas war World Central Kitchen drone...
4,Middle Eastern crisis Gaza war Israeli inv...
...,...
1689,Afghanistan conflict 2021 evacuation from Af...
1690,Russo Ukrainian War 2022 Russian invasion of...
1691,Russian invasion of Ukraine Zaporizhzhia Nuc...
1692,Russian invasion of Ukraine Eastern Ukraine ...


In [21]:
topic_hashmap

{'armed conflicts and attacks': 0,
 'arts and culture': 1,
 'disasters and accidents': 2,
 'international relations': 3,
 'law and crime and politics': 4,
 'politics and elections and economics': 5,
 'science and technology': 6,
 'sports': 7,
 'other current events': 8,
 'business and economics': 9,
 'religion and politics': 10,
 'entertainment': 11,
 'health and environment': 12,
 'culture and entertainment': 13}

In [23]:
clean_df = pd.read_csv("data/clean_df.csv") 
cleaned_df = pd.read_csv("data/cleaned_df.csv")

In [89]:
joblib.dump(X_train_vec, 'data/X_train_vec.pkl')

['data/X_train_vec.pkl']

In [31]:
cleaned_df.topic_split.iloc[0]

'armed conflicts and attacks,arts and culture,business and economics,health and environment,international relations,law and crime and politics,politics and elections and economics,armed conflicts and attacks,arts and culture,business and economics,health and environment,international relations,law and crime and politics,politics and elections and economics,armed conflicts and attacks,arts and culture,business and economics,health and environment,international relations,law and crime and politics,politics and elections and economics,armed conflicts and attacks,arts and culture,business and economics,health and environment,international relations,law and crime and politics,politics and elections and economics,armed conflicts and attacks,arts and culture,business and economics,health and environment,international relations,law and crime and politics,politics and elections and economics,armed conflicts and attacks,arts and culture,business and economics,health and environment,international

In [87]:
y

array([[1, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [27]:
X

Unnamed: 0,text_cleaned
0,War in Afghanistan 2001 2021 2021 Afghanis...
1,Russo Ukrainian War 2022 Russian invasion of...
2,In India a new income tax law comes into ...
3,Israel Hamas war World Central Kitchen drone...
4,Middle Eastern crisis Gaza war Israeli inv...
...,...
1689,Afghanistan conflict 2021 evacuation from Af...
1690,Russo Ukrainian War 2022 Russian invasion of...
1691,Russian invasion of Ukraine Zaporizhzhia Nuc...
1692,Russian invasion of Ukraine Eastern Ukraine ...


In [32]:
clf.get_params

<bound method MLClassifierBase.get_params of BinaryRelevance(classifier=RandomForestClassifier(class_weight='balanced',
                                                  max_depth=100),
                require_dense=[True, True])>

In [33]:
Y

Unnamed: 0,armed conflicts and attacks,arts and culture,disasters and accidents,international relations,law and crime and politics,politics and elections and economics,science and technology,sports,other current events,business and economics,religion and politics,entertainment,health and environment,culture and entertainment
0,1,0,1,0,1,0,0,0,0,0,1,0,0,1
1,1,1,1,0,1,0,0,1,1,0,1,0,1,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,1,0,1,0,1,0,1,0,0,1
4,1,0,0,0,1,0,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1689,1,0,1,0,1,0,1,1,1,0,1,0,0,1
1690,1,0,0,0,1,0,0,1,1,0,1,0,1,0
1691,1,0,0,0,1,0,1,1,1,0,1,0,0,0
1692,1,0,0,0,0,0,1,1,1,0,0,0,0,0


In [34]:
y

array([[1, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [38]:
import numpy as np
np.unique(np.array(Y) == y)

array([ True])

In [86]:
cleaned_df

Unnamed: 0.1,Unnamed: 0,date,text_cleaned,topic_split
0,0,"April 1, 2021",War in Afghanistan 2001 2021 2021 Afghanis...,"armed conflicts and attacks,arts and culture,b..."
1,1,"April 1, 2022",Russo Ukrainian War 2022 Russian invasion of...,"armed conflicts and attacks,disasters and acci..."
2,2,"April 1, 2023",In India a new income tax law comes into ...,"business and economics,disasters and accidents..."
3,3,"April 1, 2024",Israel Hamas war World Central Kitchen drone...,"armed conflicts and attacks,disasters and acci..."
4,4,"April 1, 2025",Middle Eastern crisis Gaza war Israeli inv...,"armed conflicts and attacks,disasters and acci..."
...,...,...,...,...
1689,1689,"September 9, 2021",Afghanistan conflict 2021 evacuation from Af...,"armed conflicts and attacks,arts and culture,b..."
1690,1690,"September 9, 2022",Russo Ukrainian War 2022 Russian invasion of...,"armed conflicts and attacks,disasters and acci..."
1691,1691,"September 9, 2023",Russian invasion of Ukraine Zaporizhzhia Nuc...,"armed conflicts and attacks,arts and culture,l..."
1692,1692,"September 9, 2024",Russian invasion of Ukraine Eastern Ukraine ...,"armed conflicts and attacks,business and econo..."


In [40]:
topic_hashmap

{'armed conflicts and attacks': 0,
 'arts and culture': 1,
 'disasters and accidents': 2,
 'international relations': 3,
 'law and crime and politics': 4,
 'politics and elections and economics': 5,
 'science and technology': 6,
 'sports': 7,
 'other current events': 8,
 'business and economics': 9,
 'religion and politics': 10,
 'entertainment': 11,
 'health and environment': 12,
 'culture and entertainment': 13}

In [41]:
y

array([[1, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [43]:
pd.melt(Y, colu=['topic', 'presence'])

TypeError: melt() got an unexpected keyword argument 'columns'

# testing the model

In [55]:
from classify_news import classifier

In [56]:
text="""Elections for newly-created mayors will be delayed in four more areas of England, the BBC has confirmed, pushing back regional devolution plans.

New mayors were expected to be elected in Greater Essex, Norfolk & Suffolk, Hampshire & the Solent, and Sussex & Brighton in May next year, but will now have to wait until May 2028.

The government is arguing that more time is needed to reorganise local government in these areas, but stressed more homes, better high streets and support for business were "all on the way".

But opposition parties are calling for the elections to go ahead as planned, with shadow local government secretary Sir James Cleverly accusing Labour of "subverting democracy".

What is devolution and how does it work in the UK?
Published
9 July 2024
Rayner promises mayor for every region of England
Published
16 December 2024
Cheshire devolution: what happens next?
Published
24 September
The new mayors are part of a simplification of local government, aimed at reducing the number of councils, by merging district and county authorities to create unitary authorities.

The unitary authorities will be headed up by new mayors, who will be handed more funding and extra powers to run their area, intended to hand greater power to local communities.

However, the body representing district councils warned at the time that the plans could spark "turmoil" and argued "mega-councils" could undermine local decision-making.

Earlier this year, 18 councils requested a delay to their planned May elections due to incomplete reorganisation into eight unitary authorities - and critics are concerned the announcement of delays to mayoral elections could lead to further elections being cancelled.

Conservative leader Kemi Badenoch has said it is a "scandal" that local elections are being delayed.

The "excuse that it's about local government reorganisations doesn't wash", she told broadcasters on a visit to a school in London, adding the government has had "plenty of time to do this" and people "need their democracy".

Reform UK deputy leader Richard Tice called the move a "deliberate, dictatorial cancelling of democracy".

"There is just a fear of how successful Reform are doing, they've been talking about these mayoral elections for years and years, they've been getting ready," he told BBC Radio 4's Today programme.

Meanwhile, Liberal Democrat local government spokesperson Zoe Franklin said the party would work to see the vote next May does go ahead, adding, "democracy delayed is democracy denied".

And the Conservative candidate for Hampshire and the Solent called the decision to delay them a "disgrace and an affront to democracy".

"It is clear Labour are afraid to face the British public at the ballot box," Donna Jones said.

Green Party deputy leader and mayoral candidate for Sussex & Brighton, Rachel Millward, said people had the right to vote for mayors the government has promised them.

"Labour's whole devolution agenda has fallen into utter chaos," she said.

"They're running scared of the voters and failing to solve the real problems in local government."

Back in February, then Deputy Prime Minister Angela Rayner announced "ambitious" plans for six new areas of regional devolution, where councils would join forces to create combined authorities, led by new mayors.

The new Devolution Priority Programme areas were intended to provide "sweeping" powers for councils to fast-track growth in regions outside London and, at the time, the government said the new positions would be created "at pace", within a year.

Now, all six of those new devolution priority programme (DPP) areas have announced they will delay their mayoral elections, originally planned for May 2026.

The Ministry of Housing, Communities and Local Government (MHCLG) announced the first mayoral election in Cumbria had been pushed back a year in July, with councils saying combining the mayoral with planned local elections in 2027 would "save significant resources".

Cheshire & Warrington followed suit in September, pushing back its first mayoral election from to May 2027 to fit in with full council elections beinhg held at the same time.

An MCHLG spokesperson said unitary authorities are now in place in Cumbria and Cheshire and Warrington, which they said would "create simpler, more effective structures that can better support mayors' powers".

This will allow them to hold their inaugural mayoral elections in May 2027 and ministers are working with both areas to bring forward the legislation to create their mayoral authorities.

The remaining four areas were still on track to establish "stronger" unitary authorities in 2028, they said, before the inaugural mayoral elections are held in May 2028.

Devolution Minister Miatta Fahnbulleh stressed that work was continuing behind the scenes to ensure that almost £200m each year could be shared by the six areas over the next 30 years for local priorities in areas like transport, planning and skills.

An urgent question on the issue was lodged at parliament by Conservative MP and shadow local government minister David Simmonds, who said the minister had caused "a huge waste of public money for elections we are all ready for".

In light of the mayoral delay, Simmonds asked whether council elections planned for next May could also be delayed.

"Elections are happening in 2026, we are cracking on with it," Fahnbulleh replied, insisting "we have been absolutely consistent" on local elections.

On mayoral elections, she said it was "absolutely right" for the government to "take stock" of how far along in the process of reorganisation areas were but, "huge progress" had already been made on establishing the new authorities.

The MCHLG has announced that all six areas will get an additional £1m in the coming months, to help with the costs of establishing the new mayoral authorities, and will share £3m each as a minimum over the next three financial years.

"""

In [57]:
a = classifier(text=text)

text tfidf: <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 287 stored elements and shape (1, 32018)>
  Coords	Values
  (0, 228)	0.015024540154118585
  (0, 266)	0.014772873726486833
  (0, 427)	0.023369862922396022
  (0, 429)	0.0519185587881003
  (0, 430)	0.0908337693813621
  (0, 431)	0.0933461489827459
  (0, 501)	0.009844317821844036
  (0, 594)	0.014261179501735843
  (0, 722)	0.040849538229285054
  (0, 1587)	0.07729350710341987
  (0, 1674)	0.021381227759212206
  (0, 1774)	0.05844624971872149
  (0, 1777)	0.020291036952745943
  (0, 1955)	0.037083852950554407
  (0, 2016)	0.03267828359540416
  (0, 2070)	0.040099084570408984
  (0, 2112)	0.02393463315021571
  (0, 2388)	0.018790225432849236
  (0, 2403)	0.02485318012679565
  (0, 2411)	0.019178443081637082
  (0, 2425)	0.023045677774526895
  (0, 2429)	0.010237216845740311
  (0, 2689)	0.028912598316673505
  (0, 2753)	0.07950085984135143
  (0, 2754)	0.02485318012679565
  :	:
  (0, 28935)	0.01628385700647384
  (0, 28959)	0.05668040501

In [58]:
a.predict()

Unnamed: 0,armed conflicts and attacks,law and crime and politics,science and technology,sports,other current events,religion and politics
0,1,1,1,1,1,1


In [59]:
a.predict_proba()

Unnamed: 0,armed conflicts and attacks,law and crime and politics,science and technology,sports,other current events,religion and politics
0,0.97,0.88,0.54,0.57,0.8,0.73


In [60]:
a.explain()

model: BinaryRelevance(classifier=RandomForestClassifier(class_weight='balanced',
                                                  max_depth=100),
                require_dense=[True, True])


TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: BinaryRelevance(classifier=RandomForestClassifier(class_weight='balanced',
                                                  max_depth=100),
                require_dense=[True, True])

In [62]:
import shap

In [98]:
def custom_masker(mask, x): 
    return (x * mask).reshape(1, len(x)) 


In [99]:
explainer = shap.Explainer(a.model.predict_proba, custom_masker, feature_names=a.vectoriser.get_feature_names_out())

In [100]:
a.text_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [101]:
shap_values = explainer(a.text_tfidf)

ValueError: max_evals=500 is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = 64037!