In [16]:
#!/usr/bin/env python3
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
import numpy as np
import sklearn
from sklearn import metrics 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# This module refers to k-nearest neighbors for multi-label
from skmultilearn.adapt import MLkNN

## the ARAAM model is a neural network for large scale text classification 
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier 

from sklearn.metrics import roc_auc_score
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.svm import SVC
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix 
import re 
import joblib 
import warnings

In [2]:
# importing the model 

model = joblib.load('news_model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
## Loading the dataset
df = pd.read_csv("scraped_news.csv")
# Cleaning the dataset 

preprocessed_data = joblib.load('preprocessed_data.pkl')
preprocessed_data.head()


In [17]:
# builidng the model 

def build_model(data): 
    """
    This will be a multi-label classification model 
    to classify news reports.
    
    For this, I will use binary relevance as it is a very simple technique
    and very interpretable.
    
    """
    vectoriser = TfidfVectorizer(stop_words=list(stop_words))
    
    # vectorising the text 
    topics = ['Armed conflicts and attacks', 'Arts and culture',
       'Business and economy', 'Disasters and accidents',
       'Health and environment', 'International relations',
       'Law and crime', 'Politics and elections',
       'Science and technology', 'Sports', 'Other', 'Royalty',
       'Politics and economics', 'Entertainment']
    
    
    # I need to make sure I drop all of the columns 
    print ("Forming the X and Y...")
    X = data.text
    print ("#--------------------")
    
    print ("#--------------------")
    
    
    print (f"Shape of X: {np.shape(X)}")
    
    print ("Creating the target variable...")
    y = data[[*topics]].values
    y = csr_matrix(y, dtype=np.int64)
    
    # ------ converting y into a matrix 
    
    
    
    # ------ the vectorizer should be fit on the training dataset first 
    
    
    
    
    print ("Splitting the dataset...")
    
    # ------ Make sure there is a consistent number of samples 
    # ------ The vectoriser should be fit within the clean dataframe
    
    # ----- Found input variables with inconsistent numbers of samples: [1001, 14014]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
    
    # ------ Vectorising the training and test dataset 
    
    
    print ("Vectorising...")
    # ------ should only vectorise the training as it prevents data leakge
    X_train_tfidf = vectoriser.fit_transform(X_train) 
    X_test_tfidf = vectoriser.transform(X_test) 
    
    
    
    
    
    
    
    
    print ("Creating the Classifier...")
    # ------ scipy.sparse does not support dtype object 
    
    classifier = BinaryRelevance(DecisionTreeClassifier())

    # train
    print ("Fitting the Classifier...")
    
    # ------ Make sure that datatypes of the training and testing are right
    # ------ Check datatypes of X_train and Y_train 
    # ------ make sure the y_train is in int format 
    
    # ------ why is the X train shape (1,1)?
    
    
    print (f"Shape of X_train_tfidf: {np.shape(X_train_tfidf)}")
    print (X_train_tfidf)
    print ("-------------------------")
    
    print (f"Shape of Y_train: {np.shape(Y_train)}")
    classifier.fit(X_train_tfidf, Y_train)

    #    predict
    print ("Test predictions:") 
    
    predictions = classifier.predict(X_test_tfidf) 
    print (f"Shape of the predictions: {np.shape(predictions)}")
    print (f"predictions...: {predictions}")
    print (f"Accuracy: {accuracy_score(predictions, Y_test)}")
    print (f"Weighted f1 score: {f1_score(predictions, Y_test, average="weighted")}")
    print (f"Weighted Precision: {precision_score(predictions, Y_test, average='weighted')}")
    print (f"Weighted Recall: {recall_score(predictions, Y_test, average='weighted')}")
    
    
    # ----- PLOTTING THE CONFUSION MATRIX 
    print ("Completed training model")
    
    
    
    
    return classifier, predictions, Y_test, X_test_tfidf


model, predictions, Y_test, X_test_tfidf = build_model(preprocessed_data)

Forming the X and Y...
#--------------------
#--------------------
Shape of X: (1001,)
Creating the target variable...
Splitting the dataset...
Vectorising...
Creating the Classifier...
Fitting the Classifier...
Shape of X_train_tfidf: (800, 24037)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 219443 stored elements and shape (800, 24037)>
  Coords	Values
  (0, 18908)	0.031158322915683857
  (0, 22286)	0.07489960377912414
  (0, 23133)	0.03413466640297218
  (0, 362)	0.1316465909352914
  (0, 18905)	0.15341058543851022
  (0, 11153)	0.04275950990055421
  (0, 22285)	0.11955229987677626
  (0, 12457)	0.11528208927654543
  (0, 15627)	0.03341852991610167
  (0, 3304)	0.036934125328452305
  (0, 8779)	0.02156564437336592
  (0, 19250)	0.03888723429780967
  (0, 18444)	0.07963143027594767
  (0, 13445)	0.09163940620386554
  (0, 20797)	0.07724210908239554
  (0, 15060)	0.025170348725956308
  (0, 3339)	0.05133504526033659
  (0, 15173)	0.08528442179528824
  (0, 19902)	0.048447514839455516
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Cross-Validation

In [18]:
# performing cross validation 
# Bear in mind, I need to vectorise the X variable 

k_folds = KFold(n_splits = 5)

scores = cross_val_score(model, X, y, cv = k_folds)



NameError: name 'X' is not defined

# Plotting the metrics

In [19]:
# plotting the confusion matrix 

from sklearn.metrics import ConfusionMatrixDisplay

In [21]:
model.classes_

AttributeError: 'BinaryRelevance' object has no attribute 'classes_'

In [None]:
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=model.classes_)