# NLP Walkthrough
An example of NLP, using the BBC news data ([which is available here](http://mlg.ucd.ie/datasets/bbc.html)).

In [19]:
# Load Libraries
import pandas as pd
import re
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import os

In [2]:
# Load the files
parent_folder = "C:\\Users\\User\\Documents\\Projects\\BBC_News_NLP\\Data\\bbc"

category_folders = []
for folder in os.listdir(parent_folder):
    path = os.path.join(parent_folder, folder)

    if os.path.isdir(path):
        category_folders.append(path)

articles_list = []
for category in category_folders:
    for article in os.listdir(category):
        if article.endswith(".txt") and not article.startswith("README"):
            article_path = os.path.join(category, article)
            try: 
                with open(article_path, "r") as file:
                    article_text = file.read()
                    articles_list.append({"text" : article_text, "category" : os.path.relpath(category, parent_folder)})
            except: 
                print(f"Could not load {article_path}")

articles = pd.DataFrame(articles_list)

print(f"{len(articles)} articles loaded")
articles.head()

2225 articles loaded


Unnamed: 0,text,category
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


### Process the files for analysis
Using tokenisation, lemmatising, stemming, and by removing stopwords.


In [5]:
#sentence tokenisation:
first_article = articles["text"][0]

first_article_sentences = sent_tokenize(articles["text"][0])
print("Sentence tokenisation:", first_article_sentences[:3], sep="\n")

print("\nWord tokenisation:")
# word tokenisation
for sentence in first_article_sentences[:3]:
    print(word_tokenize(sentence), "\n")

first_sentence = first_article_sentences[0]

Sentence tokenisation:
['Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.', 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.', 'TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.']

Word tokenisation:
['Ad', 'sales', 'boost', 'Time', 'Warner', 'profit', 'Quarterly', 'profits', 'at', 'US', 'media', 'giant', 'TimeWarner', 'jumped', '76', '%', 'to', '$', '1.13bn', '(', 'Â£600m', ')', 'for', 'the', 'three', 'months', 'to', 'December', ',', 'from', '$', '639m', 'year-earlier', '.'] 

['The', 'firm', ',', 'which', 'is', 'now', 'one', 'of', 'the', 'biggest', 'investors', 'in', 'Google', ',', 'benefited', 'from', 'sales', 'of', 'high-speed', 'internet', 'connections', 'and', 'higher', 'advert', 'sales', '.'] 

['TimeWarner', 'said', 'fourth', 'quarter', 'sal

In [7]:
# Remove stopwords
stopwords_list = stopwords.words('english')
print(stopwords_list)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [9]:
print("Stopwords: ", [word for word in word_tokenize(first_sentence) if word.lower() in stopwords_list])
print("Cleaned sentence: ", [word for word in word_tokenize(first_sentence) if word.lower() not in stopwords_list])

Stopwords:  ['at', 'to', 'for', 'the', 'to', 'from']
Cleaned sentence:  ['Ad', 'sales', 'boost', 'Time', 'Warner', 'profit', 'Quarterly', 'profits', 'US', 'media', 'giant', 'TimeWarner', 'jumped', '76', '%', '$', '1.13bn', '(', 'Â£600m', ')', 'three', 'months', 'December', ',', '$', '639m', 'year-earlier', '.']


In [11]:
# Can also remove punctuation by adding to our stopwords list:
from string import punctuation
print(list(punctuation))
stopwords_list = [*stopwords_list, *list(punctuation), *['``', "''"]]

print("\nStopwords: ", [word for word in word_tokenize(first_sentence) if word.lower() in stopwords_list])
print("Cleaned sentence: ", [word for word in word_tokenize(first_sentence) if word.lower() not in stopwords_list])

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

Stopwords:  ['at', '%', 'to', '$', '(', ')', 'for', 'the', 'to', ',', 'from', '$', '.']
Cleaned sentence:  ['Ad', 'sales', 'boost', 'Time', 'Warner', 'profit', 'Quarterly', 'profits', 'US', 'media', 'giant', 'TimeWarner', 'jumped', '76', '1.13bn', 'Â£600m', 'three', 'months', 'December', '639m', 'year-earlier']


Stemming and Lemmatisation both seek to reduce words to simpler versions, so similar words can be grouped.

Stemming works by literally reducing the word
* e.g. it might reduce "jumping", "jumps" and "jumper" to just "jump".
* However, in this case, "jumper" is more likely to refer to a type of clothing!

Instead, lemmatisation uses grammar rules and dictionary look ups to attempt reduce the words via their meaning.
* e.g. it might reduce "jumping" and "jumps" to "jump", and keep "jumper" as "jumper" - especially if it 'knows' it's a noun!

Their are many pre-defined stemming and lemmitasing algorithms. Two of the post common are *Porter Stemmer* and *Wordnet Lemmatizer*
 

In [13]:
from nltk.stem import PorterStemmer
print("Stemming (Porter Stemmer):")
for word in ["dancing", "danced", "dance", "dancer", "dances"]:
    print(PorterStemmer().stem(word))

Stemming (Porter Stemmer):
danc
danc
danc
dancer
danc


In [15]:
from nltk.stem import WordNetLemmatizer as WNL
print("Lemmatisation (Wordnet Lemmatizer):")
for word in ["dancing", "danced", "dance", "dancer", "dances"]:
    print(WNL().lemmatize(word))

Lemmatisation (Wordnet Lemmatizer):
dancing
danced
dance
dancer
dance


Accurate lemmatisation requires the algorithm to be able to identify grammatical patterns. This process can be improved using Part-of-Speech (POS) tagging, which is when words in the text are labelled, for example, labelling words as adjectives or adverbs.

POS tagging can also be done by an algorithm, although different algorithms tag differently. The most common is the Penn Treebank algorithm, which uses labels such as NN (for a noun), RB (adverb), VB (verb), VBD (past-tense verb), etc.

However for WordNet Lemmatisation (WNL), we only need to label ADJ (adjectives), VERB (verbs), NOUN (nouns), ADV (adverbs).

We can therefore use Penn Treebank POS-tagging, but simplify to the tags used by WNL:

In [21]:
from nltk import pos_tag

# Assign Penn Treebank tags:
print("Penn Treebank POS-tags:\n", pos_tag(word_tokenize(first_sentence)))

# Define a function to convert Penn Treebank POS to WNL tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 

WN_pos_tags = []
for word, tag in pos_tag(word_tokenize(first_sentence)):
    WN_tag = get_wordnet_pos(tag)
    WN_pos_tags.append((word, WN_tag))

print("WNL POS-tags:\n", WN_pos_tags)


Penn Treebank POS-tags:
 [('Ad', 'NN'), ('sales', 'NNS'), ('boost', 'VBP'), ('Time', 'NNP'), ('Warner', 'NNP'), ('profit', 'VB'), ('Quarterly', 'JJ'), ('profits', 'NNS'), ('at', 'IN'), ('US', 'NNP'), ('media', 'NNS'), ('giant', 'JJ'), ('TimeWarner', 'NNP'), ('jumped', 'VBD'), ('76', 'CD'), ('%', 'NN'), ('to', 'TO'), ('$', '$'), ('1.13bn', 'CD'), ('(', '('), ('Â£600m', 'NN'), (')', ')'), ('for', 'IN'), ('the', 'DT'), ('three', 'CD'), ('months', 'NNS'), ('to', 'TO'), ('December', 'NNP'), (',', ','), ('from', 'IN'), ('$', '$'), ('639m', 'CD'), ('year-earlier', 'JJ'), ('.', '.')]
WNL POS-tags:
 [('Ad', 'n'), ('sales', 'n'), ('boost', 'v'), ('Time', 'n'), ('Warner', 'n'), ('profit', 'v'), ('Quarterly', 'a'), ('profits', 'n'), ('at', 'n'), ('US', 'n'), ('media', 'n'), ('giant', 'a'), ('TimeWarner', 'n'), ('jumped', 'v'), ('76', 'n'), ('%', 'n'), ('to', 'n'), ('$', 'n'), ('1.13bn', 'n'), ('(', 'n'), ('Â£600m', 'n'), (')', 'n'), ('for', 'n'), ('the', 'n'), ('three', 'n'), ('months', 'n'), ('to

We can now perform lemmatisation:

In [24]:
# Original
print("Original: ", " ".join(word_tokenize(first_sentence)))

#Without POS tagging:
print("\nDefault WNL: ", " ".join([WNL().lemmatize(word.lower()) for word in word_tokenize(first_sentence)]))

# With POS tagging:
print("\nPOS-tagged WNL: ", " ".join([WNL().lemmatize(word.lower(), pos=tag) for word, tag in WN_pos_tags]))

Original:  Ad sales boost Time Warner profit Quarterly profits at US media giant TimeWarner jumped 76 % to $ 1.13bn ( Â£600m ) for the three months to December , from $ 639m year-earlier .

Default WNL:  ad sale boost time warner profit quarterly profit at u medium giant timewarner jumped 76 % to $ 1.13bn ( â£600m ) for the three month to december , from $ 639m year-earlier .

POS-tagged WNL:  ad sale boost time warner profit quarterly profit at u medium giant timewarner jump 76 % to $ 1.13bn ( â£600m ) for the three month to december , from $ 639m year-earlier .


In [26]:
# Convert this into a function to take a sentence at a time:
def WNL_sentence(input_text):
    tagged_text = []
    for word, tag in pos_tag(word_tokenize(input_text)):
        tagged_text.append((word, get_wordnet_pos(tag)))
        
    output_text = [WNL().lemmatize(word.lower(), pos=tag) for word, tag in tagged_text]
    return(output_text)

print(WNL_sentence(first_article_sentences[2]))

['timewarner', 'say', 'fourth', 'quarter', 'sale', 'rise', '2', '%', 'to', '$', '11.1bn', 'from', '$', '10.9bn', '.']


In [28]:
# We can combine all this into one pre-processing function:
def preprocess_sentence(input_text):
    tagged_text = []
    for word, tag in pos_tag(word_tokenize(input_text)):
        tagged_text.append((word, get_wordnet_pos(tag)))
        
    lemma_text = [WNL().lemmatize(word.lower(), pos=tag) for word, tag in tagged_text]
    processed_text = [word for word in lemma_text if word not in stopwords_list]
    return processed_text

preprocess_sentence("Today I am performing some text analysis on BBC News articles, which will be fun!")
    

['today', 'perform', 'text', 'analysis', 'bbc', 'news', 'article', 'fun']

### Perform some basic analysis

In [30]:
# Process some sentences from our articles:
test_article = articles["text"][68]
test_sentence = sent_tokenize(test_article)[10]
processed_sentence = preprocess_sentence(test_sentence)
print("test_sentence: ", test_sentence)
print("\nProcessed test_sentence: ", processed_sentence)

test_sentence:  "Boeing has the latest variant in a very successful line of airplanes and there is no doubt it will continue to be very successful," said David Learmount, operations and safety editor at industry magazine Flight International.

Processed test_sentence:  ['boeing', 'late', 'variant', 'successful', 'line', 'airplane', 'doubt', 'continue', 'successful', 'say', 'david', 'learmount', 'operation', 'safety', 'editor', 'industry', 'magazine', 'flight', 'international']


In [32]:
from collections import Counter

# We can count the occurence of words (into a "Bag-of-words"):
print(Counter(processed_sentence))

Counter({'successful': 2, 'boeing': 1, 'late': 1, 'variant': 1, 'line': 1, 'airplane': 1, 'doubt': 1, 'continue': 1, 'say': 1, 'david': 1, 'learmount': 1, 'operation': 1, 'safety': 1, 'editor': 1, 'industry': 1, 'magazine': 1, 'flight': 1, 'international': 1})


In [34]:
# And can convert these into a vector:
counts_series = pd.Series(Counter(processed_sentence)).to_frame().T
counts_series

Unnamed: 0,boeing,late,variant,successful,line,airplane,doubt,continue,say,david,learmount,operation,safety,editor,industry,magazine,flight,international
0,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1


We can also use scikit-learn to vectorise our our test article (note that CountVectorizer requires a text file or a list of strings!):

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

Count_Vector = CountVectorizer() # initiate default vectoriser
Count_Vector.fit_transform([test_sentence])

print(Count_Vector.vocabulary_) # NB: the numbers are indices, not counts!

Also note that minimal processing has been done on this string; stop words haven't been removed and the string hasn't been lemmatised.

We can pass our custom preprocessing function in the initial settings:

In [None]:
Count_Vector2 = CountVectorizer(analyzer = preprocess_sentence)
vectorised = Count_Vector2.fit_transform([test_sentence, test_sentence2])
print(Count_Vector2.vocabulary_)

In [None]:
# We can print these in index order:
print(Count_Vector2.get_feature_names_out())

In [None]:
# And convert the bag-of-words to a count vector: (one row for each list item)
print(vectorised.toarray())

In [None]:
# NB we can perform this over a whole article using the below:
from sklearn.feature_extraction.text import CountVectorizer

print(article_path) # from the 2nd cell of this notebook

with open(article_path, "r") as file:
    vectoriser = CountVectorizer(analyzer = preprocess_sentence)
    vectorised_article = vectoriser.fit_transform(file)
print(vectoriser.get_feature_names_out()[:20])
print(vectorised_article.toarray())

### Advanced Analysis
We can also use ML techniques to analyse our data in more detail!
For instance, we can also develop a Naive Bayes that can predict the subject of our articles:

*Note we could also tune alpha, but for this example we will keep it simple*

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import random
random.seed(10)

In [None]:
articles.head()

In [None]:
# Split into test and training data
X = articles["text"]
Y = articles["category"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# Setup pipeline:pProcess and vectorise, then classify using multinomial Naive Bayes:
NB_model = make_pipeline(CountVectorizer(analyzer = preprocess_sentence), MultinomialNB())

# Train the model:
NB_model.fit(X_train, Y_train)

In [None]:
# Evaluate performance

Y_predictions = NB_model.predict(X_test)

print(accuracy_score(Y_test, Y_predictions))
print(classification_report(Y_test, Y_predictions))

The model had an accuracy of **96.41%**! We can also visualise these results (and other useful data):

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Visualise confusion matrix as a heatmap:
conf_mat = confusion_matrix(Y_test, Y_predictions, labels=NB_model.classes_)

plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=False, xticklabels=NB_model.classes_, yticklabels=NB_model.classes_, cmap='Reds')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Visualise most important words per category

import numpy as np

vectorizer = NB_model.named_steps['countvectorizer']
classifier = NB_model.named_steps['multinomialnb']

feature_names = vectorizer.get_feature_names_out()
class_labels = classifier.classes_

top_n = 10  

# Create subplots: one for each class
fig, axes = plt.subplots(len(class_labels), 1, figsize=(10, 5 * len(class_labels)))

for i, class_label in enumerate(class_labels):
    # Get top feature indices for this class
    class_feature_log_probs = classifier.feature_log_prob_[i]
    top_indices = np.argsort(class_feature_log_probs)[-top_n:]
    top_words = feature_names[top_indices]
    top_probs = class_feature_log_probs[top_indices]
    
    # Convert log probs to actual probabilities
    top_probs_exp = np.exp(top_probs)

    # Plot
    sns.barplot(x=top_probs_exp, y=top_words, ax=axes[i], palette="Blues_d")
    axes[i].set_title(f"Top {top_n} Words for Category: {class_label}")
    axes[i].set_xlabel("P(word | category)")
    axes[i].set_ylabel("Word")

plt.tight_layout()
plt.show()


In [None]:
results_df = pd.DataFrame({
    'text': X_test,
    'actual': Y_test,
    'predicted': Y_predictions
})

# Filter misclassified samples
misclassified = results_df[results_df['actual'] != results_df['predicted']]

print(misclassified.sample(10)[['text', 'actual', 'predicted']])


In [None]:
# Set plot size and style
plt.figure(figsize=(10, 6))
sns.countplot(data=articles, y='category', order=articles['category'].value_counts().index, palette='viridis')

plt.title("Number of Articles per Category (Whole dataset)")
plt.xlabel("Count")
plt.ylabel("Category")
plt.tight_layout()
plt.show()

# Summarising
We can also summarise articles, reducing them to shorter formats. Note that special characters need removing for this, but other processing (e.g. lemmatisation) is not required.

There are different forms of summarisation, the most common being Extractive, TextRank, and Abstractive.

**Extractive** does not generate new text, and simply select and combines existing sentences based on rules. Sentence importance is estimated based on rules, such as the presence of keywords or named entities, position in the text (early and later sentences are more important), and the freqency of non-stop words. It preserves the original content of the selected sentences best, and is therefore less prone to hallucinating/creating errors. It is also quick and inexpensive.

**TextRank** is a form of extractive summarisation, where connections are graphed between similar sentences, with highly-connected sentences being selected as most important. It also doesn't create new text, and can be sophisticated than tradional extration. However, it is more costly to run.

**Abstractive** summarisation aims to "understand" the original text and rephrase by creating new sentences. It uses deep learning (seq2seq) models. Text is encoded, and then decoded into a summary. It offers greater level of reduction than extractive methods, however it is much more computationally expensive, and as it is generative it can hallucinate and create errors.

In [60]:
# a reminder of our first_article:
first_article[:200]

# remove special characters:
text = first_article.replace('\n', ' ').replace('\r', ' ').strip()

In [52]:
#Extractive (Traditional) Summarisation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Split into sentences
sentences = nltk.sent_tokenize(text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)

# Cosine similarity matrix
similarity_matrix = cosine_similarity(X)

# Rank sentences by their "importance" (sum of similarities)
sentence_scores = similarity_matrix.sum(axis=1)

# Select top N sentences
top_n = 3
top_sentence_indices = sentence_scores.argsort()[-top_n:][::-1]
top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

# Print results
print("Original Article:\n", text[:500], "\n...")
print("\nTraditional Extractive Summary:")
for sent in top_sentences:
    print("-", sent)

Original Article:
 Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time  
...

Traditional Extractive Summary:
- Ad sales boost Time Warner profit  Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.
- TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.
- However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.


In [54]:
# TextRank Summarisation:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Set up parser and summarizer
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = TextRankSummarizer()

# Generate summary with N sentences
summary = summarizer(parser.document, sentences_count=3)

# Print result
print("TextRank Summary:")
for sentence in summary:
    print("-", sentence)

TextRank Summary:
- It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband.
- But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results.
- It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue.


In [62]:
# Abstractive Summarisation:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenize and encode the input text
inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary (you can tweak length and decoding strategy)
summary_ids = model.generate(
    inputs,
    max_length=130,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("BART Summary:\n", summary)

BART Summary:
 TimeWarner profits up 76% to $1.13bn for the three months to December. Firm is now one of the biggest investors in Google.
