In [1]:
import nltk

In [2]:
# Sample news text used as the raw input for the sentence splitting,
# preprocessing and TF-IDF cells further down. The triple-quoted literal
# starts and ends with a newline and contains non-ASCII punctuation
# (curly quotes, an ellipsis character, a pound sign).
paragraph = """
Police units around the world have joined forces in a series of covert operations targeting one of West Africa’s most feared criminal networks - Black Axe.
Operation Jackal III saw officers in body armour carry out raids in 21 countries between April and July 2024.
The mission, co-ordinated by global policing agency Interpol, led to the arrest of 300 people with links to Black Axe and other affiliated groups.
Interpol called the operation a “major blow” to the Nigerian crime network, but warned that its international reach and technological sophistication mean it remains a global threat.
In one notorious example, Canadian authorities said they had busted a money-laundering scheme linked to Black Axe worth more than $5bn (£3.8bn) in 2017.
“They are very organised and very structured,” Tomonobu Kaya, a senior official at Interpol’s Financial Crime and Anti-Corruption Centre, told the BBC.
According to a 2022 report by Interpol, “Black Axe and similar groups are responsible for the majority of the world’s cyber-enabled financial fraud as well as many other serious crimes”.
Mr Kaya said innovations in money-transfer software and cryptocurrency have played into the hands of group, which are renowned for multi-million dollar online scams.
“These criminal syndicates are early adopters of new technologies… A lot of fintech developments make it really easy to illegally move money around the world,” he said.
"""

In [3]:
# Echo the raw string for inspection; the repr shows the embedded line breaks as \n.
paragraph

'\nPolice units around the world have joined forces in a series of covert operations targeting one of West Africa’s most feared criminal networks - Black Axe.\nOperation Jackal III saw officers in body armour carry out raids in 21 countries between April and July 2024.\nThe mission, co-ordinated by global policing agency Interpol, led to the arrest of 300 people with links to Black Axe and other affiliated groups.\nInterpol called the operation a “major blow” to the Nigerian crime network, but warned that its international reach and technological sophistication mean it remains a global threat.\nIn one notorious example, Canadian authorities said they had busted a money-laundering scheme linked to Black Axe worth more than $5bn (£3.8bn) in 2017.\n“They are very organised and very structured,” Tomonobu Kaya, a senior official at Interpol’s Financial Crime and Anti-Corruption Centre, told the BBC.\nAccording to a 2022 report by Interpol, “Black Axe and similar groups are responsible for t

In [6]:
import re
from nltk.stem import PorterStemmer # Stemming is done using this library
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
# One instance of each normaliser, created once and reused by the
# stemming and lemmatizing loops later in the notebook.
ps, wordnet = PorterStemmer(), WordNetLemmatizer()

In [9]:
# Split the raw text into a list of sentence strings; every later
# preprocessing step works sentence by sentence.
sentences = nltk.tokenize.sent_tokenize(text=paragraph)

### Using Stemming

In [10]:
# Build the stemmed corpus: one cleaned, space-joined string per sentence.

# Fetch the English stopword list once. Building the set inside the
# comprehension condition re-reads the same list for every single token.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space. The class
    # must be A-Z: the range A-z also spans [ \ ] ^ _ and `, which sit between
    # 'Z' and 'a' in ASCII and would otherwise survive the cleanup.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    # Drop stopwords, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stems))

In [12]:
# Fit a TF-IDF vectorizer on the stemmed corpus, then expand the sparse
# result into a dense NumPy array for inspection.
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [13]:
# Show the dense TF-IDF array computed from the stemmed corpus.
X

array([[0.        , 0.        , 0.        , ..., 0.26272611, 0.19293796,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.28936967, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.26437191, 0.        , 0.        , ..., 0.        , 0.19414658,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.25191217, 0.        , ..., 0.        , 0.18499653,
        0.        ]])

### Using WordNetLemmatizer

In [14]:
# Rebuild the corpus with lemmatization instead of stemming: one cleaned,
# space-joined string per sentence. This overwrites the stemmed `corpus`.

# Fetch the English stopword list once. Building the set inside the
# comprehension condition re-reads the same list for every single token.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space. The class
    # must be A-Z: the range A-z also spans [ \ ] ^ _ and `, which sit between
    # 'Z' and 'a' in ASCII and would otherwise survive the cleanup.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    # Drop stopwords, then map each remaining token to its WordNet lemma.
    lemmas = [wordnet.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))

In [15]:
# Fit a fresh TF-IDF vectorizer on the lemmatized corpus, then expand the
# sparse result into a dense NumPy array. Rebinds `cv` and `X`.
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [16]:
# Show the dense TF-IDF array computed from the lemmatized corpus.
X

array([[0.        , 0.        , 0.        , ..., 0.26016514, 0.19105725,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.28266511, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.26176298, 0.        , 0.        , ..., 0.        , 0.19223066,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.24965193, 0.        , ..., 0.        , 0.18333668,
        0.        ]])

In [17]:
# Term -> feature-index mapping learned by the vectorizer fitted on the
# lemmatized corpus; each index identifies that term's column in X.
cv.vocabulary_

{'police': 82,
 'unit': 110,
 'around': 8,
 'world': 114,
 'joined': 50,
 'force': 39,
 'series': 96,
 'covert': 25,
 'operation': 77,
 'targeting': 103,
 'one': 75,
 'west': 113,
 'africa': 3,
 'feared': 36,
 'criminal': 27,
 'network': 69,
 'black': 13,
 'axe': 11,
 'jackal': 49,
 'iii': 44,
 'saw': 92,
 'officer': 73,
 'body': 16,
 'armour': 7,
 'carry': 20,
 'raid': 84,
 'country': 24,
 'april': 6,
 'july': 51,
 'mission': 64,
 'co': 22,
 'ordinated': 78,
 'global': 41,
 'policing': 83,
 'agency': 4,
 'interpol': 48,
 'led': 54,
 'arrest': 9,
 'people': 80,
 'link': 55,
 'affiliated': 2,
 'group': 42,
 'called': 18,
 'major': 58,
 'blow': 14,
 'nigerian': 71,
 'crime': 26,
 'warned': 111,
 'international': 47,
 'reach': 85,
 'technological': 104,
 'sophistication': 100,
 'mean': 62,
 'remains': 87,
 'threat': 106,
 'notorious': 72,
 'example': 35,
 'canadian': 19,
 'authority': 10,
 'said': 91,
 'busted': 17,
 'money': 65,
 'laundering': 53,
 'scheme': 94,
 'linked': 56,
 'worth': 