In [4]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [18]:
# Sample article text: the input document that is split into sentences and cleaned.
# The literal starts with a newline, so the raw string begins with '\n'.
paragraph = """
Police units around the world have joined forces in a series of covert operations targeting one of West Africa’s most feared criminal networks - Black Axe.
Operation Jackal III saw officers in body armour carry out raids in 21 countries between April and July 2024.
The mission, co-ordinated by global policing agency Interpol, led to the arrest of 300 people with links to Black Axe and other affiliated groups.
Interpol called the operation a “major blow” to the Nigerian crime network, but warned that its international reach and technological sophistication mean it remains a global threat.
In one notorious example, Canadian authorities said they had busted a money-laundering scheme linked to Black Axe worth more than $5bn (£3.8bn) in 2017.
“They are very organised and very structured,” Tomonobu Kaya, a senior official at Interpol’s Financial Crime and Anti-Corruption Centre, told the BBC.
According to a 2022 report by Interpol, “Black Axe and similar groups are responsible for the majority of the world’s cyber-enabled financial fraud as well as many other serious crimes”.
Mr Kaya said innovations in money-transfer software and cryptocurrency have played into the hands of group, which are renowned for multi-million dollar online scams.
“These criminal syndicates are early adopters of new technologies… A lot of fintech developments make it really easy to illegally move money around the world,” he said.
"""

In [19]:
# Show the raw string; its repr keeps the embedded '\n' line breaks visible.
paragraph

'\nPolice units around the world have joined forces in a series of covert operations targeting one of West Africa’s most feared criminal networks - Black Axe.\nOperation Jackal III saw officers in body armour carry out raids in 21 countries between April and July 2024.\nThe mission, co-ordinated by global policing agency Interpol, led to the arrest of 300 people with links to Black Axe and other affiliated groups.\nInterpol called the operation a “major blow” to the Nigerian crime network, but warned that its international reach and technological sophistication mean it remains a global threat.\nIn one notorious example, Canadian authorities said they had busted a money-laundering scheme linked to Black Axe worth more than $5bn (£3.8bn) in 2017.\n“They are very organised and very structured,” Tomonobu Kaya, a senior official at Interpol’s Financial Crime and Anti-Corruption Centre, told the BBC.\nAccording to a 2022 report by Interpol, “Black Axe and similar groups are responsible for t

## Cleaning text using stemming

In [20]:
# Split the raw text into a list of sentences.
# 'english' is the tokenizer's default language; it is spelled out here for clarity.
sentences = nltk.sent_tokenize(paragraph, language='english')

In [21]:
# Inspect the list of sentences produced by the tokenizer.
sentences

['\nPolice units around the world have joined forces in a series of covert operations targeting one of West Africa’s most feared criminal networks - Black Axe.',
 'Operation Jackal III saw officers in body armour carry out raids in 21 countries between April and July 2024.',
 'The mission, co-ordinated by global policing agency Interpol, led to the arrest of 300 people with links to Black Axe and other affiliated groups.',
 'Interpol called the operation a “major blow” to the Nigerian crime network, but warned that its international reach and technological sophistication mean it remains a global threat.',
 'In one notorious example, Canadian authorities said they had busted a money-laundering scheme linked to Black Axe worth more than $5bn (£3.8bn) in 2017.',
 '“They are very organised and very structured,” Tomonobu Kaya, a senior official at Interpol’s Financial Crime and Anti-Corruption Centre, told the BBC.',
 'According to a 2022 report by Interpol, “Black Axe and similar groups ar

In [22]:
# Stemmer used by the stemming pass (reduces words to stems such as 'polic', 'forc').
ps = PorterStemmer()
# Lemmatizer used by the lemmatization pass (returns dictionary forms such as 'police', 'force').
wordnet = WordNetLemmatizer()

In [23]:
# Build the stop-word set once up front; the previous version rebuilt it for
# every single word inside the comprehension.
stop_words = set(stopwords.words('english'))

# Stemming pass: one cleaned, space-joined string of stems per input sentence.
corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space.
    # The upper bound must be 'Z': the range 'A-z' also covers the characters
    # [ \ ] ^ _ ` and would leave them in the text.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [24]:
# Inspect the stemmed sentences (stems are not always dictionary words, e.g. 'polic', 'forc').
corpus

['polic unit around world join forc seri covert oper target one west africa fear crimin network black axe',
 'oper jackal iii saw offic bodi armour carri raid countri april juli',
 'mission co ordin global polic agenc interpol led arrest peopl link black axe affili group',
 'interpol call oper major blow nigerian crime network warn intern reach technolog sophist mean remain global threat',
 'one notori exampl canadian author said bust money launder scheme link black axe worth bn bn',
 'organis structur tomonobu kaya senior offici interpol financi crime anti corrupt centr told bbc',
 'accord report interpol black axe similar group respons major world cyber enabl financi fraud well mani seriou crime',
 'mr kaya said innov money transfer softwar cryptocurr play hand group renown multi million dollar onlin scam',
 'crimin syndic earli adopt new technolog lot fintech develop make realli easi illeg move money around world said']

## Cleaning text using lemmatization

In [25]:
# Build the stop-word set once up front; the previous version rebuilt it for
# every single word inside the comprehension.
stop_words = set(stopwords.words('english'))

# Lemmatization pass: one cleaned, space-joined string of lemmas per sentence.
# NOTE: this rebinds `corpus`, so the stemmed sentences built earlier are replaced.
corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space.
    # The upper bound must be 'Z': the range 'A-z' also covers the characters
    # [ \ ] ^ _ ` and would leave them in the text.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()
    review = review.split()
    # lemmatize() is called without a part-of-speech tag, so it uses its default
    # (noun); inflected verb forms such as 'joined' are therefore left unchanged.
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [26]:
# Inspect the lemmatized sentences.
corpus

['police unit around world joined force series covert operation targeting one west africa feared criminal network black axe',
 'operation jackal iii saw officer body armour carry raid country april july',
 'mission co ordinated global policing agency interpol led arrest people link black axe affiliated group',
 'interpol called operation major blow nigerian crime network warned international reach technological sophistication mean remains global threat',
 'one notorious example canadian authority said busted money laundering scheme linked black axe worth bn bn',
 'organised structured tomonobu kaya senior official interpol financial crime anti corruption centre told bbc',
 'according report interpol black axe similar group responsible majority world cyber enabled financial fraud well many serious crime',
 'mr kaya said innovation money transfer software cryptocurrency played hand group renowned multi million dollar online scam',
 'criminal syndicate early adopter new technology lot fin

## Bag-of-words model

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv = CountVectorizer() # Defaults to raw term counts; pass binary=True to record only presence/absence (0/1)

In [29]:
# Learn the vocabulary from the cleaned sentences and count term occurrences per sentence.
doc_term_sparse = cv.fit_transform(corpus)
# Convert the sparse result to a dense NumPy array so it can be displayed and indexed directly.
X = doc_term_sparse.toarray()

In [30]:
# Document-term count matrix: one row per sentence in `corpus`, one column per vocabulary term.
X

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0]], dtype=int64)

In [31]:
cv.vocabulary_  ## Maps each vocabulary term to its column index in X

{'police': 82,
 'unit': 110,
 'around': 8,
 'world': 114,
 'joined': 50,
 'force': 39,
 'series': 96,
 'covert': 25,
 'operation': 77,
 'targeting': 103,
 'one': 75,
 'west': 113,
 'africa': 3,
 'feared': 36,
 'criminal': 27,
 'network': 69,
 'black': 13,
 'axe': 11,
 'jackal': 49,
 'iii': 44,
 'saw': 92,
 'officer': 73,
 'body': 16,
 'armour': 7,
 'carry': 20,
 'raid': 84,
 'country': 24,
 'april': 6,
 'july': 51,
 'mission': 64,
 'co': 22,
 'ordinated': 78,
 'global': 41,
 'policing': 83,
 'agency': 4,
 'interpol': 48,
 'led': 54,
 'arrest': 9,
 'people': 80,
 'link': 55,
 'affiliated': 2,
 'group': 42,
 'called': 18,
 'major': 58,
 'blow': 14,
 'nigerian': 71,
 'crime': 26,
 'warned': 111,
 'international': 47,
 'reach': 85,
 'technological': 104,
 'sophistication': 100,
 'mean': 62,
 'remains': 87,
 'threat': 106,
 'notorious': 72,
 'example': 35,
 'canadian': 19,
 'authority': 10,
 'said': 91,
 'busted': 17,
 'money': 65,
 'laundering': 53,
 'scheme': 94,
 'linked': 56,
 'worth': 

In [32]:
# Count vector for the first sentence: one entry per vocabulary term.
X[0]

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0], dtype=int64)