In [1]:
import re
import nltk
import string
import numpy as np
import pandas as pd

from typing import List, Dict
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
stop_words = set(stopwords.words("english"))

In [3]:
def preprocess_text(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )
    text_tokens = word_tokenize(text)
    text_without_stopwords = [word for word in text_tokens if word not in stop_words]
    return text_without_stopwords

In [4]:
preprocess_text("Hi there, this is a test text !!!")

['hi', 'test', 'text']

In [5]:
texts_filepath = "text_to_cluster.txt"

with open(texts_filepath) as infile:
    data = infile.readlines()
    
texts_df = pd.DataFrame(data={"texts": data}, columns=["texts"])
texts_df.head()

Unnamed: 0,texts
0,Ransomware attack at Mexico's Pemex halts work...
1,#city | #ransomware | Ransomware Attack At Mex...
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ..."
3,A Mexican oil company was hit by ransomware at...
4,Pemex Struck by Ransomware Attack\n


In [6]:
texts_df["cleaned_text"] = texts_df["texts"].apply(lambda x: preprocess_text(x))
texts_df.head()

Unnamed: 0,texts,cleaned_text
0,Ransomware attack at Mexico's Pemex halts work...,"[ransomware, attack, mexicos, pemex, halts, wo..."
1,#city | #ransomware | Ransomware Attack At Mex...,"[city, ransomware, ransomware, attack, mexico,..."
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ...","[mexicos, pemex, oil, suffers, ransomware, att..."
3,A Mexican oil company was hit by ransomware at...,"[mexican, oil, company, hit, ransomware, attack]"
4,Pemex Struck by Ransomware Attack\n,"[pemex, struck, ransomware, attack]"


In [7]:
cleaned_text_tokens = texts_df["cleaned_text"].values.tolist()
cleaned_text_tokens[0]

['ransomware',
 'attack',
 'mexicos',
 'pemex',
 'halts',
 'work',
 'threatens',
 'cripple',
 'computers']

In [8]:
w2v_model = Word2Vec(sentences=cleaned_text_tokens, vector_size=100, workers=2)

In [17]:
type(w2v_model.wv.most_similar("ransomware"))

list

In [23]:
def create_vectors(docs: List[List[str]]) -> List[List[float]]:
    all_vectors = []

    for text_tokens in docs:
        zero_vector = np.zeros(w2v_model.vector_size)
        vectors = []
        for token in text_tokens:
            if token in w2v_model.wv:
                try:
                    vectors.append(w2v_model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vector = vectors.mean(axis=0)
            all_vectors.append(avg_vector)
        else:
            all_vectors.append(zero_vector)
    return all_vectors

In [11]:
all_text_vectors = create_vectors(cleaned_text_tokens)

In [20]:
all_text_vectors[0].ndim

1

In [13]:
len(all_text_vectors[0])

100

In [14]:
kmeans_model = KMeans(n_clusters=6).fit(all_text_vectors)

In [15]:
texts_df["cluster_label"] = kmeans_model.labels_

In [21]:
texts_df

Unnamed: 0,texts,cleaned_text,cluster_label
0,Ransomware attack at Mexico's Pemex halts work...,"[ransomware, attack, mexicos, pemex, halts, wo...",1
1,#city | #ransomware | Ransomware Attack At Mex...,"[city, ransomware, ransomware, attack, mexico,...",1
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ...","[mexicos, pemex, oil, suffers, ransomware, att...",1
3,A Mexican oil company was hit by ransomware at...,"[mexican, oil, company, hit, ransomware, attack]",1
4,Pemex Struck by Ransomware Attack\n,"[pemex, struck, ransomware, attack]",3
...,...,...,...
679,Detecting and Responding to Ransomware\n,"[detecting, responding, ransomware]",3
680,"Emotet malware is back, more dangerous than ev...","[emotet, malware, back, dangerous, ever]",5
681,Hosting provider SmarterASP.NET hit by ransomw...,"[hosting, provider, smarteraspnet, hit, ransom...",1
682,Ransomware as a Service (RaaS) – A Contemporar...,"[ransomware, service, raas, –, contemporary, m...",1


In [16]:
texts_df.head()

Unnamed: 0,texts,cleaned_text,cluster_label
0,Ransomware attack at Mexico's Pemex halts work...,"[ransomware, attack, mexicos, pemex, halts, wo...",1
1,#city | #ransomware | Ransomware Attack At Mex...,"[city, ransomware, ransomware, attack, mexico,...",1
2,"Mexico's Pemex Oil Suffers Ransomware Attack, ...","[mexicos, pemex, oil, suffers, ransomware, att...",1
3,A Mexican oil company was hit by ransomware at...,"[mexican, oil, company, hit, ransomware, attack]",1
4,Pemex Struck by Ransomware Attack\n,"[pemex, struck, ransomware, attack]",3
