## What is FastText?
FastText is a library created by the Facebook Research Team for efficient learning of word representations and sentence classification

In [1]:
import os
import pandas as pd
import numpy as np
import fasttext
import swifter
import re
import string

import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading path 
path1 = r'/home/rahul/bikram sir/text_classification_corpus/cybersecurity_tweets.csv'
path2 = r'/home/rahul/bikram sir/text_classification_corpus/not_cybersecurity_tweets.csv'

In [3]:
csv1 = pd.read_csv(path1)
csv2 = pd.read_csv(path2)

df = pd.concat([csv1,csv2])

In [4]:
df

Unnamed: 0,text,label
0,#AI Robo-Advisers and the Future of Financial ...,1
1,Hackers causing havoc on Mexican banking syste...,1
2,CVE-2019-13127 An issue was discovered in mxGr...,1
3,How to protect your online identity #labourcyb...,1
4,Forensic Acquisition - Shadow Cyber Sec https...,1
...,...,...
199995,Attackers Test Weak Passwords in Purple Fox Ma...,0
199996,Aviso para los usuarios de correo de @AytoMurc...,0
199997,NPM had an obfuscated birthday-like easter egg...,0
199998,CPUの脆弱性「Spectre」に対応してAMDがリリースしたパッチに問題があることをInt...,0


In [5]:
# df.text.tolist()

## Text Preprocessing

In [6]:
class text_preprocess:
    
    def __init__(self):
        pass

    def convert_to_lower(self, text):
        return text.lower()

    
    def remove_emojis(self, text):
        text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
        text = re.sub(r"<.*?>","",text)

        wierd_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emotions
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u'\U00010000-\U0010ffff'
            u"\u200d"
            u"\u2640-\u2642"
            u"\u2600-\u2B55"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\u3030"
            u"\ufe0f"
            u"\u2069"
            u"\u2066"
            u"\u200c"
            u"\u2068"
            u"\u2067"
            "]+", flags=re.UNICODE)

        rm_emoji = wierd_pattern.sub(r'', text)
        return rm_emoji

    def remove_html(self, text):
        html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        rm_html = re.sub(html, '', text)
        return rm_html

    
    def remove_URL(self,text):
        url = re.compile(r'https?://\S+|www\.\S+')
        URL = url.sub(r'', text)
        return URL

    
    def remove_non_ascii(self, text):
        return re.sub(r'[^\x00-\x7f]',r'', text) # or ''.join([x for x in text if x in string.printable])     
    
    
    def remove_numbers(self, text):
        number_pattern = r'\d+'
        without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
        return without_number


    def remove_punctuation(self,text):
        return text.translate(str.maketrans('', '', string.punctuation))


    def remove_extra_white_spaces(self, text):
        single_char_pattern = re.compile(r'\s+[a-zA-Z]\s+')
        without_sc = re.sub(single_char_pattern, "", text)
#         without_sc = text.replace(' ', '')
        return without_sc

    def unwanted_chrs(self, text):
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
        return (text)


    def preprocessText(self,text):            
        return self.remove_extra_white_spaces(self.unwanted_chrs(self.remove_non_ascii(self.remove_URL(self.remove_html(self.remove_punctuation(self.remove_numbers(self.remove_emojis(self.convert_to_lower(text)))))))))


In [7]:
if __name__ == '__main__':
    text_prprocess_obj = text_preprocess()
    df.text = df.text.swifter.apply(lambda x: text_prprocess_obj.preprocessText(x))

Pandas Apply:   0%|          | 0/400000 [00:00<?, ?it/s]

In [8]:
text_prprocess_obj = text_preprocess()

In [9]:
x = '  IHGFIKHLKJ AFH '
text_prprocess_obj.preprocessText(x)

'  ihgfikhlkj afh '

In [10]:
# df.text.tolist()

## Tokenization

In [11]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

2023-01-07 16:36:24.900641: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-07 16:36:31.563957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-07 16:36:31.564099: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-01-07 16:36:32.839789: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is 

In [12]:
# Tokenizing the tweet base texts.
def tokenize(text):
    my_doc = nlp(text)
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    return token_list    

In [13]:
df.text = df.text.swifter.apply(lambda x: tokenize(x))

Pandas Apply:   0%|          | 0/400000 [00:00<?, ?it/s]

In [14]:
# df.text.tolist()

## Remove StopWords

In [15]:
def remove_stopwords(text):    
    filtered_sentence =[] 
    for word in text:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    return " ".join(filtered_sentence)

In [16]:
df.text = df.text.swifter.apply(lambda x: remove_stopwords(x))

Pandas Apply:   0%|          | 0/400000 [00:00<?, ?it/s]

In [17]:
# df.text.tolist()

## Stemmizing
 * porter - PorterStemmer()
     * snowball - LancasterStemmer()
         * Lancster - SnowballStemmer()

In [18]:
from nltk.stem.snowball import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')
  
def stemmizing(text):    
    #stem of each word
    stem_words = []
    for w in text:
        x = snow_stemmer.stem(w)
        stem_words.append(x)
    return "".join(stem_words)

In [19]:
df['text'] = df.text.swifter.apply(lambda x: stemmizing(x))

Pandas Apply:   0%|          | 0/400000 [00:00<?, ?it/s]

In [20]:
df

Unnamed: 0,text,label
0,ai roboadvisers future financial advice data...,1
1,hackers causing havoc mexican banking system c...,1
2,cve issue discovered mxgraph related ...,1
3,protect online identity labourcyberattack cybe...,1
4,forensic acquisition shadow cyber sec cy...,1
...,...,...
199995,attackers test weak passwords purple fox malwa...,0
199996,aviso para los usuarios de correo de recibi...,0
199997,npm obfuscated birthdaylike easter egg mistake...,0
199998,cpuspectreamdintel gigazine intelcpu,0


In [21]:
# df.text.tolist()

## Lemmatization

In [22]:
import nltk
from nltk.stem import WordNetLemmatizer

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

words = set(nltk.corpus.words.words())
# words = nltk.word_tokenize(corpus)

class lemmatization:
    
    def __init__(self):
        pass
    
    def lemmatizing_space(self, text):   
        return " ".join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

    def lemmatizing_words(self, text):
        return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

    def lemmatize(self, text):
        return self.lemmatizing_space(self.lemmatizing_words(text))


In [23]:
lemmatization_obj = lemmatization()

In [24]:
df['text'] = df.text.swifter.apply(lambda x: lemmatization_obj.lemmatize(x))

Pandas Apply:   0%|          | 0/400000 [00:00<?, ?it/s]

In [25]:
# df.text.tolist()

## Text Features Extraction:
  * Weighted Words - Bag of Words (BoW) - Bag of n-grams:
      * Frequency Vectors - CountVectorizer:
          * Term Frequency-Inverse Document Frequency (TF-IDF):



## Word Embedding
* Global Vectors for Word Representation (GloVe):
  * Word2vec
    * fastText

## fastText

In [26]:
#data split for train and test

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1, random_state=42)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [27]:
#Data prepare for fasttext format for train

train["label_format"]=0
    
for i in train.index:
    train.loc[i, 'label_format'] = "__label__"+str(train.label[i])+" "+str(train.text[i])

In [28]:
# train.label_format.tolist()

In [29]:
#Data prepare for fasttext format for test

test["label_format"]=0
    
for i in test.index:
    test.loc[i, 'label_format'] = "__label__"+str(test.label[i])+" "+str(test.text[i])

In [30]:
import fasttext

In [31]:
train.label_format.to_csv('fasttext_train.txt',index=None,header=None)


In [32]:
test.label_format.to_csv('fasttext_test.txt',index=None,header=None)


In [33]:
#train classifier model
model = fasttext.train_supervised('fasttext_train.txt',epoch=50,lr=0.05,label_prefix='__label__',dim=300)

Read 2M words
Number of words:  21245
Number of labels: 2
Progress: 100.0% words/sec/thread:  561425 lr:  0.000000 avg.loss:  0.440792 ETA:   0h 0m 0s 94.7% words/sec/thread:  558268 lr:  0.002629 avg.loss:  0.441697 ETA:   0h 0m 4s


In [34]:
# model = fasttext.train_unsupervised('fasttext_train.txt',epoch=50,lr=0.05,label_prefix='__label__',dim=300)

In [35]:
#test fasttext model
model.test('fasttext_test.txt')

(40000, 0.790425, 0.790425)

In [36]:
result = model.test('fasttext_train.txt')
validation = model.test('fasttext_test.txt')

In [37]:
# DISPLAY ACCURACY OF TRAINED MODEL
text_line =  "accuracy:" + str(result[1])  +  ",validation:" + str(validation[1]) + '\n' 
print(text_line)

accuracy:0.8044555555555556,validation:0.790425



In [38]:
model.predict('How to protect your online identity #labourcyberattack #CyberSecurity #CyberAttack   https://t.co/vyhQU2mpiR')

(('__label__1',), array([0.76435316]))

In [39]:
model.predict('#Malware #AgentTesla targets #Italy 🇮🇹  "Re Fwd: AVVISO DI PAGAMENTO"  🔥mail.[chinacbn[.biz b.results@[chinacbn.[biz  ⚙️https://t.co/YgXjhTpgyZ  @guelfoweb  @VirITeXplorer @58_158_177_102 @rootella_  #CyberSecurity #infosec #DFIR #cybercrime https://t.co/LXIA9kdn1S',)

(('__label__0',), array([0.5879426]))

In [40]:
model.predict('#Ransomware nuovo attacco #hacker  Colpita #Gigabyte https://t.co/OvXKqfiCVB')

(('__label__0',), array([0.58131564]))

In [41]:
model.predict('One group is responsible for more than 400 #ransomware attacks against U.S. and international organizations https://t.co/8s8sfBbW5t')

(('__label__1',), array([0.78413188]))

In [42]:
model.predict('Radware is hiring! Apply to work at a Glassdoor top 10 Cybersecurity company today! https://t.co/u0i8x0TsY9 #cybersecurity #sales')

(('__label__1',), array([0.75963771]))

In [43]:
model.predict('#Cybersecurity360 #js #cybersecurity  #linux #mac #osx  Update Feeds [skip ci]  670e108 8 minutes ago  mitchellkrogza / Phishing.Database. https://t.co/0Ud2gMqiVj')

(('__label__0',), array([0.8326267]))

In [44]:
model.predict('SMBs Severely Underestimate Data Breach Costs https://t.co/Vq8RsvauUb #SMB #small #business #breach #costs #CyberSecurity #riskmangement')

(('__label__0',), array([0.5879426]))

In [45]:
df

Unnamed: 0,text,label
0,ai future financial advice privacy ai,1
1,causing havoc banking system,1
2,issue discovered related confluence improper i...,1
3,protect identity,1
4,forensic acquisition shadow sec,1
...,...,...
199995,test weak purple fox,0
199996,aviso para de de con,0
199997,easter egg mistaken think,0
199998,,0


## Advanced Word Embedding Methods - Deep Contextualized Word Representations:¶

    # * Bidirectional Encoder Representations from Transformers (BERT):¶
