In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import swifter

In [2]:
# Load the two labelled tweet collections: cybersecurity-related and unrelated.
cyber, not_cyber = (
    pd.read_csv(csv_path)
    for csv_path in ("cybersecurity_tweets.csv", "not_cybersecurity_tweets.csv")
)

In [3]:
# Stack both frames row-wise; ignore_index=True renumbers the combined rows
# from 0 instead of keeping each file's own index.
df = pd.concat((cyber, not_cyber), axis=0, ignore_index=True)

In [4]:
# Preview the first rows of the combined frame (tweet text and its 0/1 label).
df.head()

Unnamed: 0,text,label
0,#AI Robo-Advisers and the Future of Financial ...,1
1,Hackers causing havoc on Mexican banking syste...,1
2,CVE-2019-13127 An issue was discovered in mxGr...,1
3,How to protect your online identity #labourcyb...,1
4,Forensic Acquisition - Shadow Cyber Sec https...,1


In [5]:
# (rows, columns) of the combined frame.
df.shape

(400000, 2)

In [6]:
# Class balance check: number of rows per label value.
df['label'].value_counts()

1    200000
0    200000
Name: label, dtype: int64

In [7]:
# Drop rows that repeat an earlier row in every column, keeping the first
# occurrence. The index is not renumbered, so it may have gaps afterwards.
df = df.drop_duplicates(keep='first')

In [8]:
# English stop-word list as a set, so membership tests in tokenize() are fast.
STOPWORDS=set(stopwords.words('english'))
# One shared lemmatizer instance reused for every tweet.
lemmatizer= WordNetLemmatizer()

def tokenize(text):
    """Clean one tweet and return it as a single space-separated string.

    Steps, in order: strip links, lower-case, replace every non-letter with a
    space, tokenize, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw tweet text. ``None`` and float NaN are treated as an empty tweet.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; ``""`` when the input
        is missing or nothing survives the cleaning.
    """
    # Missing values (None, or the float NaN pandas uses for empty CSV fields)
    # would make re.sub raise TypeError; treat them as an empty tweet instead.
    # Any other non-string input still raises, exactly as before.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # 1. Remove links
    text = re.sub(r"http\S+", "", text)

    # 2. Normalize: lower-case, then turn every non-letter character
    #    (digits, punctuation, accented letters, emoji) into a space
    text = re.sub("[^a-zA-Z]", " ", text.lower())

    # 3. Tokenize: split the text into words
    tokens = word_tokenize(text)

    # 4. Remove stop words and 5. lemmatize the remaining tokens
    #    (lemmatizer is called with its default part-of-speech setting)
    lemmas = [lemmatizer.lemmatize(w) for w in tokens if w not in STOPWORDS]

    # Back to a single string from the token list
    return " ".join(lemmas)

In [9]:
# Clean every tweet with tokenize(); swifter shows a progress bar while applying.
df['Cleantext'] = df['text'].swifter.apply(tokenize)

Pandas Apply:   0%|          | 0/390162 [00:00<?, ?it/s]

In [10]:
# Compare the raw text with the new Cleantext column on the first rows.
df.head()

Unnamed: 0,text,label,Cleantext
0,#AI Robo-Advisers and the Future of Financial ...,1,ai robo adviser future financial advice datapr...
1,Hackers causing havoc on Mexican banking syste...,1,hacker causing havoc mexican banking system cy...
2,CVE-2019-13127 An issue was discovered in mxGr...,1,cve issue discovered mxgraph related diagram p...
3,How to protect your online identity #labourcyb...,1,protect online identity labourcyberattack cybe...
4,Forensic Acquisition - Shadow Cyber Sec https...,1,forensic acquisition shadow cyber sec cybersec...


In [11]:
# fastText's supervised format marks labels with the "__label__" prefix.
# Strip any prefix already present before adding it, so re-running this cell
# does not stack it twice ("__label____label__1").
bare_label = df['label'].astype(str).str.replace("__label__", "", regex=False)
df['label'] = "__label__" + bare_label
df.head()

Unnamed: 0,text,label,Cleantext
0,#AI Robo-Advisers and the Future of Financial ...,__label__1,ai robo adviser future financial advice datapr...
1,Hackers causing havoc on Mexican banking syste...,__label__1,hacker causing havoc mexican banking system cy...
2,CVE-2019-13127 An issue was discovered in mxGr...,__label__1,cve issue discovered mxgraph related diagram p...
3,How to protect your online identity #labourcyb...,__label__1,protect online identity labourcyberattack cybe...
4,Forensic Acquisition - Shadow Cyber Sec https...,__label__1,forensic acquisition shadow cyber sec cybersec...


In [12]:
# One training line per tweet: "<prefixed label> <cleaned text>".
df['text_description'] = df['label'].str.cat(df['Cleantext'], sep=" ")
df.head()

Unnamed: 0,text,label,Cleantext,text_description
0,#AI Robo-Advisers and the Future of Financial ...,__label__1,ai robo adviser future financial advice datapr...,__label__1 ai robo adviser future financial ad...
1,Hackers causing havoc on Mexican banking syste...,__label__1,hacker causing havoc mexican banking system cy...,__label__1 hacker causing havoc mexican bankin...
2,CVE-2019-13127 An issue was discovered in mxGr...,__label__1,cve issue discovered mxgraph related diagram p...,__label__1 cve issue discovered mxgraph relate...
3,How to protect your online identity #labourcyb...,__label__1,protect online identity labourcyberattack cybe...,__label__1 protect online identity labourcyber...
4,Forensic Acquisition - Shadow Cyber Sec https...,__label__1,forensic acquisition shadow cyber sec cybersec...,__label__1 forensic acquisition shadow cyber s...


# Split the dataset into Training and Testing datasets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split (and the tweet.train / tweet.test
# files written from it) is the same on every run; stratify keeps the
# proportion of each label equal in both parts.
train, test = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['label']
)

In [15]:
# Sizes of the two parts as ((train rows, cols), (test rows, cols)).
(train.shape, test.shape)

((273113, 4), (117049, 4))

In [16]:
# Write only the "label + text" column, one example per line with no header
# and no index, which is the plain-text input fastText reads.
for frame, out_path in ((train, "tweet.train"), (test, "tweet.test")):
    frame.to_csv(out_path, columns=["text_description"], index=False, header=False)

## Train the model and evaluate performance

In [17]:
import fasttext
# Train a supervised classifier on the training file with fastText's default settings.
model = fasttext.train_supervised(input="tweet.train")
# Evaluate on the held-out file; the displayed tuple is (sample count, precision, recall).
model.test("tweet.test")

Read 4M words
Number of words:  184525
Number of labels: 2
Progress: 100.0% words/sec/thread: 1984170 lr:  0.000000 avg.loss:  0.030807 ETA:   0h 0m 0s


(117049, 0.9902690326273612, 0.9902690326273612)

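`model.test` returns a tuple: the first element (117049) is the number of test samples, and the second and third are precision and recall at k=1, respectively. Because every tweet has exactly one label and one prediction is made per tweet, the two values are equal. We are getting around 99% precision, which is pretty good.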

## Now let's run predictions for a few text descriptions

In [18]:
# Spanish-language tweet about phishing, passed in already-cleaned form
# (lower-case, letters only). The result is a (labels, probabilities) pair.
model.predict("cuidado el phishing suplantaci n de identidad fue uno de los delitos en internet reportados durante el te contamos de qu va malware ciberseguridad seguridadinformatica marketcrosslatam")

(('__label__0',), array([0.99743682]))

In [19]:
# Cleaned English tweet about malware delivered through pirated software.
model.predict("popular pirated software used lure serve malware dropper cybersecurity via questechie")

(('__label__1',), array([1.00001001]))

In [20]:
# Cleaned English tweet about ransomware payments.
model.predict("cybersecurity nigerian company mercy cybercriminals pay ransomware attack report say")

(('__label__1',), array([1.00001001]))

In [32]:
# Hashtag-heavy text that mixes security terms with unrelated topics.
model.predict("databreach robotics machinelearning augmentedreality fintech deeplearning science cloudsecurity java goal blessed fitness vegan huge thanks everyone viewed linkedin profile check add wish")

(('__label__0',), array([0.99606347]))

In [22]:
# Words whose learned vectors are closest to "cloudsecurity", as (score, word) pairs.
# NOTE(review): the recorded scores exceed 1 and the list matches the one
# returned for "nigerian" — unexpected for a similarity score; verify before
# relying on these neighbours.
model.get_nearest_neighbors("cloudsecurity")

[(2.6689541339874268, 'itsmalware'),
 (2.081071138381958, 'usedincurrentattacks'),
 (1.729984998703003, 'bessemer'),
 (1.5863993167877197, 'necn'),
 (1.550126075744629, 'kubkon'),
 (1.546589732170105, 'attomushq'),
 (1.546589732170105, 'tobiaalberti'),
 (1.4353551864624023, 'bazalloader'),
 (1.4307018518447876, 'dsespitia'),
 (1.2304000854492188, 'jasonfossen')]

In [23]:
# Words whose learned vectors are closest to "nigerian", as (score, word) pairs.
# NOTE(review): the recorded scores exceed 1 and the list matches the one
# returned for "cloudsecurity" — unexpected for a similarity score; verify
# before relying on these neighbours.
model.get_nearest_neighbors("nigerian")

[(2.8952574729919434, 'itsmalware'),
 (2.257476329803467, 'usedincurrentattacks'),
 (1.8773866891860962, 'bessemer'),
 (1.7206835746765137, 'necn'),
 (1.6813035011291504, 'kubkon'),
 (1.6781492233276367, 'attomushq'),
 (1.6781492233276367, 'tobiaalberti'),
 (1.5574820041656494, 'bazalloader'),
 (1.5520544052124023, 'dsespitia'),
 (1.334557294845581, 'jasonfossen')]

In [25]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save_model("fast.bin")