In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("data/All-in-one2.csv")
df.head()

Unnamed: 0,Title,Summary,Classification,Human_Safety,Infrastructure,Communication,Finance,Controllable
0,Teen charged with sexual assault of young girl...,A 19-year-old man has been charged over the al...,Physical,Not safe,No impact,No impact,No impact,Yes
1,Covert war crimes inquiry compromised by forme...,A corruption investigation was launched after ...,No threat,Safe,No impact,No impact,No impact,Yes
2,"Cyber-attack targeted 186,000 Service NSW cust...",A cyber-security breach at Service NSW earlier...,Cyber,Possible safe,No impact,Impact,Impact,No
3,Australians involved in online porn data breach,A data leak from an international pornography ...,Cyber,Safe,No impact,Impact,No impact,Yes
4,AFL fans' private chats exposed in 'mass data ...,A large data leak from an AFL fan website has ...,Cyber,Safe,No impact,Impact,Impact,No


In [3]:
vectorizer = TfidfVectorizer(max_features = 5000)

# Text classification

We will need to classify each of the below columns individually to then calculate the threat level(%).
1. Classification (Physical, Cyber, No threat)
2. Human Safety (Safe, Possible Safe, Not Safe)
3. Infrastructure (No impact, Imapct)
4. Communication (No impact, Imapct)
5. Finance (No impact, Imapct)
6. Controllable (Yes, No)

## 1. Threat Classifier

In [4]:
# X is set the same for all six models as we will be using the summary to 
# predict/classify the values required to calculate the threat level
X = vectorizer.fit_transform(df.Summary).toarray()
enc_cl = LabelEncoder()
y = enc_cl.fit_transform(df.Classification)
X_train, X_test, y_train_clf, y_test_clf = train_test_split(X, y, random_state=42)

In [5]:
target_names_cl = enc_cl.inverse_transform([0, 1, 2])
target_names_cl

array(['Cyber', 'No threat', 'Physical'], dtype=object)

In [6]:
clf_mnb = MultinomialNB()
clf_mnb.fit(X_train, y_train_clf)

MultinomialNB()

In [7]:
y_pred_mnb = clf_mnb.predict(X_test)

In [8]:
enc_cl.inverse_transform(y_pred_mnb)

array(['Physical', 'Physical', 'Cyber', 'Physical', 'Cyber', 'Cyber',
       'Cyber', 'Cyber', 'Cyber', 'Physical', 'Cyber', 'Cyber', 'Cyber',
       'Physical', 'Physical', 'Cyber', 'Cyber', 'Physical', 'Physical',
       'Physical', 'Cyber'], dtype=object)

In [9]:
enc_cl.inverse_transform(y_test_clf)

array(['Physical', 'Physical', 'Cyber', 'Physical', 'Cyber', 'Cyber',
       'Cyber', 'Cyber', 'Cyber', 'Physical', 'Cyber', 'Cyber', 'Cyber',
       'Physical', 'No threat', 'Cyber', 'Physical', 'Physical',
       'Physical', 'No threat', 'Cyber'], dtype=object)

In [10]:
# Evaluating model performance
print(classification_report(y_test_clf, y_pred_mnb, target_names = target_names_cl))

              precision    recall  f1-score   support

       Cyber       0.92      1.00      0.96        11
   No threat       0.00      0.00      0.00         2
    Physical       0.78      0.88      0.82         8

    accuracy                           0.86        21
   macro avg       0.56      0.62      0.59        21
weighted avg       0.78      0.86      0.81        21



  _warn_prf(average, modifier, msg_start, len(result))


## 2. Human Safety Classifier

In [11]:
enc_hs = LabelEncoder()
y = enc_hs.fit_transform(df['Human_Safety'])
X_train, X_test, y_train_hs, y_test_hs = train_test_split(X, y, random_state=42)
target_names_hs = enc_hs.inverse_transform([0, 1, 2])
hs_mnb = MultinomialNB()
hs_mnb.fit(X_train, y_train_hs)
y_pred_mnb = hs_mnb.predict(X_test)
# Evaluating model performance
print(classification_report(y_test_hs, y_pred_mnb, target_names = target_names_hs))

               precision    recall  f1-score   support

     Not safe       1.00      0.14      0.25         7
Possible safe       1.00      0.17      0.29         6
         Safe       0.42      1.00      0.59         8

     accuracy                           0.48        21
    macro avg       0.81      0.44      0.38        21
 weighted avg       0.78      0.48      0.39        21



## 3. Infrastructure

In [12]:
enc_i = LabelEncoder()
y = enc_i.fit_transform(df['Infrastructure'])
X_train, X_test, y_train_i, y_test_i = train_test_split(X, y, random_state=42)
target_names_i = enc_i.inverse_transform([0, 1])
i_mnb = MultinomialNB()
i_mnb.fit(X_train, y_train_i)
y_pred_mnb = i_mnb.predict(X_test)
# Evaluating model performance
print(classification_report(y_test_i, y_pred_mnb, target_names = target_names_i))

              precision    recall  f1-score   support

      Impact       0.00      0.00      0.00         2
   No impact       0.90      1.00      0.95        19

    accuracy                           0.90        21
   macro avg       0.45      0.50      0.48        21
weighted avg       0.82      0.90      0.86        21



  _warn_prf(average, modifier, msg_start, len(result))


## 4. Communication

In [13]:
enc_c = LabelEncoder()
y = enc_c.fit_transform(df['Communication'])
X_train, X_test, y_train_c, y_test_c = train_test_split(X, y, random_state=42)
target_names_c = enc_c.inverse_transform([0, 1])
c_mnb = MultinomialNB()
c_mnb.fit(X_train, y_train_c)
y_pred_mnb = c_mnb.predict(X_test)
# Evaluating model performance
print(classification_report(y_test_c, y_pred_mnb, target_names = target_names_c))

              precision    recall  f1-score   support

      Impact       1.00      0.70      0.82        10
   No impact       0.79      1.00      0.88        11

    accuracy                           0.86        21
   macro avg       0.89      0.85      0.85        21
weighted avg       0.89      0.86      0.85        21



## 5. Finance

In [14]:
enc_f = LabelEncoder()
y = enc_f.fit_transform(df['Finance'])
X_train, X_test, y_train_f, y_test_f = train_test_split(X, y, random_state=42)
target_names_f = enc_f.inverse_transform([0, 1])
f_mnb = MultinomialNB()
f_mnb.fit(X_train, y_train_f)
y_pred_mnb = f_mnb.predict(X_test)
# Evaluating model performance
print(classification_report(y_test_f, y_pred_mnb, target_names = target_names_f))

              precision    recall  f1-score   support

      Impact       0.00      0.00      0.00         7
   No impact       0.67      1.00      0.80        14

    accuracy                           0.67        21
   macro avg       0.33      0.50      0.40        21
weighted avg       0.44      0.67      0.53        21



  _warn_prf(average, modifier, msg_start, len(result))


## 6. Controllable

In [15]:
enc_con = LabelEncoder()
y = enc_con.fit_transform(df['Controllable'])
X_train, X_test, y_train_con, y_test_con = train_test_split(X, y, random_state=42)
target_names_con = enc_con.inverse_transform([0, 1])
con_mnb = MultinomialNB()
con_mnb.fit(X_train, y_train_con)
y_pred_mnb = con_mnb.predict(X_test)
# Evaluating model performance
print(classification_report(y_test_con, y_pred_mnb, target_names = target_names_con))

              precision    recall  f1-score   support

          No       0.00      0.00      0.00        11
         Yes       0.48      1.00      0.65        10

    accuracy                           0.48        21
   macro avg       0.24      0.50      0.32        21
weighted avg       0.23      0.48      0.31        21



  _warn_prf(average, modifier, msg_start, len(result))


## Example
Now we are going to try our classification models on a sample text data.

In [16]:
# This is an original tweet by NSW police on September 12th, 2020
text1 = ["Have you seen Darren Macquarie (55yo)? Last seen leaving a hospital in Westmead yesterday (Friday 11 September 2020). He is considered dangerous and the public is urged to not approach the man but instead contact Crime Stoppers on 1800 333 000. https://t.co/HnrwcTXVWC https://t.co/EKdnDfSoJD"]

In [17]:
text2 = ["Hackers got into Macquarie university systems, few student details breached"]

In [18]:
text_vectorizer1 = vectorizer.transform(text1)
text_vectorizer2 = vectorizer.transform(text2)

In [19]:
cl1 = enc_cl.inverse_transform(clf_mnb.predict(text_vectorizer1))
cl2 = enc_hs.inverse_transform(hs_mnb.predict(text_vectorizer1))
cl3 = enc_i.inverse_transform(i_mnb.predict(text_vectorizer1))
cl4 = enc_c.inverse_transform(c_mnb.predict(text_vectorizer1))
cl5 = enc_f.inverse_transform(f_mnb.predict(text_vectorizer1))
cl6 = enc_con.inverse_transform(con_mnb.predict(text_vectorizer1))

In [20]:
print("The first text has been classified as a", cl1[0], "threat.")
print("Human Safety: ", cl2[0])
print("Infrastructure: ", cl3[0])
print("Communication: ", cl4[0])
print("Finance: ", cl5[0])
print("Controllable: ", cl6[0])

The first text has been classified as a Physical threat.
Human Safety:  Safe
Infrastructure:  No impact
Communication:  No impact
Finance:  No impact
Controllable:  Yes


In [21]:
cl1 = enc_cl.inverse_transform(clf_mnb.predict(text_vectorizer2))
cl2 = enc_hs.inverse_transform(hs_mnb.predict(text_vectorizer2))
cl3 = enc_i.inverse_transform(i_mnb.predict(text_vectorizer2))
cl4 = enc_c.inverse_transform(c_mnb.predict(text_vectorizer2))
cl5 = enc_f.inverse_transform(f_mnb.predict(text_vectorizer2))
cl6 = enc_con.inverse_transform(con_mnb.predict(text_vectorizer2))

In [22]:
print("The second text has been classified as a", cl1[0], "threat.")
print("Human Safety: ", cl2[0])
print("Infrastructure: ", cl3[0])
print("Communication: ", cl4[0])
print("Finance: ", cl5[0])
print("Controllable: ", cl6[0])

The second text has been classified as a Cyber threat.
Human Safety:  Safe
Infrastructure:  No impact
Communication:  No impact
Finance:  No impact
Controllable:  Yes


In [23]:
def TP_calc(doc, src, verf):
    s = 2
    v = 2
    if(src == "Twitter"):
        src = 1
        if(verf == "FALSE"):
            v = 1
    t_vec = vectorizer.transform(doc)
    cl = clf_mnb.predict(t_vec)
    hs = hs_mnb.predict(t_vec)
    i = i_mnb.predict(t_vec)
    c = c_mnb.predict(t_vec)
    f = f_mnb.predict(t_vec)
    con = con_mnb.predict(t_vec)
    
    if(cl == 0):
        cl = 1
    if(cl == 1):
        cl = 0
    
    if(hs == 0):
        hs = 2
    if(hs == 2):
        hs = 0
        
    if(i == 0):
        i = 1
    if(i == 1):
        i = 0
        
    if(c == 0):
        c = 1
    if(c == 1):
        c = 0
        
    if(f == 0):
        f = 1
    if(f == 1):
        f = 0
    
    if(con == 0):
        con = 1
    if(con == 1):
        con = 0
    print(hs, con, i, c, f, cl, s, v)
    return (0.2*hs + 0.15*con + 0.06*i + 0.06*c + 0.06*f + 0.07*cl + 0.04*s + 0.025*v)


In [24]:
print(TP_calc(text1, "Twitter", "TRUE")*100, "%")

0 0 0 0 0 [2] 2 2
[27.] %


In [25]:
print(TP_calc(text2, "Twitter", "TRUE")*100, "%")

0 0 0 0 0 0 2 2
13.0 %
