In [1]:
from modules.elastic import ArticleSearchQuery
from modules.objects import FullArticle
from modules.config import BaseConfig

from dotenv import load_dotenv

load_dotenv()

config_options = BaseConfig()

In [2]:
from span_marker import SpanMarkerModel

# Download from the 🤗 Hub
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-roberta-large-ontonotes5")

In [3]:
articles = [config_options.es_article_client.query_documents(ArticleSearchQuery(limit=10), True)[0][0]]

In [None]:
entities = []

for article in articles:
    current_entities = []
    for entity_collection in [model.predict(sentence) for sentence in article.content.split(". ")]:
        current_entities.extend(entity_collection)
    entities.append(current_entities)

In [4]:
from datasets import load_dataset, Dataset
from nltk.tokenize import sent_tokenize
import nltk

nltk.download('punkt')

documents = [sent_tokenize(article.content) for article in articles]

data_dict = {
    "tokens": [],
    "document_id": [],
    "sentence_id": [],
}
for document_id, document in enumerate(documents):
    for sentence_id, sentence in enumerate(document):
        data_dict["document_id"].append(document_id)
        data_dict["sentence_id"].append(sentence_id)
        data_dict["tokens"].append(sentence)
dataset = Dataset.from_dict(data_dict)
dataset.to_pandas()

for document in documents:
    print(min([len(sen) for sen in document]))

2


[nltk_data] Downloading package punkt to /home/bertmad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-xlm-roberta-large-conll03-doc-context")
entities = model.predict(dataset)

In [6]:
articles

[FullArticle(id='d8adcf515ca2f2f1106e9f276b0158b8', title='Putting the 10 Most Critical AppSec Risks that threaten Your Business', description='Every business is under the threat of a data breach. Identity Theft Resource Centre (ITRC) has published a\xa0report\xa0which shows 17% increase in data breaches as of September 2021.', url=Url('https://cybersecuritynews.com/?p=8529'), image_url=Url('https://blogger.googleusercontent.com/img/a/AVvXsEiJUKVvQ5nVuCqdYxOIexGOA2QhZdSxv4h3Q9j3ZjMpA8jSFLyyntnSxJcrzo7wj22LG_Fkl0BVga-UXDYvHjphGveo2UcmxQ2naVhGAnsOUMHBstTqs952kzvbEObv-TqbWm08bbbDN41jGFeTFzT8VO5UZFTMjiLIRswm7SAX6ld6CF7TfU7iUXjSpQ=s16000'), profile='cybersecuritynews', source='Cyber Security News', author='Guru', publish_date=datetime.datetime(2022, 1, 28, 17, 14, 17, tzinfo=TzInfo(UTC)), inserted_at=datetime.datetime(2022, 2, 7, 20, 51, 34, tzinfo=TzInfo(UTC)), read_times=0, similar=['d9571274b7cf9e10c971c422d45a5313', '1f24985c7240f04aba448b1c6232c0a6', '97b2d82e677728bcf119fc2f92557951',

In [71]:
index = 0
article = articles[index]
entity = entities[index]

for r in current_entities:
    if r['label'] in ["DATE", "CARDINAL"]:
        continue
        
    print(f"{r['span']}: {r['label']} | {str(r['score'])[0:5]}")

print(article.content)
print(article.id)

 Microsoft: ORG | 0.997
Windows: PRODUCT | 0.974
Office: PRODUCT | 0.969
Teams: PRODUCT | 0.967
Azure Data Explorer: PRODUCT | 0.990
Visual Studio Code: PRODUCT | 0.994
Kernel: PRODUCT | 0.823
Win32k: PRODUCT | 0.915
Chromium: PRODUCT | 0.587
Edge: PRODUCT | 0.993
CVE-2022-21989: LAW | 0.632
Windows Kernel: PRODUCT | 0.830
Microsoft: ORG | 0.998
Windows DNS Server: PRODUCT | 0.930
SharePoint Server: PRODUCT | 0.963
Windows Hyper-V: PRODUCT | 0.936
HEVC Video Extensions: PRODUCT | 0.709
Azure Data Explorer: PRODUCT | 0.950
Outlook for Mac: PRODUCT | 0.966
OneDrive for Android: PRODUCT | 0.835
.NET: PRODUCT | 0.772
Teams: PRODUCT | 0.888
Microsoft: ORG | 0.998
Print Spooler: PRODUCT | 0.775
Win32k: PRODUCT | 0.922
WinVerifyTrust: PRODUCT | 0.984
Windows: PRODUCT | 0.946
ZLoader: PRODUCT | 0.922
Check Point Research: ORG | 0.996
Microsoft: ORG | 0.998
Oracle Linux: ORG | 0.537
Red Hat: ORG | 0.993
SUSE: ORG | 0.991
Firefox ESR: PRODUCT | 0.609
Siemens: ORG | 0.998
a19d66c10e6d6239dfce7cd4

In [65]:
entities

[[{'span': 'AT&T Alien Labs',
   'label': 'ORG',
   'score': 0.9983267188072205,
   'char_start_index': 20,
   'char_end_index': 35},
  {'span': 'Shikitega',
   'label': 'PRODUCT',
   'score': 0.9089323282241821,
   'char_start_index': 93,
   'char_end_index': 102}],
 [],
 [{'span': 'Shikitega',
   'label': 'PRODUCT',
   'score': 0.7931892275810242,
   'char_start_index': 0,
   'char_end_index': 9}],
 [{'span': 'Metasploit',
   'label': 'PRODUCT',
   'score': 0.8551660776138306,
   'char_start_index': 61,
   'char_end_index': 71},
  {'span': 'Mettle',
   'label': 'PRODUCT',
   'score': 0.8974155783653259,
   'char_start_index': 75,
   'char_end_index': 81}],
 [{'span': 'Shikitega',
   'label': 'PRODUCT',
   'score': 0.859973132610321,
   'char_start_index': 0,
   'char_end_index': 9}],
 [],
 [{'span': 'Metasploit',
   'label': 'PRODUCT',
   'score': 0.910469114780426,
   'char_start_index': 132,
   'char_end_index': 142}],
 [{'span': 'one',
   'label': 'CARDINAL',
   'score': 0.9145220

In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("CyberPeace-Institute/SecureBERT-NER")
model = AutoModelForTokenClassification.from_pretrained("CyberPeace-Institute/SecureBERT-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/815k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/473k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [16]:
ner_results = nlp(articles[1].content)

for result in ner_results:
    print(result)

{'entity': 'B-SECTEAM', 'score': 0.9150467, 'index': 4, 'word': 'ĠAT', 'start': 20, 'end': 22}
{'entity': 'B-SECTEAM', 'score': 0.9164499, 'index': 5, 'word': '&', 'start': 22, 'end': 23}
{'entity': 'B-SECTEAM', 'score': 0.9194493, 'index': 6, 'word': 'T', 'start': 23, 'end': 24}
{'entity': 'I-SECTEAM', 'score': 0.8964916, 'index': 7, 'word': 'ĠAlien', 'start': 25, 'end': 30}
{'entity': 'I-SECTEAM', 'score': 0.888786, 'index': 8, 'word': 'ĠLabs', 'start': 31, 'end': 35}
{'entity': 'B-OS', 'score': 0.90799487, 'index': 15, 'word': 'ĠLinux', 'start': 71, 'end': 76}
{'entity': 'B-MAL', 'score': 0.46373838, 'index': 19, 'word': 'ĠSh', 'start': 93, 'end': 95}
{'entity': 'B-MAL', 'score': 0.52477324, 'index': 20, 'word': 'ik', 'start': 95, 'end': 97}
{'entity': 'B-MAL', 'score': 0.5661931, 'index': 21, 'word': 'ite', 'start': 97, 'end': 100}
{'entity': 'B-MAL', 'score': 0.570699, 'index': 22, 'word': 'ga', 'start': 100, 'end': 102}
{'entity': 'B-TOOL', 'score': 0.5195549, 'index': 28, 'word'

In [15]:
articles[0].content

'Every business is under the threat of a data breach. Identity Theft Resource Centre (ITRC) has published a report which shows 17% increase in data breaches as of September 2021. Every sector has its own set of threats. As per records, the manufacturing & utility sector had 48 million victims in 2021, which was the highest. Another study showed that nearly 100 million Android users’ data were leaked due to several misconfigurations.This shows that application security has become a significant problem for businesses. This article provides you with the Top 10 most critical application security risks you need to concentrate on to avoid data breaches. 1.    Broken Access ControlRecent research revealed that, in 2021, nearly 94% of the applications were vulnerable to Broken Access Control. Access Control is enforced in every organization to prevent unauthorized access to sensitive information. However, when the policies are misconfigured, it might lead to many security issues. The most comm