In [5]:
!pip3 install newsapi-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7


In [None]:
# !python3 -m spacy download en_core_web_lg

In [6]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from newsapi import NewsApiClient
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import spacy
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# api_key = os.environ['NEWSAPI_KEY']  # requires `import os`; keep the real key out of the notebook
# newsapi = NewsApiClient(api_key=api_key)

In [8]:
# def crawl_news(query):
#     all_results = []
#     for pag in tqdm(range(1, 6)):
#         pag_articles = newsapi.get_everything(q=query, sort_by='relevancy', page=pag)['articles']
#         if len(pag_articles) == 0:
#             break
#         all_results.extend(pag_articles)
#     return all_results

In [92]:
# tesla_news = crawl_news('tesla')

In [91]:
# print(tesla_news)

### Dataset Reading

In [18]:
# Load only the CSV columns at positions 1 and 2.
df = pd.read_csv('BBC news dataset.csv', usecols=[1, 2])

In [19]:
# Keep the first row for every distinct description.
df = df.drop_duplicates(subset='description')

In [20]:
# Pull the description column out as a plain array for the loops below.
descriptions = df.loc[:, 'description'].values

### Preprocessing

In [21]:
def pre_process_text(text):
    """Turn raw text into a list of cleaned word tokens.

    The text is tokenised with NLTK's ``word_tokenize``; every token is
    lower-cased and stripped of ``string.punctuation`` characters. Tokens that
    are not purely alphabetic after stripping, and tokens found in NLTK's
    English stop-word list, are dropped.

    Returns the surviving tokens as a list, in their original order.
    """
    raw_tokens = word_tokenize(text)

    # Translation table that deletes every character in string.punctuation.
    punctuation_eraser = str.maketrans('', '', string.punctuation)
    normalised = (token.lower().translate(punctuation_eraser) for token in raw_tokens)
    alphabetic = [token for token in normalised if token.isalpha()]

    english_stop_words = set(stopwords.words('english'))
    return [token for token in alphabetic if token not in english_stop_words]
  

def ner_tagging(text):
    """Return the cleaned word tokens of ``text``.

    Despite its name, this function performs no named-entity tagging. It
    applies the same steps as ``pre_process_text``: tokenise with
    ``word_tokenize``, lower-case, strip ``string.punctuation`` characters,
    keep purely alphabetic tokens and drop NLTK English stop words.

    The earlier version also ran ``nlp(text)`` and discarded the result. That
    cost a full spaCy parse per call and raised ``NameError`` whenever the
    global ``nlp`` had not been loaded yet, so the call has been removed.

    Returns the surviving tokens as a list, in their original order.
    """
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.lower().translate(table) for w in word_tokenize(text)]

    stop_words = set(stopwords.words('english'))
    return [w for w in stripped if w.isalpha() and w not in stop_words]


### Event Trigger

In [26]:
nltk.download('punkt')
nltk.download('stopwords')

# This is the first cell (in notebook order) that calls `nlp`, so the spaCy
# pipeline is loaded here; otherwise a fresh "Restart & Run All" fails with
# NameError before reaching the later cells that load it.
nlp = spacy.load('en_core_web_sm')

processed_descriptions = []
triggered_events = []
for description in tqdm(descriptions):
    # Cleaned, space-joined tokens are what gets embedded and clustered later.
    processed_descriptions.append(' '.join(pre_process_text(description)))

    # Parse the raw (unprocessed) description with spaCy.
    doc = nlp(description)

    # A sentence counts as an event when it mentions an ORG or PERSON entity
    # and contains both a ROOT token and a direct object.
    for sent in doc.sents:
        if not any(ent.label_ in ('ORG', 'PERSON') for ent in sent.ents):
            continue

        # ROOT/dobj depend only on the sentence, so they are looked up once per
        # sentence rather than once per entity. As before, the last matching
        # token wins.
        verb = None
        obj = None
        for token in sent:
            if token.dep_ == 'ROOT':
                verb = token.text
            if token.dep_ == 'dobj':
                obj = token.text

        # Each sentence is visited exactly once, so it can be appended directly
        # without scanning the whole result list for duplicates.
        if verb and obj:
            triggered_events.append(sent)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 2128/2128 [02:48<00:00, 12.62it/s]


In [34]:
# Sanity check: number of extracted event sentences vs. number of cleaned descriptions,
# and the element type of the cleaned descriptions.
print(f"{len(triggered_events)} event sentences extracted from {len(processed_descriptions)} descriptions")
print(type(processed_descriptions[0]))

2913  asd  2128
<class 'str'>


### Embedding

In [51]:
nlp = spacy.load('en_core_web_sm')

# Parse every cleaned description, keeping each Doc and its `.vector`
# keyed by the description's position.
docs = [nlp(cleaned_text) for cleaned_text in tqdm(processed_descriptions)]
sent_vecs = {position: parsed.vector for position, parsed in enumerate(docs)}

100%|██████████| 2128/2128 [01:40<00:00, 21.07it/s]


In [52]:
# Vectors in insertion order, i.e. description order.
vectors = [*sent_vecs.values()]

In [53]:
# Stack the per-description vectors into a single array for scikit-learn.
vectors = np.asarray(vectors)

### Clustering

In [54]:
# For each candidate eps, fit DBSCAN and record how many distinct labels it produces.
labels_results = {}
for eps_candidate in tqdm(np.arange(0.001, 1, 0.002)):
    fitted = DBSCAN(eps=eps_candidate, min_samples=5, metric='cosine').fit(vectors)
    labels_results[eps_candidate] = pd.Series(fitted.labels_).nunique()

100%|██████████| 500/500 [01:02<00:00,  8.05it/s]


In [55]:
# Final clustering model with the chosen eps.
dbscan = DBSCAN(eps=0.015, min_samples=5, metric='cosine')
dbscan.fit(vectors)

In [56]:
# Pair every cleaned description with the cluster label DBSCAN assigned to it.
results = pd.DataFrame(dict(desc=processed_descriptions, label=dbscan.labels_))

In [57]:
# Cluster sizes.
results.loc[:, 'label'].value_counts()

-1    1712
 0     399
 1      12
 2       5
Name: label, dtype: int64

In [88]:
# Print the first six descriptions assigned to cluster 0, separated by blank lines.
cluster_zero_texts = results.loc[results['label'] == 0, 'desc']
for cluster_text in cluster_zero_texts.iloc[:6]:
    print(cluster_text, end='\n\n')

chelsea sack mutu chelsea sacked adrian mutu failed drugs test yearold tested positive banned substance later denied cocaine october chelsea decided write possible transfer fee mutu signing parma last season may face twoyear suspension statement chelsea explaining decision readwe want make clear chelsea zero tolerance policy towards drugs mutu scored six goals first five games arriving stamford bridge form went decline frozen coach jose mourinho chelseas statement added applies performanceenhancing drugs socalled recreational drugs place club sport coming decision case chelsea believed clubs social responsibility fans players employees stakeholders football regarding drugs important major financial considerations company player takes drugs breaches contract club well football association rules club totally supports fa strong action drugs cases fifas disciplinary code stipulates first doping offence followed sixmonth ban sports world governing body reiterated stance mutus failed drugs t

In [86]:
# Preview the first five rows.
results.head(n=5)

Unnamed: 0,desc,label
0,chelsea sack mutu chelsea sacked adrian mutu f...,0
1,record fails lift lacklustre meet yelena isinb...,-1
2,edu describes tunnel fracas arsenals edu lifte...,-1
3,ogara revels ireland victory ireland flyhalf r...,-1
4,unclear future striker baros liverpool forward...,-1


### Prediction

In [12]:
import spacy

# Load spaCy's small English pipeline; the loop below uses it to split each
# article into sentences and to read entity labels and dependency tags.
nlp = spacy.load('en_core_web_sm')

# Raw article texts to run event extraction on. Only the first entry is active:
# the commented-out entries are alternative sample texts, and the last entry
# is an empty string.
news_articles = [
    'Moldova’s unrecognised breakaway region of Transnistria says it wants Moscow to bulk up its small contingent of peacekeepers because of what it called growing security risks, Russia’s state-owned RIA news agency reported. Though Moldova does not allow Russia to deploy new troops in Transnistria, Russia has had hundreds of peacekeepers in the region since a bloody war between pro-Russian separatists and Moldovan government forces after the 1991 Soviet breakup. “As long as Russia’s peacekeeping mission continues, Moldova is constrained in any military plans and preparations against Transnistria,” Leonid Manakov, the region’s envoy to Moscow, was cited as saying by RIA. “Transnistria has repeatedly applied for an increase in the number of Russian peacekeepers … There is such an option, and it is justified in terms of the worsening security risks,” he was quoted as saying. Moldova’s ties with Russia are currently badly strained and have deteriorated rapidly over the course of Moscow’s full-scale invasion of neighbouring Ukraine, which Chisinau has repeatedly condemned.',
    # 'The majority judges held that parliament which is competent to destroy a State cannot be held, on the theory of absolute sovereignty of the States, to be incompetent to acquire by legislation the property owned by the States. Even if the constitution were held to be a Federation and the States regarded qua the Union as sovereign, the power of the Union to legislate in respect of the property situated in the States would remain unrestricted, they held. It shows that the majority judges too believed that it is not incorrect to suggest that States and the Union can be regarded as sovereign together. ',
    # 'What appears to militate against the theory regarding the sovereignty of the State is the wide power with which the Parliament is invested to alter the boundaries of States, and even to extinguish the existence of  a State. There is no Constitutional guarantee against alteration of the boundaries of the States. By Article 2 of the Constitution, the Parliament may admit into the Union or establish new States on such terms and conditions as it thinks fit, and by Article 3 the Parliament is by law authorised to form a new State by redistribution of the territory of a State or by uniting two or more States or parts of States or by uniting any territory to a part of any State, increase the area of any State, diminish the area of any State, alter the boundaries of any State, and alter the name of any State',
    # 'The majority judges held that there could be no doubt that if the Union did so, it would not be using but abusing its power of acquisition, but the fact that a power is capable of being abused has never been in law a reason for denying its existence, for its existence has to be determined on very different considerations.',
    # '',
    '',
]

events_ = []

# Walk through every article sentence by sentence. For each ORG/PERSON entity,
# print its text, then keep the sentence (once) if it has both a ROOT token and
# a direct object.
for article in news_articles:
    parsed_article = nlp(article)

    for sentence in parsed_article.sents:
        for entity in sentence.ents:
            if entity.label_ not in ('ORG', 'PERSON'):
                continue

            # The entity text is treated as the subject of the sentence.
            subj = entity.text
            print(subj)

            # The last ROOT / dobj token seen in the sentence wins.
            verb, obj = None, None
            for tok in sentence:
                if tok.dep_ == 'ROOT':
                    verb = tok.text
                if tok.dep_ == 'dobj':
                    obj = tok.text

            if verb and obj and sentence not in events_:
                events_.append(sentence)


RIA news agency
Moldova
Moldovan
Transnistria
Leonid Manakov
RIA
Transnistria
Moldova
Chisinau


In [89]:
print(len(events_))
# Plain-text form of every extracted event sentence.
strng = [event_sentence.text for event_sentence in events_]


3


In [74]:
# Embed each extracted event sentence for cluster prediction.
# The DBSCAN model was fitted on vectors of pre-processed text (lower-cased,
# punctuation and stop words removed, see the Embedding section). The same
# cleaning is applied here so that new vectors are comparable with the fitted
# ones; embedding the raw sentences would compare differently prepared inputs.
pred_docs = [nlp(' '.join(pre_process_text(sentence))) for sentence in tqdm(strng)]
pred_vecs = {position: parsed.vector for position, parsed in enumerate(pred_docs)}

# One row per event sentence, in the same order as `strng`.
vectors_c = np.array(list(pred_vecs.values()))


100%|██████████| 3/3 [00:00<00:00, 69.15it/s]

0
1
2





In [76]:
# (number of event sentences, embedding width)
print(np.shape(vectors_c))

(3, 96)


In [77]:
def dbscan_predict(model, X, metric=None):
    """Assign new samples to the clusters of an already-fitted DBSCAN model.

    Each sample receives the label of its nearest core sample if that core
    sample lies closer than ``model.eps``; otherwise it is labelled ``-1``.

    Parameters
    ----------
    model : fitted DBSCAN-like object
        Must expose ``components_``, ``core_sample_indices_``, ``labels_`` and
        ``eps``. Its ``metric`` attribute, when present, selects the distance.
    X : array-like of shape (n_samples, n_features)
        Samples to label. A 1-D non-empty input is treated as a single sample;
        an empty input yields an empty result.
    metric : str or callable, optional
        Distance to use. Defaults to ``model.metric`` (``'euclidean'`` if the
        model has none), so the distance matches the one ``eps`` was tuned for.
        Supported strings: ``'cosine'``, ``'euclidean'``/``'l2'``,
        ``'manhattan'``/``'cityblock'``/``'l1'``. A callable is invoked as
        ``metric(sample, core_sample)``.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Predicted cluster label per sample, ``-1`` where no core sample is
        within ``eps``.

    Raises
    ------
    ValueError
        If the metric is a string that is not supported here.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        # An empty array means "no samples"; a flat vector is one sample.
        X = X.reshape(1, -1) if X.size else X.reshape(0, 0)

    nr_samples = X.shape[0]
    y_new = np.full(nr_samples, -1, dtype=int)

    core_points = np.asarray(model.components_, dtype=float)
    if nr_samples == 0 or core_points.shape[0] == 0:
        # No samples, or a model in which every training point is noise:
        # nothing can be assigned (np.argmin would raise on an empty array).
        return y_new

    if metric is None:
        # The model was fitted with this metric, so eps is expressed in it.
        # Always measuring Euclidean distance compared the wrong quantity
        # against eps for models fitted with, e.g., metric='cosine'.
        metric = getattr(model, 'metric', 'euclidean')

    if callable(metric):
        def distances_to_core(sample):
            return np.array([metric(sample, core) for core in core_points], dtype=float)
    elif metric == 'cosine':
        core_norms = np.linalg.norm(core_points, axis=1)
        core_norms[core_norms == 0] = 1.0  # zero vectors: similarity 0, distance 1
        core_unit = core_points / core_norms[:, None]

        def distances_to_core(sample):
            norm = np.linalg.norm(sample)
            unit = sample / norm if norm else sample
            # Clip rounding noise so distances stay inside [0, 2].
            return np.clip(1.0 - core_unit @ unit, 0.0, 2.0)
    elif metric in ('euclidean', 'l2'):
        def distances_to_core(sample):
            return np.linalg.norm(core_points - sample, axis=1)
    elif metric in ('manhattan', 'cityblock', 'l1'):
        def distances_to_core(sample):
            return np.abs(core_points - sample).sum(axis=1)
    else:
        raise ValueError(
            f"dbscan_predict does not support metric {metric!r}; "
            "pass a supported name or a callable via the `metric` argument"
        )

    # Label of each core sample, aligned row by row with model.components_.
    core_labels = np.asarray(model.labels_)[np.asarray(model.core_sample_indices_)]

    for i in range(nr_samples):
        dist = distances_to_core(X[i])
        shortest_dist_idx = np.argmin(dist)
        if dist[shortest_dist_idx] < model.eps:
            y_new[i] = core_labels[shortest_dist_idx]

    return y_new

In [78]:
# Label every event-sentence vector with the fitted clustering model.
output = dbscan_predict(model=dbscan, X=vectors_c)

3


In [84]:
# Pair each event sentence with its predicted cluster label.
prediction_results = pd.DataFrame(dict(desc=strng, label=output))

In [85]:
# Preview the first five predictions.
prediction_results.head(n=5)

Unnamed: 0,desc,label
0,Moldova’s unrecognised breakaway region of Tra...,-1
1,Though Moldova does not allow Russia to deploy...,-1
2,Moldova’s ties with Russia are currently badly...,-1
