In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [14]:
with open('News.txt') as f:
    lines = f.read()

In [15]:
doc1 = nlp(lines)

for ent in doc1.ents:
    if ent.label_ == "GPE" :
        print(ent.text)

Budapest
Budapest
Fukuoka
Official Domain


In [16]:
doc2 = nlp("Economics")

print(doc1, "<->", doc2, doc1.similarity(doc2))

McKeown lowers own mark for second world record in two days - CNA
Kaylee McKeown set her second world record in two days on Saturday as the Australian lowered her own mark to win the 100 metres backstroke at the World Aquatics Swimming World Cup in Budapest.McKeown shaved 0.12 seconds off her previous best time to finish in 57.33, less than 24 hours after setting a world record in the 50 metres backstroke."Crazy scenes here in Budapest and surprised with my results to say the least," McKeown wrote on Instagram."It's been an absolute beauty of a year and what better way to top it off (than) with two world records."Excited and nervous for this coming season."In addition to setting two world records this weekend, the 22-year-old won the 50, 100 and 200 titles at the world championships in Fukuoka in July.McKeown, the reigning Olympic champion in the 100 and 200 backstroke, holds the world record in all three distances in the discipline, having set the 200 mark in March at the New South Wa

  print(doc1, "<->", doc2, doc1.similarity(doc2))


# Determine whether a risk and classify it

In [25]:
# Built a multilabel classification system using SpaCy (build and train the modek)

import pandas as pd

import sys
import os
import spacy
from spacy import displacy

from tqdm.auto import tqdm
from spacy.tokens import DocBin

In [3]:
def make_docs(train_data):

    train_data = train_data.drop(columns=["Id"])

    data = tuple(zip(train_data['Title'].tolist(), train_data['Label'].tolist())) 

    nlp = spacy.load("en_core_web_sm")

    docs = []

    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        if (label=='Nature'):
            doc.cats['Nature'] = 1
            doc.cats['Politics'] = 0
            doc.cats['Entertainment']  = 0
            doc.cats['Economics']  = 0
            doc.cats['Culture']  = 0
            doc.cats['Science']  = 0
        elif (label=='Politics'):
            doc.cats['Nature'] = 0
            doc.cats['Politics'] = 1
            doc.cats['Entertainment']  = 0
            doc.cats['Economics']  = 0
            doc.cats['Culture']  = 0
            doc.cats['Science']  = 0
        elif (label=='Entertainment'):
            doc.cats['Nature'] = 0
            doc.cats['Politics'] = 0
            doc.cats['Entertainment']  = 1
            doc.cats['Economics']  = 0
            doc.cats['Culture']  = 0
            doc.cats['Science']  = 0
        elif (label=='Economics'):
            doc.cats['Nature'] = 0
            doc.cats['Politics'] = 0
            doc.cats['Entertainment']  = 0
            doc.cats['Economics']  = 1
            doc.cats['Culture']  = 0
            doc.cats['Science']  = 0
        elif (label=='Culture'):
            doc.cats['Nature'] = 0
            doc.cats['Politics'] = 0
            doc.cats['Entertainment']  = 0
            doc.cats['Economics']  = 0
            doc.cats['Culture']  = 1
            doc.cats['Science']  = 0
        elif (label=='Science'):
            doc.cats['Nature'] = 0
            doc.cats['Politics'] = 0
            doc.cats['Entertainment']  = 0
            doc.cats['Economics']  = 0
            doc.cats['Culture']  = 0
            doc.cats['Science']  = 1        

        docs.append(doc)

    return docs,train_data

In [68]:
df = pd.read_csv("C:\Work\Programming\Hackathons\Diplomacy software\Dimensions.csv")
train = df.sample(frac=0.7, replace=False, random_state=1)

train_docs, train_data = make_docs(train)

doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./textcat_data/textcat_train.spacy")

test = df.sample(frac=0.3, replace=False, random_state=1)

test_docs, train_data = make_docs(test)

doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("./textcat_data/textcat_valid.spacy")

100%|██████████| 27/27 [00:00<00:00, 372.32it/s]
100%|██████████| 12/12 [00:00<00:00, 512.69it/s]


In [69]:
train_data

Unnamed: 0,Title,Label
2,NBA Playoffs: Lakers and Clippers Face Off in ...,Entertainment
30,Exotic Destinations for Your Next Adventure Va...,Entertainment
3,President Signs Historic Climate Change Legisl...,Politics
21,Scientific Expedition Discovers New Species in...,Science
26,High School Robotics Team Wins International C...,Politics
28,Mental Health Awareness Month: Initiatives to ...,Science
22,Innovative Schools Implement Technology-Driven...,Science
36,Wildfires Threaten Communities in the Western ...,Nature
19,Researchers Develop Promising Treatment for Al...,Science
25,Education Policy Reforms Aim to Improve Studen...,Politics


In [70]:
!python -m spacy init fill-config ./textcat_base_config.cfg ./textcat_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
textcat_config.cfg
You can now add your data and train your pipeline:
python -m spacy train textcat_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [71]:
!python -m spacy train textcat_config.cfg --verbose --output ./textcat_output --paths.train textcat_data/textcat_train.spacy --paths.dev textcat_data/textcat_valid.spacy

[2023-10-27 22:02:30,992] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2023-10-27 22:02:31,180] [INFO] Set up nlp object from config
[2023-10-27 22:02:31,187] [DEBUG] Loading corpus from path: textcat_data\textcat_valid.spacy
[2023-10-27 22:02:31,192] [DEBUG] Loading corpus from path: textcat_data\textcat_train.spacy
[2023-10-27 22:02:31,192] [INFO] Pipeline: ['tok2vec', 'textcat']
[2023-10-27 22:02:31,192] [INFO] Created vocabulary
[2023-10-27 22:02:31,587] [INFO] Added vectors: en_core_web_sm
[2023-10-27 22:02:31,587] [INFO] Finished initializing nlp object
[2023-10-27 22:02:31,852] [INFO] Initialized pipeline components: ['tok2vec', 'textcat']
[2023-10-27 22:02:31,864] [DEBUG] Loading corpus from path: textcat_data\textcat_valid.spacy
[2023-10-27 22:02:31,864] [DEBUG] Loading corpus from path: textcat_data\textcat_train.spacy
[2023-10-27 22:02:31,872] [DEBUG] Removed existing output directory: textcat_output\model-best
[2023-10-27 22:02:31,882] [DEBUG] Removed ex

[38;5;4mℹ Saving to output directory: textcat_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.14       37.26    0.37
 77     200         84.36          1.09       66.67    0.67
177     400        482.64          0.23       66.67    0.67
277     600         19.61          0.01       66.67    0.67
418     800          0.00          0.00       66.67    0.67
618    1000          0.00          0.00       66.67    0.67
818    1200          0.00          0.00       66.67    0.67
1018    1400          0.00          0.00       66.67    0.67
1218    1600          0.00          0.00       66.67    0.67
1418    1800          0.00          0.00       66.67    0.67
[38;5;2m✔ Saved pipeline to output directory[0m
textcat_o

In [28]:
prompt = input("Enter a headline to analyse: ")
nlp_textcat = spacy.load(os.getcwd() + "\\textcat_output\\model-last")
docPred = nlp_textcat(prompt)
res = docPred.cats
print(res)
print(prompt)
print("The class of this headline is", max(res, key=res.get))

{'Nature': 0.6959365010261536, 'Politics': 2.370913716731593e-05, 'Entertainment': 2.5340261000650344e-09, 'Economics': 0.1244477927684784, 'Culture': 0.00027904147282242775, 'Science': 0.17931297421455383}
Singapore votes in favour of UN resolution to protect civilians, uphold humanitarian obligations in Gaza Strip
The class of this headline is Nature
