In [1]:
import spacy

In [2]:
nlp = spacy.blank('en')

In [3]:
doc = nlp("Federick took out a portion of his salary to get meatpie. He is now broke")

nlp.pipe_names


[]

In [4]:
#add the english pipeline

nlp = spacy.load('en_core_web_sm')
doc = nlp("Federick took out a portion of his salary to get meatpie. He is now broke")

nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1fda5efd310>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1fda5effe90>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1fda6052110>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1fda6343b50>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1fda61aa410>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1fda60515b0>)]

In [6]:
for token in doc:
    print(token, '|', token.pos_, '|', token.lemma_)

Federick | PROPN | Federick
took | VERB | take
out | ADP | out
a | DET | a
portion | NOUN | portion
of | ADP | of
his | PRON | his
salary | NOUN | salary
to | PART | to
get | VERB | get
meatpie | PROPN | meatpie
. | PUNCT | .
He | PRON | he
is | AUX | be
now | ADV | now
broke | VERB | break


In [7]:
#pos gives information on the part of speech the word belongs to (tagger)
#lemma gives information of the base word it originates from (lammer)

In [8]:
doc = nlp("Microsft Inc acquired an investment worth $500 billion")
#we can obtain the category of words using ent, and can also explain the category (named entity recognization, ner)

for entity in doc.ents:
    print(entity.text, "|", entity.label_, "|", spacy.explain(entity.label_))

Microsft Inc | ORG | Companies, agencies, institutions, etc.
$500 billion | MONEY | Monetary values, including unit


In [9]:
from spacy import displacy
displacy.render(doc, style='ent')

In [10]:
#you can add specific components to your blank pipeline. Let's say we only want tagger component
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank('en')
nlp.add_pipe('tagger', source=source_nlp)
nlp.pipe_names

['tagger']

In [23]:
##extract the matric numbers of mechatronics students from the level attendance pdf document
import pdfplumber
with pdfplumber.open(r"C:\Users\HP\Desktop\500 Level Attendance .pdf") as pdf_doc:
    text = "\n".join([page.extract_text() for page in pdf_doc.pages])
text

'MECHATRONICS 500 LEVEL\nMatric Fullname Week1 Week2 Week3 Week4 Week5 Week6 Week7 Week8 Week9 Week10 Week11 Week12 Week13\nNo\n*2020/8975 OBOH JORDAN OGHENEFEGO\n*2020/8977 ODEY EMMANUEL OBULE\n*2020/8994 SULOLA LAWRENCE OWOMITOLA\n*2020/8995 OLORUNTOBA VICTOR AYOMIDEJI\n*2020/8996 LEKWA-UZO ONYEDIKACHI NAOMI\n*2020/9035 ADEGBOYEGA PRECIOUS OLUSEGUN\n*2020/9051 ILORI JOSHUA AYOMIDE\n*2020/9061 KEHINDE JOY ABIOLA\n*2020/9081 JESUSINA TEMILOLUWA DANIEL\n*2020/9082 OTTAILOBHEGBE SAMUEL OKHIRIA\n*2020/9089 BLESSING GIDEON OGHENEKOME\n*2020/9092 ODOCK NEPHI NCHOR\n*2020/9104 EJERE BESTON OJE\n*2020/9119 ADEBOWALE AYODEJI AUGUSTINE\n*2020/9126 ALABI OLUWAFOLAHAN DAVID\n*2020/9141 IFE ALPHA OLASUNKANMI\n*2020/9142 OBA HANEEFAH OLUWATOYIN\n*2020/9147 AYEOMONI AYOMIDE OLUMUYIWA\n*2020/9152 OSINAIKE ONAOPEMIPO BABAFOLAYEMISI\n*2020/9163 ERINLE DAVID OYINDAMOLA\n*2020/9183 HARRY ISAAC JIM\n*2020/9187 ASIWAJU OLAWALE SAMSON\n*2020/9188 SOTONWA SAMUEL FRANKINCENSE\n*2020/9198 SHOBAYO OLASUNKANMI I

In [24]:
doc = nlp(text)
for token in doc:
    if token.like_num:
        print(token.text)
    

500
2020/8975
2020/8977
2020/8994
2020/8995
2020/8996
2020/9035
2020/9051
2020/9061
2020/9081
2020/9082
2020/9089
2020/9092
2020/9104
2020/9119
2020/9126
2020/9141
2020/9142
2020/9147
2020/9152
2020/9163
2020/9183
2020/9187
2020/9188
2020/9198
2020/9202
2020/9207
2020/9216
2020/9236
2020/9238
2020/9278
2020/9280
2020/9282
2020/9290
2020/9317
2020/9319
2020/9320
2020/9326
2020/9329
2020/9339
2020/9342
2020/9348
2020/9351
2020/9361
2020/9377
2020/9387
2020/9411
2020/9414
2020/9415
2020/9417
2020/9420
2020/9422
2020/9425
2020/9433
2020/9436
2020/9469
2020/9477
2020/9478
2020/9530
2020/9535
2020/9537
2020/9565
2020/9583
2020/9594
2020/9598
2020/9606
2020/9631
2020/9632
2020/9643
2020/9645
2020/9648
2020/9662
2020/9671
2020/9677
2020/9688
2020/9700
2020/9719
2020/9732
2020/9750
2020/9757
2020/9824
2020/9873
2021/10441
