## Reading the 3 PDFs that we will use as training data

In [1]:
pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pypdf import PdfReader

# Locations of the three course syllabi (bare file names, so they are resolved
# against the current working directory).
ceg4166_path = "CEG4166Syllabus.pdf"
csi3131_path = "CSI3131 Course Syllabus.pdf"
ceg3185_path = "CEG3185A _ course_outline.pdf"

# One reader per syllabus.
reader = PdfReader(ceg4166_path)
reader2 = PdfReader(csi3131_path)
reader3 = PdfReader(ceg3185_path)

def returnText(reader):
    """Concatenate the extracted text of every page of a PDF.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects that each
            provide an ``extract_text()`` method (here a ``pypdf.PdfReader``).

    Returns:
        str: The text of all pages joined in page order with no separator;
        an empty string when there are no pages.
    """
    # Join once instead of growing a string page by page. A page whose
    # extraction yields None contributes nothing rather than raising TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
    


In [3]:
# Pull the full text out of each syllabus, in the same order as the readers.
text1, text2, text3 = (returnText(pdf) for pdf in (reader, reader2, reader3))

## Training NER

In [4]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

In [5]:
from spacy.cli.download import download

# Only fetch the model when it is not already installed, so that re-running
# the notebook does not download and reinstall it every time.
if not spacy.util.is_package("en_core_web_sm"):
    download(model="en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load the pretrained small English pipeline; it is used below to look at the
# entities it predicts out of the box.
nlp1 = spacy.load('en_core_web_sm')

In [7]:
# Run the pretrained pipeline over the text of the first syllabus.
docx = nlp1(text1)

In [8]:
# Print every entity span found by the pretrained pipeline with its label.
for ent in docx.ents:
    print(ent.text, ent.label_)
  

EECS ORG
École de science informatique et 
de génie PERSON
SIGE ORG
3 CARDINAL
1.5 DATE
3 CARDINAL
Winter 202 DATE
3 CARDINAL
Instructor ORG
Gilbert Arbez PERSON
MS Teams ORG
613 CARDINAL
6315 DATE
1 CARDINAL
fed ORG
hese CARDINAL
2 CARDINAL
weekly DATE
lea der PERSON
2/9 3 CARDINAL
4 CARDINAL
SCHEDULE ORG
Mondays 13h00 to 14h30  
Wednesdays 11h30 to DATE
Mondays 17h30 DATE
Tuesdays 11h30 to DATE
Friday 14h30 to 17h20 DATE
Thursday 13h00 to 15h50 DATE
only one CARDINAL
Tuesdays 14h 30 to 1 5h50 DATE
Wednesdays  11h30 DATE
Saturdays  11h30 to 14h 20 
Wednesdays  16h00 to 1 8h50 DATE
only one CARDINAL
5 CARDINAL
Brightspace ORG
6-TEACHING QUANTITY
Haopeng Wang PERSON
Palwasha Waheed Shaikh PERSON
Edwin Thomas PERSON
Abhillash Paal ORG
two CARDINAL
7 CARDINAL
LECTURE PERSON
Phillip Laplante PERSON
Tools PERSON
Practitioner PERSON
Wiley PERSON
4th ORDINAL
2012 DATE
3/9 8 CARDINAL
Brightspace ORG
4 CARDINAL
Two CARDINAL
2 CARDINAL
10 CARDINAL
4 CARDINAL
11 CARDINAL
Brigh NORP
4 CARDINAL
Not

In [9]:
import re


def _find_literal(phrase, text):
    """Return the match object for the first occurrence of ``phrase`` in ``text``.

    The phrase is matched literally: regex metacharacters in it (such as the
    trailing "." of "Draft Design.") are escaped rather than interpreted.
    Only the first occurrence is returned.

    Args:
        phrase: Exact string to look for.
        text: Text to search in.

    Returns:
        re.Match: Match whose ``span()`` gives the (start, end) character offsets.

    Raises:
        ValueError: If ``phrase`` does not occur in ``text``.
    """
    match = re.search(re.escape(phrase), text)
    if match is None:
        # Fail here with the offending phrase instead of an AttributeError on None.
        raise ValueError("Phrase %r was not found in the text" % (phrase,))
    return match


# 1st syllabus
d1 = _find_literal("Deliverable 1 – Draft Design.", text1)
d1s, d1e = d1.span()
d2 = _find_literal("Deliverable 2 – Reviewed design and implementation", text1)
d2s, d2e = d2.span()
d3 = _find_literal("Deliverable 3 – Demonstration of unit tests and final product", text1)
d3s, d3e = d3.span()
# The lookup for "Deliverable 4 – Final version" is disabled.
mid1 = _find_literal('Midterm Quiz', text1)
mid1s, mid1e = mid1.span()
fin1 = _find_literal('Final Exam', text1)
fin1s, fin1e = fin1.span()
proj1 = _find_literal('Project', text1)
proj1s, proj1e = proj1.span()

# 2nd syllabus
assignments = _find_literal('Assignments', text2)
assignmentss, assignmentse = assignments.span()
mid2 = _find_literal('Midterm', text2)
mid2s, mid2e = mid2.span()
fin2 = _find_literal('Final Exam', text2)
fin2s, fin2e = fin2.span()

# 3rd syllabus
labs = _find_literal('Labs', text3)
labss, labse = labs.span()

mid3 = _find_literal('Midterm 1', text3)
mid3s, mid3e = mid3.span()

mid4 = _find_literal('Midterm 2', text3)
mid4s, mid4e = mid4.span()

fin3 = _find_literal('Final Exam', text3)
fin3s, fin3e = fin3.span()


In [11]:
# Training data: one (text, annotations) pair per syllabus. Each annotation is
# a (start_char, end_char, label) triple using the offsets computed above.
import re

TRAIN_DATA = [
    # 1st syllabus
    (text1, {
        'entities': [
            (d1s, d1e, 'DELIVERABLE'),
            (d2s, d2e, 'DELIVERABLE'),
            (d3s, d3e, 'DELIVERABLE'),
            (mid1s, mid1e, 'EXAM'),
            (fin1s, fin1e, 'EXAM'),
            (proj1s, proj1e, 'DELIVERABLE'),
        ],
    }),
    # 2nd syllabus
    (text2, {
        'entities': [
            (assignmentss, assignmentse, 'DELIVERABLE'),
            (mid2s, mid2e, 'EXAM'),
            (fin2s, fin2e, 'EXAM'),
        ],
    }),
    # 3rd syllabus
    (text3, {
        'entities': [
            (labss, labse, 'DELIVERABLE'),
            (mid3s, mid3e, 'EXAM'),
            (mid4s, mid4e, 'EXAM'),
            (fin3s, fin3e, 'EXAM'),
        ],
    }),
]

In [12]:
# Run configuration.
n_iter = 100  # number of passes over the training data
model = None  # pipeline to load; None means a blank English pipeline is created instead
# The save cell reassigns output_dir to another location before writing.
output_dir = Path("C:\\Users\\This PC\\Documents\\nlpModel")

In [13]:
# Start from a blank English pipeline unless an existing model was configured.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank model


## Setting up pipelines

In [14]:
# Reuse the pipeline's 'ner' component when it already has one (so labels can
# be added to it); otherwise append a new one at the end of the pipeline.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.add_pipe('ner', last=True)

## Training the recognizer

Reference: <https://github.com/dreji18/NER-Training-Spacy-3.0/blob/main/NER%20Training%20with%20Spacy%20v3%20Notebook.ipynb>

In [15]:
import pandas as pd
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

# NOTE: this rebinds `nlp` from the pipeline set up in the earlier cells to the
# pretrained en_core_web_sm pipeline; every later cell that uses `nlp` sees this one.
nlp = spacy.load("en_core_web_sm")

db = DocBin()  # collects the annotated docs for serialization

for text, annot in tqdm(TRAIN_DATA):  # (text, {"entities": [...]}) pairs
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # Map character offsets onto a token span; None means no span could be built.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
            continue
        print("Skipping entity")
    doc.ents = ents  # attach the gold entities to the doc
    db.add(doc)

# Change the process working directory, then write the DocBin relative to it.
os.chdir(r'C:\Users\ghaza\nlp')
db.to_disk("./train.spacy")

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 34.46it/s]


In [16]:
from spacy.training.example import Example

In [17]:
# `nlp` was re-bound to a different pipeline after `ner` was first obtained, so
# fetch the component from the pipeline that is actually trained below;
# otherwise the labels are registered on a component `nlp` no longer owns.
ner = nlp.get_pipe('ner')
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])  # ent is (start, end, label)

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.select_pipes(disable=other_pipes):  # only train NER
    optimizer = nlp.create_optimizer()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)  # shuffles the shared list in place
        losses = {}
        for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
            # Build one Example per (text, annotations) pair and update once
            # per minibatch, using the optimizer created above.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, sgd=optimizer, losses=losses, drop=0.3)
        print(losses)

{'ner': 557.4590101250651}
{'ner': 180.83875484897516}
{'ner': 126.40840294813073}
{'ner': 39.26675892186613}
{'ner': 26.54540804466994}
{'ner': 25.05896624049315}
{'ner': 23.090473680894068}
{'ner': 21.534615926458457}
{'ner': 21.514086424433074}
{'ner': 123.08864090469531}
{'ner': 105.33404466463328}
{'ner': 85.60839599410775}
{'ner': 18.652399281430803}
{'ner': 22.685176983851406}
{'ner': 25.079620752473325}
{'ner': 22.2745975704428}
{'ner': 21.15490576638379}
{'ner': 19.28505366164323}
{'ner': 12.904758543587006}
{'ner': 13.267684798283312}
{'ner': 10.37103143788726}
{'ner': 9.694698553712067}
{'ner': 11.036047005082793}
{'ner': 41.25622917445363}
{'ner': 9.722019961840234}
{'ner': 7.92661672164706}
{'ner': 6.942452295111169}
{'ner': 5.318494005477698}
{'ner': 6.385004835452972}
{'ner': 4.848225221088654}
{'ner': 48.52949068218585}
{'ner': 36.47894923708374}
{'ner': 14.932739053709335}
{'ner': 51.260489786096144}
{'ner': 16.80549934001336}
{'ner': 36.70284300683904}
{'ner': 5.64016

## Testing the trained model

In [18]:
# Run the trained pipeline over the same texts it was trained on and print the
# predicted entities, followed by each token's entity type and IOB code.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    entity_pairs = [(span.text, span.label_) for span in doc.ents]
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print('Entities', entity_pairs)
    print('Tokens', token_rows)

Entities [('Project', 'DELIVERABLE'), ('Deliverable 1 – Draft Design.', 'DELIVERABLE'), ('Deliverable 2 – Reviewed design and implementation', 'DELIVERABLE'), ('Deliverable 3 – Demonstration of unit tests and final product', 'DELIVERABLE'), ('Midterm Quiz', 'EXAM'), ('Final Exam', 'EXAM')]
Tokens [(' \n  ', '', 2), ('1/9', '', 2), ('\n \n \n \n \n', '', 2), ('School', '', 2), ('of', '', 2), ('electrical', '', 2), ('Engineering', '', 2), ('\n', '', 2), ('and', '', 2), ('Computer', '', 2), ('Science', '', 2), (' ', '', 2), ('(', '', 2), ('EECS', '', 2), (')', '', 2), (' ', '', 2), ('École', '', 2), ('de', '', 2), ('science', '', 2), ('informatique', '', 2), ('et', '', 2), ('\n', '', 2), ('de', '', 2), ('génie', '', 2), ('électrique', '', 2), (' ', '', 2), ('(', '', 2), ('SIGE', '', 2), (')', '', 2), ('  \n \n', '', 2), ('CEG4166', '', 2), ('/', '', 2), ('CSI4141', '', 2), (' \n', '', 2), ('Real', '', 2), ('Time', '', 2), ('Systems', '', 2), ('Design', '', 2), (' ', '', 2), ('(', '', 2), 

Entities [('Midterm', 'EXAM'), ('Final Exam', 'EXAM'), ('Assignments', 'DELIVERABLE')]
Tokens [('CSI', '', 2), ('3131', '', 2), ('Winter', '', 2), ('2023', '', 2), ('\n', '', 2), ('Operating', '', 2), ('Systems', '', 2), ('\n', '', 2), ('Instructor', '', 2), (':', '', 2), ('Mohammad', '', 2), ('Alnabhan', '', 2), ('\n', '', 2), ('E', '', 2), ('-', '', 2), ('mail', '', 2), (':', '', 2), ('Malnabha@uottawa.ca', '', 2), ('\n', '', 2), ('Office', '', 2), (':', '', 2), ('STE', '', 2), ('5084', '', 2), ('\n', '', 2), ('Office', '', 2), ('Hours', '', 2), (':', '', 2), ('Monday', '', 2), ('@', '', 2), ('3:00', '', 2), ('PM', '', 2), ('–', '', 2), ('4:00', '', 2), ('PM', '', 2), ('\n', '', 2), ('Course', '', 2), ('Website', '', 2), (':', '', 2), ('Brightspace', '', 2), ('\n', '', 2), ('Enquiry', '', 2), ('Center', '', 2), (':', '', 2), ('Brightspace', '', 2), ('Discussion', '', 2), ('Group', '', 2), ('–', '', 2), ('NO', '', 2), ('EMAILSTextbook', '', 2), ('and', '', 2), ('Reading', '', 2), ('\n

## Save the Model

In [19]:
# save model to output directory
if output_dir is not None:
    # NOTE: the previously configured output_dir is replaced by this fixed location.
    output_dir = Path(r"C:\Users\ghaza\Documents\4910 proj\NLP-Experiment")
    # Create missing parent folders as well, and do nothing if the folder already exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)


Saved model to C:\Users\ghaza\Documents\4910 proj\NLP-Experiment


## Test The Saved Model

In [20]:
# Reload the pipeline from disk and repeat the check on the training texts:
# predicted entities first, then each token's entity type and IOB code.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    entity_pairs = [(span.text, span.label_) for span in doc.ents]
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print('Entities', entity_pairs)
    print('Tokens', token_rows)

Loading from C:\Users\ghaza\Documents\4910 proj\NLP-Experiment
Entities [('Project', 'DELIVERABLE'), ('Deliverable 1 – Draft Design.', 'DELIVERABLE'), ('Deliverable 2 – Reviewed design and implementation', 'DELIVERABLE'), ('Deliverable 3 – Demonstration of unit tests and final product', 'DELIVERABLE'), ('Midterm Quiz', 'EXAM'), ('Final Exam', 'EXAM')]
Tokens [(' \n  ', '', 2), ('1/9', '', 2), ('\n \n \n \n \n', '', 2), ('School', '', 2), ('of', '', 2), ('electrical', '', 2), ('Engineering', '', 2), ('\n', '', 2), ('and', '', 2), ('Computer', '', 2), ('Science', '', 2), (' ', '', 2), ('(', '', 2), ('EECS', '', 2), (')', '', 2), (' ', '', 2), ('École', '', 2), ('de', '', 2), ('science', '', 2), ('informatique', '', 2), ('et', '', 2), ('\n', '', 2), ('de', '', 2), ('génie', '', 2), ('électrique', '', 2), (' ', '', 2), ('(', '', 2), ('SIGE', '', 2), (')', '', 2), ('  \n \n', '', 2), ('CEG4166', '', 2), ('/', '', 2), ('CSI4141', '', 2), (' \n', '', 2), ('Real', '', 2), ('Time', '', 2), ('Sy

Entities [('Midterm', 'EXAM'), ('Final Exam', 'EXAM'), ('Assignments', 'DELIVERABLE')]
Tokens [('CSI', '', 2), ('3131', '', 2), ('Winter', '', 2), ('2023', '', 2), ('\n', '', 2), ('Operating', '', 2), ('Systems', '', 2), ('\n', '', 2), ('Instructor', '', 2), (':', '', 2), ('Mohammad', '', 2), ('Alnabhan', '', 2), ('\n', '', 2), ('E', '', 2), ('-', '', 2), ('mail', '', 2), (':', '', 2), ('Malnabha@uottawa.ca', '', 2), ('\n', '', 2), ('Office', '', 2), (':', '', 2), ('STE', '', 2), ('5084', '', 2), ('\n', '', 2), ('Office', '', 2), ('Hours', '', 2), (':', '', 2), ('Monday', '', 2), ('@', '', 2), ('3:00', '', 2), ('PM', '', 2), ('–', '', 2), ('4:00', '', 2), ('PM', '', 2), ('\n', '', 2), ('Course', '', 2), ('Website', '', 2), (':', '', 2), ('Brightspace', '', 2), ('\n', '', 2), ('Enquiry', '', 2), ('Center', '', 2), (':', '', 2), ('Brightspace', '', 2), ('Discussion', '', 2), ('Group', '', 2), ('–', '', 2), ('NO', '', 2), ('EMAILSTextbook', '', 2), ('and', '', 2), ('Reading', '', 2), ('\n