<a href="https://colab.research.google.com/github/ScholliYT/manim-named-entity-recognition/blob/main/PG_Seminar_NLP_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt update
!sudo apt install libcairo2-dev ffmpeg \
    texlive texlive-latex-extra texlive-fonts-extra \
    texlive-latex-recommended texlive-science \
    tipa libpango1.0-dev
!pip install manim manim-presentation flair
!pip install IPython --upgrade

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net



In [None]:
from manim import *
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from manim_presentation import Slide
from typing import List

from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
config.background_color = BLACK
config.media_width = "100%"
text_color = WHITE

# Titlepage

In [231]:
%%manim -qm -v WARNING NamedEntityRecognition

class NamedEntityRecognition(Slide):

    def add_title(self):
        intro_words1 = Text("Named Entity Recognition", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(intro_words1)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()

        working_principle = Text("Funktionsweise", color=text_color).shift(UP)
        self.play(Write(working_principle))
        self.wait()
        self.pause()

        state_of_the_art = Text("State-of-the-art", color=text_color).next_to(working_principle, DOWN).shift(DOWN)
        self.play(Write(state_of_the_art))
        self.pause()
        self.wait()

        frameworks = Text("Frameworks", color=text_color).next_to(state_of_the_art, DOWN).shift(DOWN)
        self.play(Write(frameworks))
        self.pause()
        self.wait()



# Problem Statement

In [None]:
# load the NER tagger
tagger = SequenceTagger.load('de-ner')

In [251]:
example_text = "George Washington ging nach Washington"

sentence = Sentence(example_text)
# run NER over sentence
tagger.predict(sentence)

In [252]:
print(sentence)
print('The following NER tags are found:')

sentence.get_spans()

Sentence: "George Washington ging nach Washington"   [− Tokens: 5  − Token-Labels: "George <B-PER> Washington <E-PER> ging nach Washington <S-LOC>"]
The following NER tags are found:


[<PER-span (1,2): "George Washington">, <LOC-span (5): "Washington">]

In [253]:
%%manim -qm -v WARNING Problem

class Problem(Slide):

    def add_title(self):
        title = Text("Problem", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()

        text = Text(example_text, color=text_color).scale(0.8)
        self.play(Write(text))
        self.wait()
        self.pause()

        for entity in sentence.to_dict('ner')['entities']:
          start_offset = example_text[0:entity['start_pos']].count(" ")
          end_offset = example_text[0:entity['end_pos']].count(" ")
          fixed_start_pos = entity['start_pos'] - start_offset
          fixed_end_pos = entity['end_pos'] - end_offset
          print(entity, fixed_start_pos, fixed_end_pos, example_text.replace(" ", "")[fixed_start_pos:fixed_end_pos])


          tag_type = entity['labels'][0].value
          if tag_type == "PER":
            person_framebox = BackgroundRectangle(text[fixed_start_pos:fixed_end_pos], buff=.05, color=GREEN)
            person_label = Text("Person", color=GREEN).scale(0.8).next_to(person_framebox, DOWN)
            self.play(Create(person_framebox), Write(person_label))
          elif tag_type == "LOC": 
            location_framebox = BackgroundRectangle(text[fixed_start_pos:fixed_end_pos], buff=.05, color=RED)
            location_label = Text("Location", color=RED).scale(0.8).next_to(location_framebox, DOWN)
            self.play(Create(location_framebox), Write(location_label))
          elif tag_type == "ORG": 
            org_framebox = BackgroundRectangle(text[fixed_start_pos:fixed_end_pos], buff=.05, color=BLUE)
            org_label = Text("Organization", color=BLUE).scale(0.8).next_to(org_framebox, DOWN)
            self.play(Create(org_framebox), Write(org_label))
          self.wait()
          self.pause()



{'text': 'George Washington', 'start_pos': 0, 'end_pos': 17, 'labels': [PER (0.9977)]} 0 16 GeorgeWashington




{'text': 'Washington', 'start_pos': 28, 'end_pos': 38, 'labels': [LOC (0.9895)]} 24 34 Washington




# Motivation

In [None]:
example_text = "Max, Moritz, Anna und Nele fahren nach Köln"

# run NER over sentence
sentence = Sentence(example_text)
tagger.predict(sentence)

sentence.get_spans()

[<PER-span (1): "Max">,
 <PER-span (3): "Moritz">,
 <PER-span (5): "Anna">,
 <PER-span (7): "Nele">,
 <LOC-span (10): "Köln">]

In [None]:
%%manim -qm -v WARNING Motivation

class Motivation(Slide):

    def add_title(self):
        title = Text("Motivation", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()
        

        processing_pipeline_text = Text("Erster Teil einer Processing Pipeline", color=text_color)
        self.play(Write(processing_pipeline_text))
        self.wait()
        self.pause()

        processing_pipeline_text.generate_target()
        processing_pipeline_text.target.shift(2*UP)
        processing_pipeline_text.target.set_color(GRAY)
        processing_pipeline_text.target.scale(0.5)
        self.play(MoveToTarget(processing_pipeline_text))
        self.wait()
        self.pause()


        text = Text(example_text, color=text_color).scale(0.8)
        self.play(Write(text))
        self.wait()
        self.pause()


        question_text = Text("Wie viele Personen fahren nach Köln?", color=text_color).scale(0.7).shift(DOWN)
        self.play(Write(question_text))
        self.wait()
        self.pause()

        question_text.generate_target()
        question_text.target.shift(2*DOWN + 2*LEFT)
        self.play(MoveToTarget(question_text))
        self.wait()
        self.pause()



        # add tagging
        person_labels = []
        for entity in sentence.to_dict('ner')['entities']:
          start_offset = example_text[0:entity['start_pos']].count(" ")
          end_offset = example_text[0:entity['end_pos']].count(" ")
          fixed_start_pos = entity['start_pos'] - start_offset
          fixed_end_pos = entity['end_pos'] - end_offset
          print(entity, fixed_start_pos, fixed_end_pos, example_text.replace(" ", "")[fixed_start_pos:fixed_end_pos])


          tag_type = entity['labels'][0].value
          if tag_type == "PER":
            person_framebox = BackgroundRectangle(text[fixed_start_pos:fixed_end_pos], buff=.05, color=GREEN)
            person_label = Text("PER", color=GREEN).next_to(person_framebox, DOWN)
            self.play(Create(person_framebox), Write(person_label))
            person_labels.append(person_label)
          elif tag_type == "LOC": 
            location_framebox = BackgroundRectangle(text[fixed_start_pos:fixed_end_pos], buff=.05, color=RED)
            location_label = Text("LOC", color=RED).next_to(location_framebox, DOWN)
            self.play(Create(location_framebox), Write(location_label))
          self.wait()
          self.pause()


        person_count_text = Text(str(len(person_labels)), color=GREEN).next_to(question_text, RIGHT)
        self.play(Transform(VGroup(*person_labels), person_count_text))
        self.wait()
        self.pause()



{'text': 'Max', 'start_pos': 0, 'end_pos': 3, 'labels': [PER (0.9975)]} 0 3 Max




{'text': 'Moritz', 'start_pos': 5, 'end_pos': 11, 'labels': [PER (0.9998)]} 4 10 Moritz




{'text': 'Anna', 'start_pos': 13, 'end_pos': 17, 'labels': [PER (0.9979)]} 11 15 Anna




{'text': 'Nele', 'start_pos': 22, 'end_pos': 26, 'labels': [PER (0.9995)]} 18 22 Nele




{'text': 'Köln', 'start_pos': 39, 'end_pos': 43, 'labels': [LOC (0.9946)]} 32 36 Köln




# Working Principle

In [236]:
%%manim -qm -v WARNING OldWorkingPrinciple

class OldWorkingPrinciple(Slide):

    def add_title(self):
        title = Text("Funktionsweise", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()
        
        # Dictionary
        dictionary = Text("Dictionary / Lexikon", color=text_color).shift(1.5*UP)
        self.play(Write(dictionary))
        self.wait()
        self.pause()

        dictionary_example_persons = Text('PER = {"Max", "Moritz", "Anna", "Nele"}', color=GRAY).scale(0.8).next_to(dictionary, DOWN)
        dictionary_example_locations = Text('LOC = {"Dortmund", "Köln", "Berlin"}', color=GRAY).scale(0.8).next_to(dictionary_example_persons, DOWN)
        self.play(Write(dictionary_example_persons))
        self.play(Write(dictionary_example_locations))

        dictionary_group = Group(dictionary, dictionary_example_persons, dictionary_example_locations)

        dictionary_group.generate_target()
        dictionary_group.target.shift(4*LEFT + UP)
        dictionary_group.target.scale(0.5)
        self.play(MoveToTarget(dictionary_group))
        self.wait()
        self.pause()

        # Handcraft features
        handcraft_features = Text("Handgefertigte Merkmale", color=text_color)
        self.play(Write(handcraft_features))
        self.wait()
        self.pause()

        handcraft_features_capitalizaiton = Text('Großschreibung', color=GRAY).scale(0.8).next_to(handcraft_features, DOWN)
        handcraft_features_length = Text('Wortlänge', color=GRAY).scale(0.8).next_to(handcraft_features_capitalizaiton, DOWN)
        handcraft_features_alphabet = Text('Zeichensatz (Buchstaben, Zahlen...)', color=GRAY).scale(0.8).next_to(handcraft_features_length, DOWN)
        self.play(Write(handcraft_features_capitalizaiton))
        self.play(Write(handcraft_features_length))
        self.play(Write(handcraft_features_alphabet))



In [None]:
%%manim -qm -v WARNING NewWorkingPrinciple

class NewWorkingPrinciple(Slide):

    def add_title(self):
        title = Text("Funktionsweise", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()
        
        # NN
        text_nn = Text("Neural Networks", color=text_color)
        self.play(Write(text_nn))
        self.wait()
        self.pause()

        # RNN
        text_rnn = Text("Recurrent", color=text_color).next_to(text_nn, LEFT)
        self.play(Write(text_rnn))
        self.wait()
        self.pause()

        text_rrn = VGroup(text_nn, text_rnn)
        self.play(text_rrn.animate.shift(2*UP))


        # 



# Word Embedding

In [230]:
%%manim -qm -v WARNING AscendingIndexWordEmbedding

class AscendingIndexWordEmbedding(Slide):

    def add_title(self):
        title = Text("Naive Word Embedding", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()
        
        text = Text("Wörter aufsteigend nummerieren", color=text_color).shift(2*UP)
        self.play(Write(text))
        self.wait()
        self.pause()

        # RNN
        words = Text('{"Moritz", "Köln", "Dortmund", "Anna"}').scale(0.8)
        self.play(Write(words))
        self.wait()
        self.pause()

        words_sorted = Text('["Anna", "Dortmund", "Köln", "Moritz"]').scale(0.8)
        self.play(TransformMatchingShapes(words, words_sorted, run_time=3, path_arc=PI / 2))
        self.wait()
        self.pause()

        words_sorted_backup = words_sorted.copy()
        words_sorted_backup.generate_target()
        words_sorted_backup.target.set_color(GRAY)
        words_sorted_backup.target.shift(UP)
        self.play(MoveToTarget(words_sorted_backup))
        self.wait()
        self.pause()

        words_sorted_with_idx = Text('[0,1,2,3]').scale(0.8)
        self.play(ReplacementTransform(words_sorted, words_sorted_with_idx))
        self.wait()
        self.pause()

        embedding_elements = VGroup(words_sorted_backup, words_sorted_with_idx)
        embedding = Paragraph('embedding = {\n  "Anna": 0, "Dortmund": 1, \n  "Köln": 2, "Moritz": 3\n}').scale(0.8)
        self.play(ReplacementTransform(embedding_elements, embedding))
        self.wait()
        self.pause()


        # show problem with ascending numbering
        problem_text1 = Text('dist("Anna", "Dortmund") = 1').shift(2*DOWN).scale(0.8)
        problem_text2 = Text('dist("Anna", "Moritz") = 3').next_to(problem_text1, DOWN).scale(0.8)

        self.play(Write(problem_text1))
        self.wait()
        self.pause()

        self.play(Write(problem_text2))
        self.wait()
        self.pause()



In [229]:
%%manim -qm -v WARNING OneHotWordEmbedding

class OneHotWordEmbedding(Slide):

    def add_title(self):
        title = Text("One-hot Word Embedding", gradient=(BLUE, BLUE_D), should_center=True).scale(1.5).to_edge(UP)
        self.add(title)
    
    def construct(self):
        global text_color

        self.add_title()
        self.wait()
        self.pause()
        
        text = Tex("Basisvektoren im $\\mathbb{R}^n$", color=text_color).shift(2*UP)
        self.play(Write(text))
        self.wait()
        self.pause()

        moritz = Tex("Moritz = $\\left(\\begin{array}{c} 1 \\\\ 0 \\\\ 0 \\end{array}\\right)$").shift(4*LEFT)
        self.play(Write(moritz))

        koeln = Tex("K\\\"oln = $\\left(\\begin{array}{c} 0 \\\\ 1 \\\\ 0 \\end{array}\\right)$")
        self.play(Write(koeln))

        dortmund = Tex("Dortmund = $\\left(\\begin{array}{c} 0 \\\\ 0 \\\\ 1 \\end{array}\\right)$").shift(4.5*RIGHT)
        self.play(Write(dortmund))


        # show problem with ascending numbering
        problem_text1 = Text('dist("Moritz", "Köln") = dist("Moritz", "Dortmund")').to_edge(DOWN).scale(0.8)
        self.play(Write(problem_text1))
        self.wait()
        self.pause()



In [199]:
%%manim -qm -v WARNING OneHotWordEmbeddingPlot

class OneHotWordEmbeddingPlot(ThreeDScene):
    def construct(self):
        axes = ThreeDAxes(x_range=[-1.5,1.5,1], y_range=[-1.5,1.5,1], z_range=[-1.5,1.5,1], x_length=9, y_length=9, z_length=6)
        self.set_camera_orientation(phi=75 * DEGREES, theta=45 * DEGREES)
        text3d = Text("One-hot Word Embedding", gradient=(BLUE, BLUE_D)).scale(0.8)
        self.add_fixed_in_frame_mobjects(text3d)
        text3d.to_corner(UL)
        self.add(axes)
        self.wait()


        vec1 = Vector([3,0,0])
        vec2 = Vector([0,3,0])
        vec3 = Vector([0,0,2])

        self.play(Create(vec1))
        moritz = Text("Moritz").scale(0.5).move_to(2.5*LEFT)
        self.add_fixed_in_frame_mobjects(moritz)
        self.play(Create(moritz))
        self.wait()

        self.play(Create(vec2))
        koeln = Text("Köln").scale(0.5).move_to(2.5*RIGHT)
        self.add_fixed_in_frame_mobjects(koeln)
        self.play(Write(koeln))
        self.wait()

        self.play(Create(vec3))
        dortmund = Text("Dortmund").scale(0.5).move_to(2*UP+RIGHT)
        self.add_fixed_in_frame_mobjects(dortmund)
        self.play(Create(dortmund))
        self.wait()

        
        mk_line = Line(vec1.get_end(), vec2.get_end(), color=RED)
        self.play(Create(mk_line))
        self.wait()

        md_line = Line(vec1.get_end(), vec3.get_end(), color=RED)
        self.play(ReplacementTransform(mk_line, md_line))
        self.wait()



In [228]:
%%manim -qm -v WARNING WordEmbedding

class WordEmbedding(Scene):
    def add_title(self):
        title = Text("Semantical Word Embedding", gradient=(BLUE, BLUE_D), should_center=True).scale(1.3).to_edge(UP)
        self.add(title)

    def construct(self):
        self.add_title()

        numberplane = NumberPlane().shift(2*DOWN) # y_length=5
        self.add(numberplane)

        arrow = Arrow(2*DOWN, [2, 1, 0], buff=0)
        tip_text = Text('Dortmund').next_to(arrow.get_end(), RIGHT)
        self.play(Create(arrow), Write(tip_text))

        arrow = Arrow(2*DOWN, [2, 0, 0], buff=0)
        tip_text = Text('Köln').next_to(arrow.get_end(), RIGHT)
        self.play(Create(arrow), Write(tip_text))


        arrow = Arrow(2*DOWN, [-3, 0, 0], buff=0)
        tip_text = Text('Moritz').next_to(arrow.get_end(), LEFT)
        self.play(Create(arrow), Write(tip_text))

        arrow = Arrow(2*DOWN, [-4, -1, 0], buff=0)
        tip_text = Text('Anna').next_to(arrow.get_end(), LEFT)
        self.play(Create(arrow), Write(tip_text))

