https://stanfordnlp.github.io/CoreNLP/memory-time.html
https://medium.com/sicara/train-ner-model-with-nltk-stanford-tagger-english-french-german-6d90573a9486

# Load Stanford NER tagger

In [20]:
import nltk
from nltk.tag.stanford import StanfordNERTagger

sentence = u"Twenty miles east of Reno, Nev., " \
    "where packs of wild mustangs roam free through " \
    "the parched landscape, Tesla Gigafactory 1 " \
    "sprawls near Interstate 80."

jar = './stanford-ner-tagger/stanford-ner.jar'
model = './stanford-ner-tagger/ner-model-english.ser.gz'

# Prepare NER tagger with english model
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tokenize: Split sentence into words
words = nltk.word_tokenize(sentence)

# Run NER tagger on words
print(ner_tagger.tag(words))

[('Twenty', 'O'), ('miles', 'O'), ('east', 'O'), ('of', 'O'), ('Reno', 'LOCATION'), (',', 'O'), ('Nev.', 'LOCATION'), (',', 'O'), ('where', 'O'), ('packs', 'O'), ('of', 'O'), ('wild', 'O'), ('mustangs', 'O'), ('roam', 'O'), ('free', 'O'), ('through', 'O'), ('the', 'O'), ('parched', 'O'), ('landscape', 'O'), (',', 'O'), ('Tesla', 'ORGANIZATION'), ('Gigafactory', 'ORGANIZATION'), ('1', 'O'), ('sprawls', 'O'), ('near', 'O'), ('Interstate', 'LOCATION'), ('80', 'LOCATION'), ('.', 'O')]


# Train model

1) Load train dataset

In [28]:
import spacy 
# !python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
import import_ipynb
from Jsonl_to_IOB import *
path = 'all.jsonl'
data = get_data(path)
tokenized = tokenized_output(data)
tokenized.head()

In [30]:
tokenized.head()

Unnamed: 0,doc_id,para_id,tokens,ner_tags
0,0,0,"[Maßnahmenbekanntgabe, zu, MA, 40, ,, Prüfung,...","[O, O, B-ORG, I-ORG, O, O, O, O]"
1,0,1,[INHALTSVERZEICHNIS],[O]
2,0,2,[ABKÜRZUNGSVERZEICHNIS],[O]
3,0,3,"[bzw., beziehungsweise, Nr., Nummer]","[O, O, O, O]"
4,0,4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O..."


In [37]:
for i in range(len(tokenized[:4])):
    for j in range(len(tokenized['tokens'][i])):
        print(tokenized['tokens'][i][j],"\t",tokenized['ner_tags'][i][j],sep = "")
    print("")

Maßnahmenbekanntgabe	O
zu	O
MA	B-ORG
40	I-ORG
,	O
Prüfung	O
der	O
Nebenbeschäftigungen	O

INHALTSVERZEICHNIS	O

ABKÜRZUNGSVERZEICHNIS	O

bzw.	O
beziehungsweise	O
Nr.	O
Nummer	O



In [43]:
with open("train.tsv", "w", encoding = 'utf-8') as record_file:
    for i in range(len(tokenized)):
        for j in range(len(tokenized['tokens'][i])):
#             print(tokenized['tokens'][i][j],"\t",tokenized['ner_tags'][i][j],sep = "")
            record_file.write(tokenized['tokens'][i][j]+"\t"+tokenized['ner_tags'][i][j]+"\n")
        record_file.write("\n")

In [59]:
!cd stanford-ner-tagger/
# !java -cp "stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop prop.txt
# With the -cp argument you provide the classpath i.e. path(s) to additional classes or libraries
# that your program may require when being compiled or run

In [75]:
!java -mx4g -cp "./stanford-ner-tagger/stanford-ner.jar" edu.stanford.nlp.ie.crf.CRFClassifier -prop stanford-ner-tagger/prop.txt

Invoked on Sun Oct 09 21:14:10 CEST 2022 with arguments: -prop stanford-ner-tagger/prop.txt
usePrevSequences=true
useClassFeature=true
useTypeSeqs2=true
useSequences=true
wordShape=chris2useLC
useTypeySequences=true
useDisjunctive=true
noMidNGrams=true
serializeTo=dummy-ner-model-german.ser.gz
maxNGramLeng=6
useNGrams=true
usePrev=true
useNext=true
maxLeft=1
trainFile=stanford-ner-tagger/train.tsv
map=word=0,answer=1
useWord=true
useTypeSeqs=true
numFeatures = 314356
Time to convert docs to feature indices: 4.0 seconds
Current memory used: 344m
numClasses: 3 [0=O,1=B-ORG,2=I-ORG]
numDocuments: 3705
numDatums: 182818
numFeatures: 314356
Time to convert docs to data/labels: 3.2 seconds
Current memory used: 370m
Running gradient on 16 threads
numWeights: 1666554
QNMinimizer called on double function of 1666554 variables, using M = 25.
               An explanation of the output:
Iter           The number of iterations
evals          The number of function evaluations
SCALING        <D> Di

QNMinimizer terminated due to average improvement: | newest_val - previous_val | / |newestVal| < TOL 
Total time spent in optimization: 97.58s
CRFClassifier training ... done [106.7 sec].
Serializing classifier to dummy-ner-model-german.ser.gz... done.


In [77]:
# coding: utf-8

import nltk
from nltk.tag.stanford import StanfordNERTagger

sentence = "Erledigung des Prüfungsberichtes Der Stadtrechnungshof Wien unterzog die Gebarung der Vienna Film Commission GmbH in den Jahren 2015 bis 2017 einer Prüfung. Der diesbezügliche Bericht des Stadtrechnungshofes Wien wurde am 8. Oktober 2019 veröffentlicht und im Rahmen der Sitzung des Stadtrechnungshofausschusses vom 16. Oktober 2019, Ausschusszahl 63/19 mit Beschluss zur Kenntnis genommen."
jar = './stanford-ner-tagger/stanford-ner.jar'
model = 'dummy-ner-model-german.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

words = nltk.word_tokenize(sentence)
print(ner_tagger.tag(words))

[('Erledigung', 'O'), ('des', 'O'), ('Prüfungsberichtes', 'O'), ('Der', 'O'), ('Stadtrechnungshof', 'B-ORG'), ('Wien', 'I-ORG'), ('unterzog', 'O'), ('die', 'O'), ('Gebarung', 'O'), ('der', 'O'), ('Vienna', 'O'), ('Film', 'O'), ('Commission', 'O'), ('GmbH', 'O'), ('in', 'O'), ('den', 'O'), ('Jahren', 'O'), ('2015', 'O'), ('bis', 'O'), ('2017', 'O'), ('einer', 'O'), ('Prüfung', 'O'), ('.', 'O'), ('Der', 'O'), ('diesbezügliche', 'O'), ('Bericht', 'O'), ('des', 'O'), ('Stadtrechnungshofes', 'B-ORG'), ('Wien', 'I-ORG'), ('wurde', 'O'), ('am', 'O'), ('8', 'O'), ('.', 'O'), ('Oktober', 'O'), ('2019', 'O'), ('veröffentlicht', 'O'), ('und', 'O'), ('im', 'O'), ('Rahmen', 'O'), ('der', 'O'), ('Sitzung', 'O'), ('des', 'O'), ('Stadtrechnungshofausschusses', 'B-ORG'), ('vom', 'O'), ('16', 'O'), ('.', 'O'), ('Oktober', 'O'), ('2019', 'O'), (',', 'O'), ('Ausschusszahl', 'O'), ('6319', 'O'), ('mit', 'O'), ('Beschluss', 'O'), ('zur', 'O'), ('Kenntnis', 'O'), ('genommen', 'O'), ('.', 'O')]
