# 01

In [6]:
import pandas as pd

# Read the first million rows of the questions dump, keeping only the
# question id and its title.
df = pd.read_csv(
    "Questions.csv",
    nrows=1_000_000,
    encoding="ISO-8859-1",
    usecols=["Title", "Id"],
)
# Plain Python list of titles used by the experiments below.
titles = df["Title"].tolist()

In [2]:
def has_golang(text):
    """Naive detector: True when ``text`` contains " go " (lowercase, space-padded).

    The check is a case-sensitive substring test, so it misses "Go" and a
    trailing "go", and it also fires on the ordinary English verb.
    """
    needle = " go "
    return needle in text

g = (title for title in titles if has_golang(title))
[next(g) for i in range(2)]

['Where does Console.WriteLine go in ASP.NET?',
 'Should try...catch go inside or outside a loop?']

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 491.5 kB/s eta 0:00:26
      --------------------------------------- 0.3/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.6/12.8 MB 3.0 MB/s eta 0:00:05
     ----- ---------------------------------- 1.9/12.8 MB 7.1 MB/s eta 0:00:02
     -------- ------------------------------- 2.8/12.8 MB 8.8 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 8.6 MB/s eta 0:00:02
     ------------ --------------------------- 4.0/12.8 MB 10.2 MB/s eta 0:00:01
     -------------- -------------------------

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [5]:
[t for t in nlp("My name is Rishit.")]

[My, name, is, Rishit, .]

In [6]:
doc = nlp("My name is Rishit.")
t = doc[0]
type(t)

spacy.tokens.token.Token

In [17]:
from spacy import displacy

displacy.render(doc)

In [8]:
spacy.explain("poss")

'possession modifier'

In [9]:
for t in nlp('Where does Console.WriteLine go in ASP.NET?'):
    print(t, t.pos_, t.dep_)

Where SCONJ advmod
does VERB ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


## Detecting Golang

In [10]:
df = (pd.read_csv("Questions.csv", nrows = 2_000_000,
                 encoding = "ISO-8859-1", usecols = ['Title', 'Id']))

titles = [_ for _ in df.loc[lambda d: d['Title'].str.lower().str.contains("go")]['Title']]

In [11]:
nlp = spacy.load("en_core_web_sm", disable = ['ner'])

In [12]:
%%time

def has_golang(doc):
    """Return True if any token looks like a mention of the Go language.

    A token qualifies when its lowercase form is "go" or "golang", its
    part-of-speech tag is not VERB, and its dependency label is "pobj"
    (object of a preposition).
    """
    go_words = ['go', 'golang']
    return any(
        t.lower_ in go_words and t.pos_ != 'VERB' and t.dep_ == "pobj"
        for t in doc
    )

g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
[next(g) for i in range(10)]

CPU times: total: 3.98 s
Wall time: 4.02 s


[How do I disable multiple listboxes in one go using jQuery?,
 Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 Trouble reading from a socket in go,
 Convert string to integer type in Go?,
 Implementing the âdeferâ statement from Go in Objective-C?,
 what's the state of go language IDE support?]

In [13]:
displacy.render(nlp('Removing all event handlers in one go'))

In [14]:
spacy.explain('csubj')

'clausal subject'

## Iteration

In [15]:
df_tags = pd.read_csv('Tags.csv')
go_ids = df_tags.loc[lambda d: d['Tag'] == 'go']['Id']

def has_go_token(doc):
    """Return True if ``doc`` contains a token whose lowercase form is "go" or "golang"."""
    return any(t.lower_ in ['go', 'golang'] for t in doc)

all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids)]['Title'].to_list()
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

non_detectable = (df
                 .loc[lambda d: ~d['Id'].isin(go_ids)]
                 .loc[lambda d: d['Title'].str.lower().str.contains("go")]
                 ['Title']
                 .to_list())

non_detectable = [d.text for d in nlp.pipe(non_detectable) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1858, 1208, 1696)

In [16]:
model_name = 'en_core_web_sm'
model = spacy.load(model_name, disable = ['ner'])

def has_go_token(doc):
    """Return True if ``doc`` has a "go"/"golang" token that is not tagged as a verb.

    This redefines the earlier ``has_go_token`` with an extra part-of-speech
    filter (the "not-verb" method being benchmarked in this cell).
    """
    for token in doc:
        if token.lower_ not in ['go', 'golang']:
            continue
        if token.pos_ == 'VERB':
            continue
        return True
    return False

# Label recorded alongside the scores so runs with different rules can be compared.
method = "not-verb"

# `detectable` holds titles tagged `go` that contain a go/golang token, so every
# hit there is a true positive.
correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
# `non_detectable` holds titles NOT tagged `go` that still contain a go/golang
# token, so every hit there is a false positive.
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))
# NOTE: raises ZeroDivisionError if the rule fires on nothing at all.
precision = correct / (correct + wrong)
recall = correct / len(detectable)
# (true positives + true negatives) / all evaluated titles
accuracy = (correct + len(non_detectable) - wrong) / (len(detectable) + len(non_detectable))

# Last expression of the cell: displayed as "precision, recall, accuracy, model, method".
f"{precision}, {recall}, {accuracy}, {model_name}, {method}"

'0.9098922624877571, 0.7690397350993378, 0.8722451790633609, en_core_web_sm, not-verb'

# 02 Detecting more languages

In [17]:
import spacy
import pandas as pd
from spacy import displacy

In [18]:
nlp = spacy.load('en_core_web_sm')

In [19]:
def has_go_token(doc):
    """Return True if ``doc`` has a non-verb token naming one of the listed languages.

    The comparison is per token, so a multi-token name only matches if the
    tokenizer keeps it as a single token.
    """
    language_words = ['go', 'golang', 'python', 'ruby', 'objective-c']
    return any(t.lower_ in language_words and t.pos_ != 'VERB' for t in doc)

In [20]:
doc = nlp('i like to program in objective-c')
has_go_token(doc)

False

In [21]:
[t for t in doc]

[i, like, to, program, in, objective, -, c]

In [22]:
obj_c_pattern = [{'LOWER': 'objective'}, {'IS_PUNCT': True}, {'LOWER': 'c'}]

golang_pattern = [{'LOWER': {'IN': ['go', 'golang']}, 'POS':{'NOT_IN': ['VERB']}}]

In [23]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
matcher.add("OBJ_C_LANG", [obj_c_pattern])
matcher.add("GOLANG_LANG", [golang_pattern])

In [24]:
doc = nlp("I code both in golang as well as objective-c")
for match_id, start, end in matcher(doc):
    print(doc[start: end])

golang
objective-c


In [25]:
doc[5:8]

as well as

In [26]:
# cleaned-up-code

In [27]:
from spacy.matcher import Matcher

obj_c_pattern_1 = [{'LOWER': 'objective'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'c'}]
obj_c_pattern_2 = [{'LOWER': 'objectivec'}]

golang_pattern_1 = [{'LOWER': 'golang'}]
golang_pattern_2 = [{'LOWER': 'go',
                    'POS': {'NOT_IN': ['VERB']}}]

python_pattern = [{'LOWER': 'python'}]
ruby_pattern = [{'LOWER': 'ruby'}]
js_pattern = [{'LOWER': {'IN': ['js', 'javascript']}}]

matcher = Matcher(nlp.vocab)
matcher.add("OBJ_C_LANG", [obj_c_pattern_1, obj_c_pattern_2])
matcher.add("GOLANG_LANG", [golang_pattern_1, golang_pattern_2])
matcher.add("PYTHON_LANG", [python_pattern])
matcher.add("JS_LANG", [js_pattern])
matcher.add("RUBY_LANG", [ruby_pattern])

In [28]:
doc = nlp("I code in both python, go/golang as well as objective-c")
for match_id, start, end in matcher(doc):
    print(doc[start: end])

python
golang
objective-c


In [29]:
doc = nlp("I've done some js and ruby and go programming")
for match_id, start, end in matcher(doc):
    print(doc[start: end])

js
ruby


In [30]:
[(t, t.pos_) for t in doc]

[(I, 'PRON'),
 ('ve, 'AUX'),
 (done, 'VERB'),
 (some, 'DET'),
 (js, 'ADJ'),
 (and, 'CCONJ'),
 (ruby, 'PROPN'),
 (and, 'CCONJ'),
 (go, 'VERB'),
 (programming, 'NOUN')]

## Benchmarking

In [31]:
import pandas as pd

df = (pd.read_csv("Questions.csv", nrows = 1_000_000,
                 encoding = 'ISO-8859-1', usecols = ['Title', 'Id']))

In [32]:
titles = (_ for _ in df['Title'] if "objective" in _.lower())

In [33]:
for i in range(200):
    doc = nlp(next(titles))
    if len(matcher(doc)) == 0:
        print(doc)

Having to set objectives for developers, even though objectives don't work
How can i connect MySQL database with objective project?
Downloading multiple files in iphone app(Objective c)
Including Objective C++ Type in C++ Class Definition
Storing UIImages with ObjectiveRecord and ObjectiveSync
__OBJC__ equivalent for Objective-C++
iPhone Device/Simulator memory oddities using Objective-C++
How well is Objective-J documented? Is the documentation good enough to start using it seriously?


# 03 

In [34]:
titles = (_ for _ in df['Title'])
g = (d for d in nlp.pipe(titles) if len(matcher(d)) > 0)
for i in range(10):
    print(next(g))

How do I run Rake tasks within a Ruby script?
How can I create Prototype Methods (like JavaScript) in C#.Net?
Some kind of task manager for JavaScript in Firefox 3?
Best practices for managing and deploying large JavaScript apps
Create an encrypted ZIP file in Python
Code to ask yes/no question in javascript
Executing JavaScript from Flex: Is this javascript function dangerous?
What Javascript rich text editor will not break the browser's spellcheck?
How do I marshal a lambda (Proc) in Ruby?
Ruby Performance


### Extending Jupyter

In [13]:
from IPython.display import HTML as html_print

def style(s, bold = False):
    """Wrap ``s`` in a ``<text>`` element, optionally bold with a highlight.

    Args:
        s: The string (or already-built HTML fragment) to wrap.
        bold: When True, additionally wrap the element in a ``<b>`` tag with
            a yellow background colour.

    Returns:
        The HTML fragment as a string.
    """
    # The element must be closed with </text>; an unclosed tag nests every
    # following fragment inside it.
    blob = f"<text>{s}</text>"
    if bold:
        blob = f"<b style='background-color: #fff59d'>{blob}</b>"
    return blob

def html_generator(g, n = 10):
    """Render up to ``n`` docs from ``g`` as HTML with matched tokens highlighted.

    Args:
        g: An iterable of spaCy docs.
        n: Maximum number of docs to render.

    Returns:
        One HTML string with a ``<br>``-terminated line per doc. Tokens that
        fall inside any match of the module-level ``matcher`` are rendered
        bold via ``style``.
    """
    from itertools import islice

    blob = ""
    # islice stops quietly when `g` runs out, instead of letting next() raise
    # StopIteration out of this function.
    for doc in islice(g, n):
        # One [token, is_matched] pair per token.
        state = [[t, False] for t in doc]
        for _, start, end in matcher(doc):
            # Separate index name so the outer loop is not shadowed.
            for token_idx in range(start, end):
                state[token_idx][1] = True
        blob += style(' '.join([style(str(t[0]), bold = t[1]) for t in state]) + '<br>')
    return blob

In [36]:
titles = (_ for _ in df['Title'])
g = (d for d in nlp.pipe(titles) if len(matcher(d)) > 0)
html_print(html_generator(g, n = 10))

### Adding more languages

In [3]:
import spacy 
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

In [4]:
obj_c_pattern1 = [{'LOWER': 'objective'},
                  {'IS_PUNCT': True, 'OP': '?'},
                  {'LOWER': 'c'}]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]

csharp_pattern1 = [{'LOWER': 'c'}, {'LOWER': '#'}]
csharp_pattern2 = [{'LOWER': 'c'}, {'LOWER': 'sharp'}]
csharp_pattern3 = [{'LOWER': 'c#'}]

fsharp_pattern1 = [{'LOWER': 'f'}, {'LOWER': '#'}]
fsharp_pattern2 = [{'LOWER': 'f'}, {'LOWER': 'sharp'}]
fsharp_pattern3 = [{'LOWER': 'f#'}]
 
dot_net_pattern = [{'LOWER': '.net'}]

php_pattern = [{'LOWER': 'php'}]

asp_net_pattern = [{'LOWER': {'IN': ['asp.net', 'asp']}}]

python_pattern = [{'LOWER': 'python'}]

lisp_pattern1  = [{'LOWER': 'lisp'}]
lisp_pattern2  = [{'LOWER': 'common'}, {'LOWER': 'lisp'}]

go_pattern1    = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]
go_pattern2    = [{'LOWER': 'golang'}]

ruby_pattern   = [{'LOWER': 'ruby'}]

sql_pattern    = [{'LOWER': 'sql'}]

matlab_pattern = [{'LOWER': 'matlab'}]

perl_pattern   = [{'LOWER': 'perl'}]

html_pattern   = [{'LOWER': 'html'}]

css_pattern   = [{'LOWER': 'css'}]

js_pattern     = [{'LOWER': {'IN': ['js', 'javascript']}}]

java_pattern   = [{'LOWER': 'java'}]

c_pattern      = [{'LOWER': 'c'}]

cpp_pattern    = [{'LOWER': 'c++'}]

matcher = Matcher(nlp.vocab)
matcher.add("OBJ_C_LANG", [obj_c_pattern1, obj_c_pattern2])
matcher.add("CSHARP_LANG", [csharp_pattern1, csharp_pattern2, csharp_pattern3])
matcher.add("FSHARP_LANG", [fsharp_pattern1, fsharp_pattern2, fsharp_pattern3])
matcher.add("DOT_NET_LANG", [dot_net_pattern])
matcher.add("PHP_LANG", [php_pattern])
matcher.add("ASP_NET_LANG", [asp_net_pattern])
matcher.add("PYTHON_LANG", [python_pattern])
matcher.add("LISP_LANG", [lisp_pattern1, lisp_pattern2])
matcher.add("GO_LANG", [go_pattern1, go_pattern2])
matcher.add("RUBY_LANG", [ruby_pattern])
matcher.add("SQL_LANG", [sql_pattern])
matcher.add("MATLAB_LANG", [matlab_pattern])
matcher.add("PERL_LANG", [perl_pattern])
matcher.add("HTML_LANG", [html_pattern])
matcher.add("CSS_LANG", [css_pattern])
matcher.add("JS_LANG", [js_pattern])
matcher.add("JAVA_LANG", [java_pattern])
matcher.add("C_LANG", [c_pattern])
matcher.add("CPP_LANG", [cpp_pattern])

## Actual labelling

In [7]:
titles = (_ for _ in df['Title'][:2000])
sum(1 for d in nlp.pipe(titles) if len(matcher(d)) > 0)

647

In [8]:
df.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...


## Language Version

In [9]:
def create_versioned(name):
    """Build Matcher patterns for a language name with an optional version.

    Args:
        name: Lowercase language name, e.g. ``'python'``.

    Returns:
        A list of three token patterns:
        1. the bare name,
        2. a single token made of the name followed by a version (``python3.7``),
        3. the name followed by a separate version token (``python 3.6.6``).
    """
    import re

    # Escape the name so regex metacharacters in it are matched literally.
    prefix = re.escape(name)
    # Raw strings keep `\d` and `\.` as regex escapes rather than (invalid)
    # Python string escapes.
    # NOTE(review): the second `.?` is unescaped and therefore matches any
    # character, not just a dot; left as-is so existing matches do not change.
    return [
        [{'LOWER': name}],
        [{'LOWER': {'REGEX': rf'({prefix}\d+\.?\d*.?\d*)'}}],
        [{'LOWER': name}, {'TEXT': {'REGEX': r'(\d+\.?\d*.?\d*)'}}],
    ]

create_versioned('python')

[[{'LOWER': 'python'}],
 [{'LOWER': {'REGEX': '(python\\d+\\.?\\d*.?\\d*)'}}],
 [{'LOWER': 'python'}, {'TEXT': {'REGEX': '(\\d+\\.?\\d*.?\\d*)'}}]]

In [10]:
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("PYTHON_LANG", [*create_versioned('python')])

for doc in nlp.pipe(['i use python, python3.7, python 3.6.6',
                     "also python3",
                     "I love dogs"]):
    matches = matcher(doc)
    
    if matches:
        print("Matches found in document:", doc.text)
        for match_id, start, end in matches:
            span = doc[start:end]
            print("Match:", span.text)
    else:
        print("No matches found in document:", doc.text)

Matches found in document: i use python, python3.7, python 3.6.6
Match: python
Match: python3.7
Match: python
Match: python 3.6.6
Matches found in document: also python3
Match: python3
No matches found in document: I love dogs


## Putting this in a matcher

In [11]:
obj_c_pattern1 = [{'LOWER': 'objective'},
                  {'IS_PUNCT': True, 'OP': '?'},
                  {'LOWER': 'c'}]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]

csharp_pattern1 = [{'LOWER': 'c'}, {'LOWER': '#'}]
csharp_pattern2 = [{'LOWER': 'c'}, {'LOWER': 'sharp'}]
csharp_pattern3 = [{'LOWER': 'c#'}]

fsharp_pattern1 = [{'LOWER': 'f'}, {'LOWER': '#'}]
fsharp_pattern2 = [{'LOWER': 'f'}, {'LOWER': 'sharp'}]
fsharp_pattern3 = [{'LOWER': 'f#'}]

lisp_pattern1  = [{'LOWER': 'lisp'}]
lisp_pattern2  = [{'LOWER': 'common'}, {'LOWER': 'lisp'}]

go_pattern1    = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]
go_pattern2    = [{'LOWER': 'golang'}]

html_pattern   = [{'LOWER': 'html'}]
css_pattern    = [{'LOWER': 'css'}]
sql_pattern    = [{'LOWER': 'sql'}]
js_pattern     = [{'LOWER': {'IN': ['js', 'javascript']}}]

cpp_pattern    = [{'LOWER': 'c++'}]


versioned_languages = ['ruby', 'php', 'python', 'perl', 'java', 'haskell', 
                       'scala', 'c', 'cpp', 'matlab', 'bash', 'delphi']
flatten = lambda l: [item for sublist in l for item in sublist]
versioned_patterns = flatten([create_versioned(lang) for lang in versioned_languages])

matcher = Matcher(nlp.vocab, validate=True)
matcher.add("PROG_LANG", 
            [obj_c_pattern1, obj_c_pattern2,
            go_pattern1, go_pattern2,
            lisp_pattern1, lisp_pattern2,
            csharp_pattern1, csharp_pattern2, csharp_pattern3,
            fsharp_pattern1, fsharp_pattern2, fsharp_pattern3,
            html_pattern, css_pattern, sql_pattern, js_pattern,
            cpp_pattern, *versioned_patterns])

In [14]:
titles = (_ for _ in df['Title'])
g = (d for d in nlp.pipe(titles) if len(matcher(d)) > 0)
html_print(html_generator(g, n = 60))

 ## Making a statistical model

In [86]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 325.1 kB/s eta 0:02:12
     --------------------------------------- 0.1/42.8 MB 465.5 kB/s eta 0:01:32
     ---------------------------------------- 0.3/42.8 MB 1.8 MB/s eta 0:00:24
     - -------------------------------------- 1.1/42.8 MB 5.0 MB/s eta 0:00:09
     - -------------------------------------- 1.9/42.8 MB 7.1 MB/s eta 0:00:06
     -- ------------------------------------- 2.7/42.8 MB 9.2 MB/s eta 0:00:05
     --- ------------------------------------ 3.7/42.8 MB 10.8 MB/s eta 0:00:04
     ---- ----------------------------------- 4.8/42

In [15]:
nlp = spacy.load("en_core_web_md")
doc = nlp("My name is Rishit and I was born on 10th November 2002. \
I work at Google from Hyderabad. I just brought a guitar \
cost $1000 on amazon and I will get is services here for 20 Euro a year.")

In [18]:
displacy.render(doc, style = 'ent')

In [19]:
[(e, type(e)) for e in doc.ents]

[(Rishit, spacy.tokens.span.Span),
 (10th November 2002, spacy.tokens.span.Span),
 (Google, spacy.tokens.span.Span),
 (Hyderabad, spacy.tokens.span.Span),
 (1000, spacy.tokens.span.Span),
 (amazon, spacy.tokens.span.Span),
 (20 Euro, spacy.tokens.span.Span)]

In [20]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x208508fb470>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x208508fae70>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2085a6ba5e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x20850f821d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x20850f7d850>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2085a6ba420>)]

In [21]:
doc = nlp("I do code with  datastuff using python and golang .")

for idx, start, end in matcher(doc):
    print(doc[start: end])

python
golang


In [22]:
def parse_train_data(text):
    """Turn ``text`` into a ``(text, {'entities': [...]})`` training tuple.

    The text is run through the module-level ``nlp`` pipeline, and every match
    of the module-level ``matcher`` becomes a
    ``(start_char, end_char, "PROLANG")`` entity.
    """
    doc = nlp(text)
    detections = []
    for _, start, end in matcher(doc):
        span = doc[start:end]
        detections.append((span.start_char, span.end_char, "PROLANG"))
    return (doc.text, {'entities': detections})

parse_train_data("I like python, javascript and golang")

('I like python, javascript and golang',
 {'entities': [(7, 13, 'PROLANG'), (15, 25, 'PROLANG'), (30, 36, 'PROLANG')]})

In [23]:
titles = (_ for _ in df['Title'][:10000])

TRAIN_DATA = [parse_train_data(d) for d in nlp.pipe(titles) if len(matcher(d)) == 1]


In [24]:
TRAIN_DATA[5:15]

[('How to set up unit testing for Visual Studio C++',
  {'entities': [(45, 48, 'PROLANG')]}),
 ('How do you pack a visual studio c++ project for release?',
  {'entities': [(32, 35, 'PROLANG')]}),
 ('How do you get leading wildcard full-text searches to work in SQL Server?',
  {'entities': [(62, 65, 'PROLANG')]}),
 ('How do I Transform Sql Columns into Rows?',
  {'entities': [(19, 22, 'PROLANG')]}),
 ('How do I run Rake tasks within a Ruby script?',
  {'entities': [(33, 37, 'PROLANG')]}),
 ('What code analysis tools do you use for your Java projects?',
  {'entities': [(45, 49, 'PROLANG')]}),
 ('What program can I use to generate diagrams of SQL view/table structure?',
  {'entities': [(47, 50, 'PROLANG')]}),
 ('How to easily consume a web service from PHP',
  {'entities': [(41, 44, 'PROLANG')]}),
 ('How can I create Prototype Methods (like JavaScript) in C#.Net?',
  {'entities': [(41, 51, 'PROLANG')]}),
 ('How can I Java webstart multiple, dependent, native libraries?',
  {'entities': [(

## Training loop

In [25]:
def create_blank_nlp(train_data):
    """Create a blank English pipeline with an NER component for ``train_data``.

    Every entity label found in the ``"entities"`` annotations of
    ``train_data`` (third element of each entity tuple) is registered on the
    NER component before the pipeline is returned.
    """
    nlp = spacy.blank("en")
    # add_pipe returns the component it created.
    ner = nlp.add_pipe("ner", last = True)
    labels = (
        ent[2]
        for _, annotations in train_data
        for ent in annotations.get("entities")
    )
    for label in labels:
        ner.add_label(label)
    return nlp

In [26]:
import random
import datetime as dt
from spacy.training import Example

# Start from a blank English pipeline whose only component is an NER with the
# labels found in TRAIN_DATA.
nlp = create_blank_nlp(TRAIN_DATA)
optimizer = nlp.initialize()
# 20 passes over the data, updating on one example at a time.
for i in range(20):
    # Shuffles TRAIN_DATA in place; no seed is set, so runs are not reproducible.
    random.shuffle(TRAIN_DATA)
    # Fresh dict each pass, so the printed loss is the total for this pass only.
    losses = {}
    for text, annotations in TRAIN_DATA:
        # make_doc only tokenizes; the annotations supply the gold entities.
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], sgd = optimizer, losses = losses)
    print(f"Losses at iteration {i}- ", losses)

Losses at iteration 0-  {'ner': 225.44078492667134}
Losses at iteration 1-  {'ner': 26.90905310859491}
Losses at iteration 2-  {'ner': 45.061665136676815}
Losses at iteration 3-  {'ner': 13.228478982850836}
Losses at iteration 4-  {'ner': 16.677624235369365}
Losses at iteration 5-  {'ner': 13.19473227026368}
Losses at iteration 6-  {'ner': 20.888658578322666}
Losses at iteration 7-  {'ner': 3.8586864287940363}
Losses at iteration 8-  {'ner': 14.589132309745972}
Losses at iteration 9-  {'ner': 5.250699636797989e-09}
Losses at iteration 10-  {'ner': 3.633097586178621e-08}
Losses at iteration 11-  {'ner': 6.80208205813465e-09}
Losses at iteration 12-  {'ner': 2.924298892714287e-11}
Losses at iteration 13-  {'ner': 5.5463293470254855e-12}
Losses at iteration 14-  {'ner': 8.552667547657781e-09}
Losses at iteration 15-  {'ner': 1.0505773952379361e-13}
Losses at iteration 16-  {'ner': 2.1166919606623644e-13}
Losses at iteration 17-  {'ner': 7.189936574291887e-14}
Losses at iteration 18-  {'ne

## Improvements

In [27]:
from spacy.util import minibatch, compounding

In [28]:
# Retrain from scratch, this time with mini-batches and dropout.
nlp = create_blank_nlp(TRAIN_DATA)
optimizer = nlp.initialize()

for i in range(20):
    # Reshuffle every pass so batches differ between epochs (same as the
    # single-example loop above).
    random.shuffle(TRAIN_DATA)
    # Fresh dict each pass, so the printed loss is the total for this pass only.
    losses = {}
    # Batch size compounds from 4 towards 32 by a factor of 1.001 per batch.
    batches = minibatch(TRAIN_DATA, size = compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Each batch item is a (text, annotations) tuple.
        examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in batch]
        nlp.update(examples, sgd = optimizer, drop=0.5, losses=losses)
    print(f"Losses at iteration {i}- ", losses)

Losses at iteration 0-  {'ner': 1111.5845977585554}
Losses at iteration 1-  {'ner': 97.80233677816638}
Losses at iteration 2-  {'ner': 58.44492731422252}
Losses at iteration 3-  {'ner': 40.42738138973332}
Losses at iteration 4-  {'ner': 19.45584807188985}
Losses at iteration 5-  {'ner': 22.611650307013107}
Losses at iteration 6-  {'ner': 17.237374274159958}
Losses at iteration 7-  {'ner': 15.693540128247996}
Losses at iteration 8-  {'ner': 14.820190926588745}
Losses at iteration 9-  {'ner': 11.505718989318787}
Losses at iteration 10-  {'ner': 13.805778245967415}
Losses at iteration 11-  {'ner': 7.2373613230684555}
Losses at iteration 12-  {'ner': 2.3013719148178358}
Losses at iteration 13-  {'ner': 12.920565128393418}
Losses at iteration 14-  {'ner': 23.70559080849362}
Losses at iteration 15-  {'ner': 16.806681036538933}
Losses at iteration 16-  {'ner': 0.003785490573719111}
Losses at iteration 17-  {'ner': 7.409474430642477}
Losses at iteration 18-  {'ner': 10.318084794713641}
Losses 

In [29]:
nlp.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2086cc93e60>)]

In [31]:
doc = nlp("I write code in python and go")
displacy.render(doc, style = "ent")

## Entity Ruler

In [33]:
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

In [50]:
def create_patterns():
    """Return every token pattern used to recognise a programming language.

    The result is the versioned patterns (three per language, built by
    ``create_versioned``) followed by the hand-written patterns for languages
    whose names need special handling.
    """
    versioned_patterns = [
        pattern
        for lang in ('ruby', 'php', 'python', 'perl', 'java', 'haskell',
                     'scala', 'c', 'cpp', 'matlab', 'bash', 'delphi')
        for pattern in create_versioned(lang)
    ]

    lang_patterns = [
        # Objective-C: "objective c", "objective-c", "objectivec"
        [{'LOWER': 'objective'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'c'}],
        [{'LOWER': 'objectivec'}],
        # C#: "c #", "c sharp", "c#"
        [{'LOWER': 'c'}, {'LOWER': '#'}],
        [{'LOWER': 'c'}, {'LOWER': 'sharp'}],
        [{'LOWER': 'c#'}],
        # F#: "f #", "f sharp", "f#"
        [{'LOWER': 'f'}, {'LOWER': '#'}],
        [{'LOWER': 'f'}, {'LOWER': 'sharp'}],
        [{'LOWER': 'f#'}],
        # Lisp
        [{'LOWER': 'lisp'}],
        [{'LOWER': 'common'}, {'LOWER': 'lisp'}],
        # Go: plain "go" only counts when it is not tagged as a verb
        [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}],
        [{'LOWER': 'golang'}],
        # Single-token names
        [{'LOWER': 'html'}],
        [{'LOWER': 'css'}],
        [{'LOWER': 'sql'}],
        [{'LOWER': {'IN': ['js', 'javascript']}}],
        [{'LOWER': 'c++'}],
    ]

    return versioned_patterns + lang_patterns

# Rule-based alternative: a blank English pipeline whose only component is an
# entity ruler loaded with the PROLANG patterns.
nlp = English()
# add_pipe creates the ruler and returns it; constructing a separate
# EntityRuler(nlp) first would leave an unused object that is not in the pipeline.
# NOTE(review): English() has no tagger, so the `go` pattern's POS check
# presumably never sees 'VERB' here and cannot exclude the verb "go" -- confirm.
ruler = nlp.add_pipe('entity_ruler')
ruler.add_patterns([{'label': 'PROLANG', 'pattern': p} for p in create_patterns()])

doc = nlp("Golang is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Golang', 'PROLANG')]


()