# 01

In [1]:
import pandas as pd

# Read only the Id and Title columns of (at most) the first 1,000,000 rows.
df = pd.read_csv(
    "Questions.csv",
    usecols=['Title', 'Id'],
    encoding="ISO-8859-1",
    nrows=1_000_000,
)
# Every title as a plain Python list, in file order.
titles = list(df['Title'])

In [2]:
def has_golang(text):
    """Return True when the exact substring " go " occurs in text.

    The check is case-sensitive and needs a space on both sides of "go".
    """
    needle = " go "
    return needle in text

# Lazily keep only the titles accepted by the substring check ...
g = filter(has_golang, titles)
# ... and show the first two of them.
[next(g) for _ in range(2)]

['Where does Console.WriteLine go in ASP.NET?',
 'Should try...catch go inside or outside a loop?']

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     ---------------------------------------- 0.1/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 3.3 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 7.4 MB/s eta 0:00:02
     ----------- ---------------------------- 3.8/12.8 MB 16.1 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 27.2 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 81.8 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 81.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 59.4 MB/s eta 0:00:00
[38;5;2m[+] Download and installat

In [4]:
import spacy

# Load the en_core_web_sm pipeline; no components are disabled here.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Run a sample sentence through the pipeline and show its tokens.
list(nlp("My name is Rishit."))

[My, name, is, Rishit, .]

In [6]:
# Keep the parsed sentence as `doc`; the displaCy cell below renders it.
doc = nlp("My name is Rishit.")
# Indexing the parsed document gives a single token; inspect its type.
t = doc[0]
type(t)

spacy.tokens.token.Token

In [7]:
from spacy import displacy

# Visualise the parsed sentence held in `doc` with displaCy.
displacy.render(doc)

In [8]:
# Look up the human-readable description of the "poss" label.
spacy.explain("poss")

'possession modifier'

In [9]:
# Show each token of a sample title with its part-of-speech tag and dependency label.
sentence = 'Where does Console.WriteLine go in ASP.NET?'
for t in nlp(sentence):
    print(t, t.pos_, t.dep_)

Where SCONJ advmod
does VERB ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


## Detecting Golang

In [10]:
# Reload a larger sample: Id and Title of (at most) the first 2,000,000 rows.
df = pd.read_csv(
    "Questions.csv",
    nrows=2_000_000,
    encoding="ISO-8859-1",
    usecols=['Title', 'Id'],
)

# Cheap pre-filter ahead of the spaCy pass: keep titles whose lower-cased text
# contains "go" anywhere (so words such as "google" get through as well).
# regex=False makes this a literal substring test; na=False counts a missing
# title as "no match" so the mask never holds NaN, which .loc refuses to use.
contains_go = df['Title'].str.lower().str.contains("go", regex=False, na=False)
titles = df.loc[contains_go, 'Title'].to_list()

In [11]:
# Reload the pipeline with the 'ner' component disabled; the check in the next
# cell only reads token text, part-of-speech tags and dependency labels.
nlp = spacy.load("en_core_web_sm", disable = ['ner'])

In [12]:
%%time

def has_golang(doc):
    """Tell whether doc contains a Go mention attached as a 'pobj'.

    A token qualifies when its lower-cased text is 'go' or 'golang', its
    part-of-speech tag is not 'VERB' and its dependency label is 'pobj'.
    Returns True as soon as one token qualifies, otherwise False.
    """
    return any(
        t.lower_ in ['go', 'golang'] and t.pos_ != 'VERB' and t.dep_ == "pobj"
        for t in doc
    )

# Stream the titles through the pipeline, keep the docs has_golang accepts,
# and display the first ten of them.
g = filter(has_golang, nlp.pipe(titles))
[next(g) for _ in range(10)]

CPU times: total: 4.28 s
Wall time: 4.39 s


[How do I disable multiple listboxes in one go using jQuery?,
 Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 Trouble reading from a socket in go,
 Convert string to integer type in Go?,
 Implementing the âdeferâ statement from Go in Objective-C?,
 what's the state of go language IDE support?]

In [13]:
# Visualise the parse of a title that ends in the phrase "in one go".
displacy.render(nlp('Removing all event handlers in one go'))

In [14]:
# Look up the human-readable description of the 'csubj' dependency label.
spacy.explain('csubj')

'clausal subject'

## Iteration

In [15]:
# Ids of all rows in Tags.csv whose tag is exactly 'go'.
df_tags = pd.read_csv('Tags.csv')
is_go_tag = df_tags['Tag'] == 'go'
go_ids = df_tags.loc[is_go_tag, 'Id']

def has_go_token(doc):
    """Return True if any token of doc, lower-cased, is exactly 'go' or 'golang'."""
    return any(t.lower_ in ['go', 'golang'] for t in doc)

# Titles of the loaded questions whose Id carries the 'go' tag.
is_go_question = df['Id'].isin(go_ids)
all_go_sentences = df.loc[is_go_question, 'Title'].to_list()
# Positives: go-tagged titles that really contain a 'go'/'golang' token.
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

# Candidate negatives: titles of untagged questions whose lower-cased text
# still contains "go". regex=False keeps this a literal substring test and
# na=False maps a missing title to False, so the mask is purely boolean
# (a mask holding NaN makes .loc raise).
mentions_go = df['Title'].str.lower().str.contains("go", regex=False, na=False)
other_go_titles = df.loc[~is_go_question & mentions_go, 'Title'].to_list()

# Negatives: untagged titles that nevertheless contain a 'go'/'golang' token.
non_detectable = [d.text for d in nlp.pipe(other_go_titles) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1858, 1208, 1696)

In [18]:
# Pipeline being evaluated, loaded with 'ner' disabled; model_name is also
# written into the result string at the end of this cell.
model_name = 'en_core_web_sm'
model = spacy.load(model_name, disable = ['ner'])

def has_go_token(doc):
    """Return True if doc has a 'go' or 'golang' token that is not a verb.

    Token text is compared lower-cased; a token counts only when its
    part-of-speech tag differs from 'VERB'.
    """
    return any(t.lower_ in ['go', 'golang'] and t.pos_ != 'VERB' for t in doc)

# Label for the rule being scored; it ends up in the result string below.
method = "not-verb"

# has_go_token returns a bool and True counts as 1, so each sum is the number
# of titles flagged in that list.
correct = sum(map(has_go_token, model.pipe(detectable)))      # flagged titles from `detectable`
wrong = sum(map(has_go_token, model.pipe(non_detectable)))    # flagged titles from `non_detectable`

n_detectable = len(detectable)
n_non_detectable = len(non_detectable)

precision = correct / (correct + wrong)
recall = correct / n_detectable
# Right decisions: flagged `detectable` titles plus unflagged `non_detectable` titles.
accuracy = (correct + n_non_detectable - wrong) / (n_detectable + n_non_detectable)

f"{precision}, {recall}, {accuracy}, {model_name}, {method}"

'0.9098922624877571, 0.7690397350993378, 0.8722451790633609, en_core_web_sm, not-verb'

# 02 Detecting more languages

In [19]:
import spacy
import pandas as pd
from spacy import displacy

In [20]:
# Load the pipeline for this part of the notebook; no components are disabled.
nlp = spacy.load('en_core_web_sm')

In [21]:
def has_go_token(doc):
    """Return True if doc has a non-verb token equal to one of the listed names.

    Token text is compared lower-cased against the list below, and a token
    counts only when its part-of-speech tag is not 'VERB'. The comparison is
    per token, so 'objective-c' only fires when it arrives as ONE token.
    """
    languages = ['go', 'golang', 'python', 'ruby', 'objective-c']
    return any(t.lower_ in languages and t.pos_ != 'VERB' for t in doc)

In [23]:
# Try the token-level check on a sentence that names Objective-C.
doc = nlp('i like to program in objective-c')
has_go_token(doc)

False

In [24]:
# Show how the sentence in `doc` was split into tokens.
list(doc)

[i, like, to, program, in, objective, -, c]

In [35]:
# "objective", then one punctuation token, then "c" (all compared lower-cased).
obj_c_pattern = [
    {'LOWER': 'objective'},
    {'IS_PUNCT': True},
    {'LOWER': 'c'},
]

# A single token "go" or "golang" whose part-of-speech tag is not VERB.
golang_pattern = [
    {
        'LOWER': {'IN': ['go', 'golang']},
        'POS': {'NOT_IN': ['VERB']},
    },
]

In [38]:
from spacy.matcher import Matcher

# Matcher bound to the pipeline's vocabulary; each label gets a list of
# alternative token patterns.
matcher = Matcher(nlp.vocab)
for label, patterns in (
    ("OBJ_C_LANG", [obj_c_pattern]),
    ("GOLANG_LANG", [golang_pattern]),
):
    matcher.add(label, patterns)

In [39]:
# Print the span of every match found in a sentence naming two languages.
doc = nlp("I code both in golang as well as objective-c")
matches = matcher(doc)
for match_id, start, end in matches:
    hit = doc[start:end]
    print(hit)

golang
objective-c


In [34]:
# Slice tokens 5, 6 and 7 out of `doc` by hand.
doc[5:8]

objective-c

In [40]:
# Cleaned-up code

In [53]:
from spacy.matcher import Matcher

# Objective-C: "objective" + optional punctuation token + "c", or the single
# token "objectivec".
obj_c_pattern_1 = [
    {'LOWER': 'objective'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'c'},
]
obj_c_pattern_2 = [{'LOWER': 'objectivec'}]

# Go: "golang" always counts; a bare "go" only when it is not tagged as a verb.
golang_pattern_1 = [{'LOWER': 'golang'}]
golang_pattern_2 = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]

python_pattern = [{'LOWER': 'python'}]
ruby_pattern = [{'LOWER': 'ruby'}]
js_pattern = [{'LOWER': {'IN': ['js', 'javascript']}}]

# Register every label with its alternative patterns, in a fixed order.
matcher = Matcher(nlp.vocab)
for label, patterns in [
    ("OBJ_C_LANG", [obj_c_pattern_1, obj_c_pattern_2]),
    ("GOLANG_LANG", [golang_pattern_1, golang_pattern_2]),
    ("PYTHON_LANG", [python_pattern]),
    ("JS_LANG", [js_pattern]),
    ("RUBY_LANG", [ruby_pattern]),
]:
    matcher.add(label, patterns)

In [54]:
# Print the span of every match found in a sentence naming several languages.
doc = nlp("I code in both python, go/golang as well as objective-c")
matches = matcher(doc)
for match_id, start, end in matches:
    hit = doc[start:end]
    print(hit)

python
golang
objective-c


In [55]:
# Same check on a sentence where "go" is followed by "programming".
doc = nlp("I've done some js and ruby and go programming")
matches = matcher(doc)
for match_id, start, end in matches:
    hit = doc[start:end]
    print(hit)

js
ruby


In [56]:
# Pair every token of `doc` with its part-of-speech tag.
[(t, t.pos_) for t in doc]

[(I, 'PRON'),
 ('ve, 'AUX'),
 (done, 'VERB'),
 (some, 'DET'),
 (js, 'ADJ'),
 (and, 'CCONJ'),
 (ruby, 'PROPN'),
 (and, 'CCONJ'),
 (go, 'VERB'),
 (programming, 'NOUN')]

## Benchmarking

In [57]:
import pandas as pd

# Id and Title of (at most) the first 1,000,000 rows, used for benchmarking.
df = pd.read_csv(
    "Questions.csv",
    usecols=['Title', 'Id'],
    encoding='ISO-8859-1',
    nrows=1_000_000,
)

In [58]:
# Lazy stream of the titles whose lower-cased text contains "objective".
titles = (title for title in df['Title'] if "objective" in title.lower())

In [59]:
# Print the titles (out of at most 200 taken from `titles`) in which the
# matcher finds nothing. Pairing with range(200) caps the run at 200 titles,
# and zip simply stops early when `titles` runs dry, whereas calling
# next(titles) on an exhausted generator would abort the cell with
# StopIteration. range comes first so no extra title is consumed at the end.
for i, title in zip(range(200), titles):
    doc = nlp(title)
    if not matcher(doc):
        print(doc)

Having to set objectives for developers, even though objectives don't work
How can i connect MySQL database with objective project?
Downloading multiple files in iphone app(Objective c)
Including Objective C++ Type in C++ Class Definition
Storing UIImages with ObjectiveRecord and ObjectiveSync
__OBJC__ equivalent for Objective-C++
iPhone Device/Simulator memory oddities using Objective-C++
How well is Objective-J documented? Is the documentation good enough to start using it seriously?


# 03 

In [60]:
# Stream all titles through the pipeline, keep the docs with at least one
# match, and print the first ten.
titles = iter(df['Title'])
g = (d for d in nlp.pipe(titles) if matcher(d))
for i in range(10):
    print(next(g))

How do I run Rake tasks within a Ruby script?
How can I create Prototype Methods (like JavaScript) in C#.Net?
Some kind of task manager for JavaScript in Firefox 3?
Best practices for managing and deploying large JavaScript apps
Create an encrypted ZIP file in Python
Code to ask yes/no question in javascript
Executing JavaScript from Flex: Is this javascript function dangerous?
What Javascript rich text editor will not break the browser's spellcheck?
How do I marshal a lambda (Proc) in Ruby?
Ruby Performance


### Extending Jupyter

In [63]:
from IPython.display import HTML as html_print

def style(s, bold = False):
    """Wrap s in markup for display, optionally highlighted.

    Args:
        s: Text or an already-built HTML fragment. It is inserted as-is,
            without escaping, so callers can nest fragments.
        bold: When True, the result is additionally wrapped in a <b> element
            with a #fff59d background colour.

    Returns:
        The resulting HTML fragment as a string.
    """
    # The wrapper element has to be closed; emitting a second opening tag
    # would leave every fragment nested inside the previous one.
    blob = f"<text>{s}</text>"
    if bold:
        # The property must be spelled "background-color" for the browser to
        # apply the highlight.
        blob = f"<b style='background-color: #fff59d'>{blob}</b>"
    return blob

def html_generator(g, n = 10):
    """Build HTML that shows up to n docs with their matched tokens highlighted.

    Args:
        g: Iterator (or other iterable) of parsed docs.
        n: Maximum number of docs to take from g.

    Returns:
        One HTML string with a <br>-terminated line per doc. Tokens covered by
        any match of the notebook-level `matcher` are rendered via
        style(..., bold = True).
    """
    import html
    from itertools import islice

    blob = ""
    # islice ends quietly when g holds fewer than n docs; calling next(g) on
    # an exhausted generator would raise StopIteration instead.
    for doc in islice(g, n):
        # One flag per token: True once any match span covers that position.
        highlighted = [False] * len(doc)
        for _, start, end in matcher(doc):
            for pos in range(start, end):
                highlighted[pos] = True
        # Titles are arbitrary user text, so each token is escaped before it
        # is embedded: characters such as < and & must be shown, not parsed.
        words = [
            style(html.escape(str(tok)), bold = flag)
            for tok, flag in zip(doc, highlighted)
        ]
        blob += style(' '.join(words) + '<br>')
    return blob

In [64]:
# Render the first ten titles with at least one match, highlights included.
titles = iter(df['Title'])
g = (d for d in nlp.pipe(titles) if matcher(d))
html_print(html_generator(g, n = 10))

### Adding more languages

In [65]:
import spacy 
from spacy.matcher import Matcher

# Load the pipeline again; the matcher created in the next cell is built on
# this pipeline's vocabulary.
nlp = spacy.load("en_core_web_sm")

In [66]:
# Objective-C: "objective" + optional punctuation token + "c", or the single
# token "objectivec".
obj_c_pattern1 = [
    {'LOWER': 'objective'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'c'},
]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]

# C#: "c" followed by "#" or by "sharp", or the single token "c#".
csharp_pattern1 = [{'LOWER': 'c'}, {'LOWER': '#'}]
csharp_pattern2 = [{'LOWER': 'c'}, {'LOWER': 'sharp'}]
csharp_pattern3 = [{'LOWER': 'c#'}]

# F#: same three spellings as C#.
fsharp_pattern1 = [{'LOWER': 'f'}, {'LOWER': '#'}]
fsharp_pattern2 = [{'LOWER': 'f'}, {'LOWER': 'sharp'}]
fsharp_pattern3 = [{'LOWER': 'f#'}]

dot_net_pattern = [{'LOWER': '.net'}]
php_pattern = [{'LOWER': 'php'}]
asp_net_pattern = [{'LOWER': {'IN': ['asp.net', 'asp']}}]
python_pattern = [{'LOWER': 'python'}]

lisp_pattern1 = [{'LOWER': 'lisp'}]
lisp_pattern2 = [{'LOWER': 'common'}, {'LOWER': 'lisp'}]

# Go: a bare "go" only when it is not tagged as a verb; "golang" always.
go_pattern1 = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]
go_pattern2 = [{'LOWER': 'golang'}]

ruby_pattern = [{'LOWER': 'ruby'}]
sql_pattern = [{'LOWER': 'sql'}]
matlab_pattern = [{'LOWER': 'matlab'}]
perl_pattern = [{'LOWER': 'perl'}]
html_pattern = [{'LOWER': 'html'}]
css_pattern = [{'LOWER': 'css'}]
js_pattern = [{'LOWER': {'IN': ['js', 'javascript']}}]
java_pattern = [{'LOWER': 'java'}]
c_pattern = [{'LOWER': 'c'}]
cpp_pattern = [{'LOWER': 'c++'}]

# Label -> alternative patterns, registered on the matcher in this order.
language_patterns = [
    ("OBJ_C_LANG", [obj_c_pattern1, obj_c_pattern2]),
    ("CSHARP_LANG", [csharp_pattern1, csharp_pattern2, csharp_pattern3]),
    ("FSHARP_LANG", [fsharp_pattern1, fsharp_pattern2, fsharp_pattern3]),
    ("DOT_NET_LANG", [dot_net_pattern]),
    ("PHP_LANG", [php_pattern]),
    ("ASP_NET_LANG", [asp_net_pattern]),
    ("PYTHON_LANG", [python_pattern]),
    ("LISP_LANG", [lisp_pattern1, lisp_pattern2]),
    ("GO_LANG", [go_pattern1, go_pattern2]),
    ("RUBY_LANG", [ruby_pattern]),
    ("SQL_LANG", [sql_pattern]),
    ("MATLAB_LANG", [matlab_pattern]),
    ("PERL_LANG", [perl_pattern]),
    ("HTML_LANG", [html_pattern]),
    ("CSS_LANG", [css_pattern]),
    ("JS_LANG", [js_pattern]),
    ("JAVA_LANG", [java_pattern]),
    ("C_LANG", [c_pattern]),
    ("CPP_LANG", [cpp_pattern]),
]

matcher = Matcher(nlp.vocab)
for label, patterns in language_patterns:
    matcher.add(label, patterns)

## Actual labelling

In [67]:
# How many of the first 2000 titles contain at least one match.
titles = iter(df['Title'][:2000])
sum(bool(matcher(d)) for d in nlp.pipe(titles))

647

In [68]:
# Peek at the first rows of the questions frame.
df.head()

Unnamed: 0,Id,Title
0,80,SQLStatement.execute() - multiple queries in o...
1,90,Good branching and merging tutorials for Torto...
2,120,ASP.NET Site Maps
3,180,Function for creating color wheels
4,260,Adding scripting functionality to .NET applica...
