In [25]:
""" This module uses the spaCy library to detect what type of programming language
is being discussed using the titles of threads on stackoverflow.

dataset: https://www.kaggle.com/stackoverflow/stacksample/data
"""

' This module uses the spaCy library to detect what type of programming language\nis being discussed using the titles of threads on stackoverflow.\n\ndataset: https://www.kaggle.com/stackoverflow/stacksample/data\n'

In [3]:
import itertools
import os

import pandas as pd
import spacy

In [6]:
# Optional GPU support: `pip install spacy[cuda]`, then enable it with
# `spacy.require_gpu()`. Left disabled here, so the pipeline runs on CPU.
nlp = spacy.load(name='en_core_web_sm')

In [9]:
# Read only the `Id` and `Title` columns of the first million questions,
# decoding the file as ISO-8859-1.
df = pd.read_csv(
    'prog_lang_detecting_data/Questions.csv',
    nrows=1_000_000, usecols=['Title', 'Id'],
    encoding='ISO-8859-1'
)

# Cheap substring pre-filter so the spaCy pipeline only sees titles that could
# possibly mention Go. `na=False` makes a missing title count as "no match";
# without it the mask would contain NaN and the boolean indexing would raise.
titles = df.loc[
    lambda d: d['Title'].str.lower().str.contains('go', regex=False, na=False),
    'Title',
].tolist()

In [10]:
def has_golang(doc):
    """Return True if `doc` mentions Go as the object of a preposition.

    A token counts when its lowercase form is 'go' or 'golang', its
    part-of-speech tag is anything other than 'VERB', and its dependency
    label is 'pobj' (e.g. "pointers in Go").

    Args:
        doc: An iterable of tokens exposing `lower_`, `pos_` and `dep_`
            (as iterated here, e.g. a parsed spaCy document).

    Returns:
        bool: True if at least one token satisfies all three conditions.
    """
    return any(
        tok.lower_ in ('go', 'golang')
        and tok.pos_ != 'VERB'
        and tok.dep_ == 'pobj'
        for tok in doc
    )

In [13]:
# Lazily parse the pre-filtered titles and keep only those where Go appears as
# the object of a preposition, then preview the first five matches.
# `islice` simply stops early if fewer than five titles match, whereas calling
# `next()` five times would raise StopIteration in that case.
g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
x = list(itertools.islice(g, 5))
x

[Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go]

In [24]:
# Load the tag table and collect the `Id` values of every row whose `Tag` is
# exactly 'go'.
df_tags = pd.read_csv('prog_lang_detecting_data/Tags.csv')
go_ids = df_tags.loc[df_tags['Tag'] == 'go', 'Id']

def has_go_token(doc):
    """Return True if any token in `doc` is 'go' or 'golang' (case-insensitive).

    Unlike a plain substring search, this only matches whole tokens, and it
    applies no part-of-speech or dependency constraints.

    Args:
        doc: An iterable of tokens exposing a `lower_` attribute.

    Returns:
        bool: True if at least one token's lowercase form is 'go' or 'golang'.
    """
    return any(tok.lower_ in ('go', 'golang') for tok in doc)

# Titles of every loaded question whose Id carries the `go` tag, and the subset
# of those in which a 'go'/'golang' token actually appears.
all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids), 'Title'].tolist()
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

# Titles of questions NOT tagged `go` that still contain the substring 'go'.
# `na=False` treats a missing title as "no match" so the mask stays boolean.
# Kept under its own name so `non_detectable` is only ever bound to the final
# filtered list.
non_go_candidates = df.loc[
    lambda d: (~d['Id'].isin(go_ids))
    & d['Title'].str.lower().str.contains('go', regex=False, na=False),
    'Title',
].tolist()

# Despite the name, this holds titles of questions without the `go` tag in
# which a 'go'/'golang' token is nevertheless found, i.e. false positives of
# the token test.
non_detectable = [d.text for d in nlp.pipe(non_go_candidates) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1167, 762, 1345)