In [39]:
from misc import loadProperties, loadWEKA

# Known-location entries; iterated later to build the phrase-matcher patterns.
props = loadProperties('submitActionClass.properties')

# NOTE(review): limit=0 presumably means "no row limit" — confirm in misc.loadWEKA.
data, attr = loadWEKA('youTubeLocationIDWeka.csv', limit=0)

# Bounds of the fixed 800-row sample that the rest of the notebook works on.
SAMPLE_START = 11000
SAMPLE_STOP = 11800
data2 = data[SAMPLE_START:SAMPLE_STOP]

Adjusting spaCy's pipeline

In [76]:
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")

# Known locations taken from the .properties file (one entry per item yielded by `props`).
locations = list(props)

# The matcher below compares only the LOWER attribute, so pattern Docs need
# tokenization and nothing else. nlp.make_doc runs just the tokenizer, which is
# cheaper than pushing every pattern through the full pipeline with nlp.pipe.
location_patterns = [nlp.make_doc(text) for text in locations]

# Case-insensitive phrase matcher over the known locations.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add("LOCATION", None, *location_patterns)

# Define the custom component
def location_component(doc):
    """Label known-location phrases in ``doc`` as ``LOCATION`` entities.

    Runs the module-level ``matcher`` over ``doc`` and overwrites ``doc.ents``
    with one ``LOCATION`` span per retained match.

    Args:
        doc: The spaCy ``Doc`` handed over by the pipeline.

    Returns:
        The same ``doc`` object, with ``doc.ents`` replaced.
    """
    # Local import keeps this component self-contained.
    from spacy.util import filter_spans

    # Create a Span labelled 'LOCATION' for each raw match.
    spans = [Span(doc, start, end, label="LOCATION") for _, start, end in matcher(doc)]

    # Phrase matches can overlap (e.g. one known location contained in another),
    # and assigning overlapping spans to doc.ents raises ValueError. filter_spans
    # drops duplicates/overlaps, preferring the longest span.
    doc.ents = filter_spans(spans)
    return doc

# Add the component to the pipeline before the 'ner' component, so it runs ahead of 'ner'
nlp.add_pipe(location_component, before='ner')

In [81]:
def nlpLocation(string):
    """Look up location mentions in ``string`` with the spaCy pipeline.

    Returns:
        A list holding the lower-cased text of every entity whose label is
        ``GPE`` or ``LOCATION``, in the order the entities are reported.
    """
    wanted_labels = ("GPE", "LOCATION")
    return [
        entity.text.lower()
        for entity in nlp(string).ents
        if entity.label_ in wanted_labels
    ]

def matchLocationV3(item_original):
    """Return a copy of ``item_original`` with a list of detected locations appended.

    The fields at indexes 2, 3 and 4 are each stripped of apostrophes and passed
    to ``nlpLocation``; every hit is collected into one list (duplicates are
    kept), which is appended as a new last element of the copy.

    Args:
        item_original: A list-like record. It is not modified.

    Returns:
        A shallow copy of the record with the list of matched locations
        appended. The list is empty when no field has text or nothing matched.
    """
    item = item_original.copy()
    locations = []

    # Field indexes, annotated in the original code as title (2), tags (3) and
    # description (4) — NOTE(review): confirm against the dataset columns.
    for field_index in (2, 3, 4):
        text = item[field_index]
        # Check each field on its own: a record with one populated field and
        # another missing (None) field must not fail on `.replace`.
        if text:
            locations += nlpLocation(text.replace("'", ""))

    item.append(locations)
    return item

# matchLocationV3(data2[405])

In [69]:
def removeDuplicateLocation(item_original):
    """Return a copy of ``item_original`` whose element 7 has duplicates removed.

    Args:
        item_original: A list-like record whose element at index 7 is an
            iterable of hashable values. It is not modified.

    Returns:
        A shallow copy of the record in which element 7 is a new list holding
        each distinct value once, in first-seen order.
    """
    item = item_original.copy()
    # dict.fromkeys keeps insertion order, so the result is deterministic;
    # list(set(...)) can order strings differently from one run to the next.
    item[7] = list(dict.fromkeys(item[7]))
    return item

In [72]:
def printItem(item_original):
    """Debug helper: print the item followed by a blank line, then return it unchanged."""
    print(item_original, end="\n\n")
    return item_original

In [82]:
def countStats(data):
    """Print summary counts for a sequence of processed items.

    Reports three figures: the total number of items, how many have a truthy
    value at index 2, 3 or 4, and how many have a non-empty element at index 7.

    Args:
        data: A sized sequence of list-like records.
    """
    total = len(data)
    analyzable = sum(1 for row in data if row[2] or row[3] or row[4])
    matched = sum(1 for row in data if len(row[7]) > 0)

    print(total, "items were processed in total.")
    print(analyzable, "of them had title,description or tags to analyze.")
    print(matched, "out of", analyzable, "were matched with potential location")

In [84]:
from customPipeline import Pipe
# Assemble the processing pipeline: detect locations first, then de-duplicate them.
# (printItem can be registered as a further stage to dump each item while debugging.)
pl = Pipe()
for stage in (matchLocationV3, removeDuplicateLocation):
    pl.addPipe(stage)

# NOTE(review): calling the Pipe presumably applies the stages to every item
# of data2 — confirm in customPipeline.
result = pl(data2)
countStats(result)

800 items were processed in total.
192 of them had title,description or tags to analyze.
33 out of 192 were matched with potential location
