In [1]:
from misc import loadProperties, loadWEKA

# Known locations read from the .properties file; iterated further down to
# build the phrase-matcher patterns.
props = loadProperties('submitActionClass.properties')
# NOTE(review): limit=0 presumably means "no row limit" -- confirm in misc.loadWEKA.
(data, attr) = loadWEKA('youTubeLocationIDWeka.csv', limit=0)
# Keep at most the first 1800 entries for this run.
data = data[:1800]

Adjusting spaCy's pipeline

In [2]:
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Load the small English model; the custom component defined below is
# inserted in front of its 'ner' step.
nlp = spacy.load("en_core_web_sm")

locations = list(props)  # Array of known locations from .properties
# The matcher below compares on LOWER only, which the tokenizer already
# provides, so tokenising with make_doc is sufficient. Running the full
# pipeline (nlp.pipe) over every pattern is unnecessary work.
location_patterns = [nlp.make_doc(text) for text in locations]

# attr='LOWER' makes phrase matching case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
# spaCy v2 call signature: add(key, on_match_callback, *pattern_docs).
matcher.add("LOCATION", None, *location_patterns)

# Define the custom component
def location_component(doc):
    """Pipeline component that tags known location phrases as LOCATION.

    Runs the module-level phrase ``matcher`` over ``doc`` and overwrites
    ``doc.ents`` with one ``Span`` labelled ``"LOCATION"`` per retained match.

    A token may belong to only one entity, so assigning overlapping (or
    duplicated) spans to ``doc.ents`` raises ``ValueError``. The phrase
    matcher can report such hits, e.g. when one known location is contained
    in another, so matches are de-duplicated and, where they overlap, the
    longest one wins (ties go to the earliest start).

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with its ``ents`` replaced.
    """
    # Apply the matcher to the doc
    matches = matcher(doc)

    # Unique (start, end) pairs, longest first, then left-most first.
    candidates = sorted(
        {(start, end) for _match_id, start, end in matches},
        key=lambda bounds: (bounds[0] - bounds[1], bounds[0]),
    )

    claimed_tokens = set()
    kept = []
    for start, end in candidates:
        token_range = range(start, end)
        if any(index in claimed_tokens for index in token_range):
            continue  # overlaps a longer (or earlier) span already kept
        claimed_tokens.update(token_range)
        kept.append((start, end))

    # Restore document order before building the spans.
    kept.sort()

    # Overwrite the doc.ents with the matched spans
    doc.ents = [Span(doc, start, end, label="LOCATION") for start, end in kept]
    return doc

# Add the component to the pipeline before the 'ner' component, so the
# model's own NER step runs after the LOCATION spans have been set.
nlp.add_pipe(location_component, before='ner')

In [3]:
def nlpLocation(string):
    """Look up locations in ``string`` using the spaCy pipeline.

    Returns a list with the lower-cased text of every entity labelled
    ``GPE`` or ``LOCATION``, in the order the pipeline reports them.
    """
    wanted_labels = ("GPE", "LOCATION")
    return [
        entity.text.lower()
        for entity in nlp(string).ents
        if entity.label_ in wanted_labels
    ]

def matchLocationV3(item_original):
    """Return a copy of the item with a list of detected locations appended.

    Fields 2 (title), 3 (tags) and 4 (description) are each stripped of
    apostrophes and passed to ``nlpLocation``; the hits are concatenated in
    that field order. The input item itself is left unmodified.
    """
    enriched = item_original.copy()
    found = []

    for column in (2, 3, 4):  # Title, Tags, Descr
        found.extend(nlpLocation(enriched[column].replace("'", "")))

    enriched.append(found)
    return enriched

# matchLocationV3(data2[405])

In [4]:
def removeDuplicateLocation(item_original):
    """Return a copy of the item whose location list (index 7) has no duplicates.

    The first occurrence of each location is kept and the original order is
    preserved. Going through ``set`` would yield an arbitrary order that can
    differ between runs because of string hash randomisation, making the
    notebook output non-reproducible.

    The input item is not modified; only the copy's element 7 is replaced.
    """
    item = item_original.copy()
    # dict keys are unique and keep insertion order -> stable de-duplication.
    item[7] = list(dict.fromkeys(item[7]))
    return item

In [5]:
def printItem(item_original):
    """Print the item followed by one blank line and return it unchanged."""
    print(item_original, end="\n\n")
    return item_original

In [6]:
def onlyDataWithPayload(data_original):
    """Return the items that have a non-empty value in field 2, 3 or 4.

    Fields are checked in that order and checking stops at the first
    non-empty one. The returned list holds the original item objects.
    """
    payload_columns = (2, 3, 4)
    return [
        row
        for row in data_original
        if any(row[column] for column in payload_columns)
    ]

In [7]:
def countStats(data):
    """Print a three-line summary of the processed items.

    Reports the total number of items, how many have a non-empty field
    2, 3 or 4, and how many carry a non-empty location list at index 7.
    Nothing is returned.
    """
    item_count = len(data)
    with_payload = 0
    with_match = 0

    for row in data:
        with_payload += 1 if any(row[column] for column in (2, 3, 4)) else 0
        with_match += 1 if len(row[7]) > 0 else 0

    print(item_count, "items were processed in total.")
    print(with_payload, "of them had title,description or tags to analyze.")
    print(with_match, "out of", with_payload, "were matched with potential location")

In [8]:
from customPipeline import Pipe
# NOTE(review): Pipe comes from the project-local customPipeline module; it
# presumably applies each registered function to every item, stage by stage,
# in registration order -- confirm against customPipeline.
pl = Pipe()

# Keep only the items with a non-empty field 2, 3 or 4.
data2 = onlyDataWithPayload(data)

# Stages: detect locations -> de-duplicate them -> print every item.
# Note that printItem writes each item to the cell output.
pl.addPipe(matchLocationV3)
pl.addPipe(removeDuplicateLocation)
pl.addPipe(printItem)

result = pl(data2)

# Print the summary counts for the processed items.
countStats(result)

matchLocationV3: 100%|██████████| 394/394 [00:03<00:00, 105.61it/s]
removeDuplicateLocation: 100%|██████████| 394/394 [00:00<00:00, 453874.15it/s]
printItem: 100%|██████████| 394/394 [00:00<00:00, 33370.13it/s]

['1910200691010', '', '', '', 'No other details known The pictures were taken with a Sea  Sea MX10 Camera so I dont know if theyre going to be clear enough for you', 'off the Coast of Coral Bay Western Australia when out on a Charter', '1a2', []]

['191020116527', '', '', '', 'whale shark come to fishermans boat and feed flankton it was trained like aquarium', 'oslob cebu philippine', '6a2', []]

['19112007171456', '', '', '', 'He came to us  he circled our snorketling day tour boat several times before we anchored and jumped in to meet him  After we played with him her and reboarded the boat he circled us again and headbutt the boat before he swam off encounter was at least 1 hour', 'Desecheo Island Puerto Rico', '2', []]

['1911200723313', '', '', '', 'the guide state that these shark migrate thru these water every year from november till february', 'Golf of tadjoura djibouti africa', 'Djibouti', []]

['191120078341', '', '', '', 'Small whale shark cruising over the reef', 'White San


