In [1]:
from misc import loadProperties, loadWEKA

props = loadProperties('submitActionClass.properties')
(data, attr) = loadWEKA('youTubeLocationIDWeka.csv', limit=0)

#### Adjusting spacy's pipeline

In [2]:
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")

locations = [x for x in props] # Array of known locations from .properties
location_patterns = list(nlp.pipe(locations))

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add("LOCATION", None, *location_patterns)

# Define the custom component
def location_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'LOCATION'
    # Overwrite the doc.ents with the matched spans
    doc.ents = [Span(doc, start, end, label="LOCATION") for match_id, start, end in matches]
    return doc

# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(location_component, before='ner')

#### Defining functions for pipeline

In [3]:
# Uses spacy to look up for location in a strings. Returns Array of matches
def nlpLocation(string):
    spacy_mathc = []
    for ent in nlp(string).ents:
        if ent.label_ == "GPE" or ent.label_ == "LOCATION":
            spacy_mathc.append(ent.text.lower())
    return spacy_mathc

def matchLocationV3(item_original):
    item = item_original.copy()
    locations = []

    locations += nlpLocation(item[2].replace("'","")) # Title
    locations += nlpLocation(item[3].replace("'","")) # Tags
    locations += nlpLocation(item[4].replace("'","")) # Descr
      
    item.append(locations)
    return item

# matchLocationV3(data2[405])

In [4]:
def removeDuplicateLocation(item_original):
    item = item_original.copy()
    item[7] = list(set(item[7]))
    return item

In [5]:
def printItem(item_original):
    print(item_original)
    print()
    return item_original

In [6]:
def onlyItemsWithPayload(item_original):
    if item_original[2] or item_original[3] or item_original[4]:
        return True
    return False

In [7]:
def onlyItemsWithMatchedLocation(item_original):
    if len(item_original[7]) > 0:
        return True
    return False

In [11]:
def countStats(acc, item):
    acc["total"] += 1
    if item[2] or item[3] or item[4]:
        acc["hasDataToAnalyze"] += 1
    if len(item[7]) > 0:
        acc["identified"] += 1
    return acc
def printStats(stats):
    print(stats["total"], "items were processed in total.")
    print(stats["hasDataToAnalyze"], "of them had title,description or tags to analyze.")
    print(stats["identified"], "out of", stats["hasDataToAnalyze"], "were matched with potential location")

#### Initializing and running the pipeline

In [10]:
from customPipeline import Pipe
data = data[:1800]

pl = Pipe()
pl.addDataPipe(onlyItemsWithPayload)
pl.addItemPipe(matchLocationV3)
pl.addItemPipe(removeDuplicateLocation)
pl.addStatPipe(countStats, printStats, {"total":0, "hasDataToAnalyze":0, "identified":0})
pl.addDataPipe(onlyItemsWithMatchedLocation)
# pl.addPipe(printItem)
result = pl(data)

onlyItemsWithPayload: 100%|██████████| 1800/1800 [00:00<00:00, 1223621.91it/s]
matchLocationV3: 100%|██████████| 394/394 [00:03<00:00, 113.91it/s]
removeDuplicateLocation: 100%|██████████| 394/394 [00:00<00:00, 661286.83it/s]
countStats: 100%|██████████| 394/394 [00:00<00:00, 536021.98it/s]


394 items were processed in total.
394 of them had title,description or tags to analyze.
19 out of 394 were matched with potential location


onlyItemsWithMatchedLocation: 100%|██████████| 394/394 [00:00<00:00, 944857.50it/s]


### Functions to work with GeoNames API

In [57]:
import requests

def geoNamesSearch(item_original):
    item = item_original.copy()
    results = []
    
    # Going through each identified location
    for loc in item[7]:
        r = "http://api.geonames.org/search?type=json&formatted=true&maxRows=3&username=kirillovmr&style=short&q="+loc
        resp = requests.get(r).json()
        
        if len(resp['geonames']) > 0:
            geoObj = {loc: []}
            # Going through each result
            for geoItem in resp['geonames']:
                geoObj[loc].append(geoItem)
            results.append(geoObj)
    
    item.append(results)
    return item

In [58]:
pl2 = Pipe()
pl2.addItemPipe(geoNamesSearch)
pl2(result[:1])

geoNamesSearch: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]


[['19112010171749',
  '',
  '',
  '',
  'Was on shore and saw a huge splashsaw a giant grey creature moving at surface like a powerful submarineit was fairly far outthen saw a huge flopping dorsal fin then a smaller second fin at first I thought 2 dolphins but it was an enormous fish swimming in the seathe largest I have ever seen in my lifeno one believed me but about 6 weeks laterthis weeka fisherman reported being terrified by a giant 40 sea monsterwith a huge open mouthcoming straight at his boat3 miles of the SW of Barbadoswhich is about 5 or 6 miles downwind from my beach His deion matched what I knew I sawhe said he tried clubbing itplease look into this quickly as it would be terrible for a creature like this to sufferthey are not known to be around Barbadosin recent times but I see Belize and Gulf of Mexico have a lot so there is no reason why one could not be over this side of the Caribbean',
  'South Coast of Barbados off Hastings',
  'null',
  ['belize', 'barbadoswhich', 's