In [1]:
from misc import loadProperties, loadWEKA

# Load the known-location entries from the .properties file; `props` is
# iterated in a later cell to build the list of location phrases.
props = loadProperties('submitActionClass.properties')
# Load the WEKA CSV export as (rows, attributes).
# NOTE(review): `limit=0` presumably means "no row limit" - confirm in misc.loadWEKA.
(data, attr) = loadWEKA('youTubeLocationIDWeka.csv', limit=0)

#### Adjusting spaCy's pipeline

In [2]:
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Load the small English spaCy model; `nlp` is the shared pipeline used by
# every later cell.
nlp = spacy.load("en_core_web_sm")

# Iterating `props` yields the known location names.
locations = [x for x in props] # Array of known locations from .properties
# Run every location name through the pipeline so each one becomes a Doc that
# can be handed to the PhraseMatcher as a pattern.
location_patterns = list(nlp.pipe(locations))

# attr='LOWER' makes the matcher compare lowercased token text, i.e. the
# phrase matching is case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
# Register all patterns under the single key "LOCATION"; the second argument
# is the on-match callback slot, unused here.
matcher.add("LOCATION", None, *location_patterns)

# Define the custom component
def location_component(doc):
    """Tag known location phrases in ``doc`` as 'LOCATION' entities.

    Runs the notebook-level ``matcher`` over ``doc`` and overwrites
    ``doc.ents`` with one ``Span`` per kept match.

    Phrase patterns can overlap (one known location contained in another) or
    repeat, and spaCy refuses to store overlapping spans in ``doc.ents``.
    To avoid that error the longest match wins and any match sharing a token
    with an already kept match is dropped.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Document being processed by the pipeline.

    Returns
    -------
    spacy.tokens.Doc
        The same ``doc`` object, with ``doc.ents`` replaced.
    """
    # Apply the matcher to the doc; each match is (match_id, start, end)
    matches = matcher(doc)

    # Longest span first; ties are resolved in favour of the earliest start
    by_priority = sorted(matches, key=lambda m: (m[1] - m[2], m[1]))

    claimed_tokens = set()
    kept = []
    for _match_id, start, end in by_priority:
        token_range = range(start, end)
        # Skip matches that reuse a token of a span we already kept
        if any(index in claimed_tokens for index in token_range):
            continue
        claimed_tokens.update(token_range)
        kept.append((start, end))

    # Overwrite doc.ents with the surviving spans, in document order
    doc.ents = [Span(doc, start, end, label="LOCATION") for start, end in sorted(kept)]
    return doc

# Add the component to the pipeline before the 'ner' component, so the
# LOCATION spans are already set on the doc when the built-in NER runs
nlp.add_pipe(location_component, before='ner')

#### Defining functions for pipeline

In [3]:
# Returns [videoId, [Title,Descr,Tags]]
def preprocessWekaData(item_original):
    """Reduce a raw WEKA row to its id and its non-empty text fields.

    Parameters
    ----------
    item_original : sequence
        Raw row; index 0 is the video id and indices 2, 3 and 4 hold the
        title, description and tags.

    Returns
    -------
    list
        ``[video_id, texts]`` where ``texts`` contains, in order, only the
        truthy values found at indices 2-4.
    """
    text_columns = (2, 3, 4)  # title, descr, tags
    texts = [item_original[col] for col in text_columns if item_original[col]]
    return [item_original[0], texts]

In [4]:
def onlyItemsWithPayload(item_original):
    """Filter predicate: keep items that have at least one text to analyze.

    Returns True when the list at index 1 of ``item_original`` is non-empty,
    False otherwise.
    """
    payload = item_original[1]
    return len(payload) > 0

In [5]:
# Uses spaCy to look for locations in a string. Returns a list of matches
def nlpLocation(string):
    """Return the lowercased text of every location entity found in ``string``.

    The string is run through the notebook-level ``nlp`` pipeline; entities
    labelled "GPE" or "LOCATION" are kept, in document order.
    """
    location_labels = ("GPE", "LOCATION")
    return [
        entity.text.lower()
        for entity in nlp(string).ents
        if entity.label_ in location_labels
    ]

def matchLocationV3(item_original):
    """Append the locations detected in an item's texts.

    Every text at index 1 has its apostrophes removed and is passed to
    ``nlpLocation``; all hits are concatenated in order (duplicates kept).

    Returns a shallow copy of ``item_original`` with that list appended as a
    new last element; the input list itself is not modified.
    """
    item = item_original.copy()

    # Collect the matches of every text to analyze into one flat list
    found = [
        location
        for text in item[1]
        for location in nlpLocation(text.replace("'",""))
    ]

    item.append(found)
    return item

# matchLocationV3(data2[405])

In [6]:
def removeDuplicateLocation(item_original):
    """Drop repeated location names from an item, keeping first-seen order.

    Going through ``set`` would return the names in an arbitrary order that
    can change between kernel runs, so the names are de-duplicated while
    preserving the order in which they were first matched.

    Parameters
    ----------
    item_original : list
        Pipeline item whose index 2 holds the matched location names
        (hashable values).

    Returns
    -------
    list
        Shallow copy of ``item_original`` whose index 2 is a new list of the
        unique names.
    """
    item = item_original.copy()

    seen = set()
    unique_locations = []
    for location in item[2]:
        if location not in seen:
            seen.add(location)
            unique_locations.append(location)

    item[2] = unique_locations
    return item

In [7]:
def onlyItemsWithMatchedLocation(item_original):
    """Filter predicate: keep items for which at least one location was matched.

    Returns True when the list at index 2 of ``item_original`` is non-empty,
    False otherwise.
    """
    matched = item_original[2]
    return len(matched) > 0

In [8]:
def printItem(item_original):
    """Debug pass-through: print the item followed by a blank line.

    Returns ``item_original`` itself, unchanged.
    """
    print(item_original, end="\n\n")
    return item_original

In [9]:
def countStats(stats, item):
    """Update the running counters in ``stats`` for one pipeline item.

    ``"total"`` is always incremented; ``"hasDataToAnalyze"`` when the list
    at index 1 of ``item`` is non-empty; ``"identified"`` when the list at
    index 2 is non-empty. ``stats`` is modified in place and also returned.
    """
    stats["total"] += 1

    # (counter name, item index whose non-emptiness bumps that counter)
    conditional_counters = (("hasDataToAnalyze", 1), ("identified", 2))
    for counter, index in conditional_counters:
        if len(item[index]) > 0:
            stats[counter] += 1

    return stats
def printStats(stats):
    """Print a three-line summary of the counters gathered by ``countStats``."""
    analyzable_key = "hasDataToAnalyze"

    # Overall number of items seen
    print(stats["total"], "items were processed in total.")
    # Items that had any text to analyze
    print(stats[analyzable_key], "of them had title,description or tags to analyze.")
    # Items with at least one matched location, relative to the analyzable ones
    print(
        stats["identified"],
        "out of",
        stats[analyzable_key],
        "were matched with potential location",
    )

#### Initializing and running the pipeline

In [10]:
from customPipeline import Pipe
# Only the first 1800 rows are processed in this run.
data = data[:1800]

# Assemble the location-matching pipeline.
# NOTE(review): Pipe's semantics are not visible in this notebook; judging by
# the progress output, item pipes transform each item, data pipes filter the
# item list, and the stat pipe accumulates counters and then prints them.
pl = Pipe()
# Raw row -> [videoId, [non-empty texts]]
pl.addItemPipe(preprocessWekaData)
# Keep only the items that have at least one text
pl.addDataPipe(onlyItemsWithPayload)
# Append the list of locations detected in the texts
pl.addItemPipe(matchLocationV3)
pl.addItemPipe(removeDuplicateLocation)
# Counters start at zero; countStats updates them, printStats reports them
pl.addStatPipe(countStats, printStats, {"total":0, "hasDataToAnalyze":0, "identified":0})
# Keep only the items with at least one detected location
pl.addDataPipe(onlyItemsWithMatchedLocation)
# pl.addItemPipe(printItem)
result = pl(data)

preprocessWekaData: 100%|██████████| 1800/1800 [00:00<00:00, 654962.02it/s]
onlyItemsWithPayload: 100%|██████████| 1800/1800 [00:00<00:00, 948818.30it/s]
matchLocationV3: 100%|██████████| 394/394 [00:03<00:00, 122.64it/s]
removeDuplicateLocation: 100%|██████████| 394/394 [00:00<00:00, 461865.78it/s]
countStats: 100%|██████████| 394/394 [00:00<00:00, 509733.43it/s]


394 items were processed in total.
394 of them had title,description or tags to analyze.
19 out of 394 were matched with potential location


onlyItemsWithMatchedLocation: 100%|██████████| 394/394 [00:00<00:00, 409961.74it/s]


In [11]:
# Display the items that passed every stage: [videoId, texts, locations]
result

[['19112010171749',
  ['Was on shore and saw a huge splashsaw a giant grey creature moving at surface like a powerful submarineit was fairly far outthen saw a huge flopping dorsal fin then a smaller second fin at first I thought 2 dolphins but it was an enormous fish swimming in the seathe largest I have ever seen in my lifeno one believed me but about 6 weeks laterthis weeka fisherman reported being terrified by a giant 40 sea monsterwith a huge open mouthcoming straight at his boat3 miles of the SW of Barbadoswhich is about 5 or 6 miles downwind from my beach His deion matched what I knew I sawhe said he tried clubbing itplease look into this quickly as it would be terrible for a creature like this to sufferthey are not known to be around Barbadosin recent times but I see Belize and Gulf of Mexico have a lot so there is no reason why one could not be over this side of the Caribbean'],
  ['sw', 'barbadoswhich', 'belize']],
 ['1912200793941',
  ['These images were taken off Mafia Islan

### Functions for working with the GeoNames API

In [12]:
import requests

def geoNamesSearch(item_original):
    """Look up every identified location name with the GeoNames search API.

    The query is passed through ``params=`` so that the location text is
    URL-encoded (names may contain spaces, '&' or non-ASCII characters), and
    each request has a timeout so a stalled connection cannot hang the
    pipeline.

    Parameters
    ----------
    item_original : list
        Pipeline item whose index 2 holds the location names to search for.

    Returns
    -------
    list
        Shallow copy of ``item_original`` with one element appended: a dict
        mapping each name that produced at least one hit to the list of
        result objects returned for it (at most 3, per ``maxRows``).

    Raises
    ------
    KeyError
        If a response body has no ``'geonames'`` key.
        NOTE(review): GeoNames error payloads presumably fall in this case -
        confirm the desired handling.
    """
    item = item_original.copy()
    results = {}

    search_url = "http://api.geonames.org/search"
    base_params = {
        "type": "json",
        "fuzzy": "0.4",
        "formatted": "true",
        "maxRows": "3",
        "username": "kirillovmr",
        "style": "short",
    }
    timeout_seconds = 10

    # Going through each identified location
    for loc in item[2]:
        response = requests.get(
            search_url,
            params=dict(base_params, q=loc),
            timeout=timeout_seconds,
        )
        resp = response.json()

        # Only names with at least one hit get an entry
        if len(resp['geonames']) > 0:
            results[loc] = list(resp['geonames'])

    item.append(results)
    return item

In [13]:
def __areHierarchyLocationsCloseToEachOther(hierarchy):
    """Tell whether the deepest locations of several hierarchies lie close together.

    For each hierarchy chain the last (deepest) entry is taken, and the sum
    of squared deviations from the mean is computed separately for latitude
    and longitude. The locations count as close when neither sum exceeds the
    allowed difference of 10.

    Deviations are computed on the signed coordinates. Comparing absolute
    values would make points on opposite sides of the equator or of the
    prime meridian look closer than they are.

    Parameters
    ----------
    hierarchy : iterable of sequences of dicts
        One chain per candidate location; the last dict of each chain must
        provide ``'lat'`` and ``'lng'`` values convertible to ``float``.

    Returns
    -------
    bool
        True when both spreads are within the allowed difference; also True
        for an empty ``hierarchy``, since there is nothing to disagree.

    Notes
    -----
    Longitude wrap-around at +/-180 is not taken into account.
    """
    allowedDifference = 10

    latitudes = []
    longitudes = []
    for chain in hierarchy:
        deepest = chain[-1]
        latitudes.append(float(deepest['lat']))
        longitudes.append(float(deepest['lng']))

    # Nothing to compare; also avoids dividing by zero below
    if not latitudes:
        return True

    for coordinates in (latitudes, longitudes):
        mean = sum(coordinates) / len(coordinates)
        # Sum of squared deviations from the mean, on signed values
        spread = sum((value - mean) ** 2 for value in coordinates)
        if spread > allowedDifference:
            return False

    # Difference within allowed range
    return True

def remainMostSpecificLocations(item_original):
    """Report, per identified location name, whether its GeoNames candidates agree.

    For every name in the dict at index 3, the GeoNames hierarchy of each
    candidate is fetched and the set of hierarchies is checked with
    ``__areHierarchyLocationsCloseToEachOther``; the verdict is printed.

    The request parameters go through ``params=`` and each request has a
    timeout, so a stalled connection cannot hang the pipeline.

    NOTE(review): despite the name, nothing is filtered yet - the returned
    item is an unmodified shallow copy of ``item_original``.

    Parameters
    ----------
    item_original : list
        Pipeline item whose index 3 maps a location name to a list of
        GeoNames result objects, each providing ``'geonameId'``.

    Returns
    -------
    list
        Shallow copy of ``item_original``.
    """
    item = item_original.copy()

    hierarchy_url = "http://api.geonames.org/hierarchy"
    timeout_seconds = 10

    # Going through each key
    for identifiedLocationName, candidates in item[3].items():
        # Going through each location object
        hierarchy = []
        for locObj in candidates:
            # Getting hierarchy for each location id
            response = requests.get(
                hierarchy_url,
                params={
                    "type": "json",
                    "formatted": "true",
                    "username": "kirillovmr",
                    "style": "short",
                    "geonameId": str(locObj['geonameId']),
                },
                timeout=timeout_seconds,
            )
            hierarchy.append(response.json()['geonames'])
        print("Hierarchy for", identifiedLocationName, "close?", __areHierarchyLocationsCloseToEachOther(hierarchy))

    return item

# res3 = remainMostSpecificLocations(result2[0])

In [14]:
# Second pipeline: enrich matched items with GeoNames data.
pl2 = Pipe()
# Append a dict of GeoNames search hits per identified location
pl2.addItemPipe(geoNamesSearch)
# Fetch hierarchies for the hits and print whether they lie close together
pl2.addItemPipe(remainMostSpecificLocations)
# Only the first matched item is processed here (each item triggers several
# HTTP requests)
result2 = pl2(result[:1])

geoNamesSearch: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
remainMostSpecificLocations:   0%|          | 0/1 [00:00<?, ?it/s]

Hierarchy for sw close? False
Hierarchy for barbadoswhich close? False


remainMostSpecificLocations: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]

Hierarchy for belize close? True





In [15]:
# Display the enriched item: [videoId, texts, locations, GeoNames hits]
result2[0]

['19112010171749',
 ['Was on shore and saw a huge splashsaw a giant grey creature moving at surface like a powerful submarineit was fairly far outthen saw a huge flopping dorsal fin then a smaller second fin at first I thought 2 dolphins but it was an enormous fish swimming in the seathe largest I have ever seen in my lifeno one believed me but about 6 weeks laterthis weeka fisherman reported being terrified by a giant 40 sea monsterwith a huge open mouthcoming straight at his boat3 miles of the SW of Barbadoswhich is about 5 or 6 miles downwind from my beach His deion matched what I knew I sawhe said he tried clubbing itplease look into this quickly as it would be terrible for a creature like this to sufferthey are not known to be around Barbadosin recent times but I see Belize and Gulf of Mexico have a lot so there is no reason why one could not be over this side of the Caribbean'],
 ['sw', 'barbadoswhich', 'belize'],
 {'sw': [{'lng': '46.9093',
    'geonameId': 113847,
    'countryC

In [30]:
# Scratch cell.
# NOTE(review): `a` looks like leftover debugging - it is only printed below.
a = 3
# Description text of the first matched item, kept as a literal so the
# tokenization of its run-together words can be inspected.
sentence = "Was on shore and saw a huge splashsaw a giant grey creature moving at surface like a powerful submarineit was fairly far outthen saw a huge flopping dorsal fin then a smaller second fin at first I thought 2 dolphins but it was an enormous fish swimming in the seathe largest I have ever seen in my lifeno one believed me but about 6 weeks laterthis weeka fisherman reported being terrified by a giant 40 sea monsterwith a huge open mouthcoming straight at his boat3 miles of the SW of Barbadoswhich is about 5 or 6 miles downwind from my beach His deion matched what I knew I sawhe said he tried clubbing itplease look into this quickly as it would be terrible for a creature like this to sufferthey are not known to be around Barbadosin recent times but I see Belize and Gulf of Mexico have a lot so there is no reason why one could not be over this side of the Caribbean"
print(a)

3


In [32]:
# Run the sample text through the customised pipeline (includes the
# location component registered before 'ner').
ss = nlp(sentence)

In [34]:
# Print every token next to its lemma, to see how the run-together words
# (e.g. "Barbadoswhich") are tokenized.
for w in ss:
    print(w.text, w.lemma_)

Was be
on on
shore shore
and and
saw see
a a
huge huge
splashsaw splashsaw
a a
giant giant
grey grey
creature creature
moving move
at at
surface surface
like like
a a
powerful powerful
submarineit submarineit
was be
fairly fairly
far far
outthen outthen
saw see
a a
huge huge
flopping flop
dorsal dorsal
fin fin
then then
a a
smaller small
second second
fin fin
at at
first first
I -PRON-
thought think
2 2
dolphins dolphin
but but
it -PRON-
was be
an an
enormous enormous
fish fish
swimming swimming
in in
the the
seathe seathe
largest large
I -PRON-
have have
ever ever
seen see
in in
my -PRON-
lifeno lifeno
one one
believed believe
me -PRON-
but but
about about
6 6
weeks week
laterthis laterthis
weeka weeka
fisherman fisherman
reported report
being be
terrified terrify
by by
a a
giant giant
40 40
sea sea
monsterwith monsterwith
a a
huge huge
open open
mouthcoming mouthcoming
straight straight
at at
his -PRON-
boat3 boat3
miles mile
of of
the the
SW SW
of of
Barbadoswhich Barbadoswhich
is b