## Loading .properties file

In [1]:
# Reading .properties file, returns a dictionary
def loadProperties(filepath, sep='=', com='#'):
    r"""Parse a simple ``key<sep>value`` properties file into a dictionary.

    Blank lines and lines that start with ``com`` (after stripping surrounding
    whitespace) are ignored. Every other line is split on the FIRST occurrence
    of ``sep`` only, so a value may itself contain the separator. A line with
    no separator at all is stored as a key with an empty-string value instead
    of raising ``IndexError``.

    Keys and values are stripped of surrounding whitespace and otherwise kept
    verbatim: escape sequences such as ``\u0020`` are NOT decoded. If a key
    appears more than once, the last occurrence wins.

    Args:
        filepath: Path of the properties file to read.
        sep: Separator between key and value.
        com: Prefix that marks a comment line.

    Returns:
        dict mapping each key (str) to its value (str).
    """
    props = {}
    with open(filepath, "rt") as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith(com):
                continue
            # partition() splits once and tolerates a missing separator
            key, _, value = stripped.partition(sep)
            props[key.strip()] = value.strip()
    return props

In [2]:
props = loadProperties('submitActionClass.properties')

# Preview of the parsed mapping: print its first six key/value pairs
for key, value in list(props.items())[:6]:
    print(key, " - ", value)

ningaloo  -  1a
exmouth  -  1a1
tantabiddi  -  1a1
coral\u0020bay  -  1a2
christmas  -  ChristmasIsland
indonesia  -  Indonesia


## Loading WEKA File

In [3]:
import csv

# Reading WEKA file, returning a (data, attributes) tuple
def loadWEKA(filename, limit=0):
    """Read a WEKA (ARFF-style) file with a CSV payload.

    The header section is scanned for ``@ATTRIBUTE <name> ...`` lines, whose
    names are collected in order. Everything after the ``@data`` line is parsed
    as comma-separated values quoted with single quotes. Both keywords are
    matched case-insensitively. Header lines that are blank, whitespace-only,
    or otherwise unrecognised are ignored.

    Args:
        filename: Path of the file to read.
        limit: Maximum number of data rows to return; ``0`` (the default)
            means no limit.

    Returns:
        Tuple ``(data, attributes)``: ``data`` is a list of rows (each a list
        of str) and ``attributes`` is the list of attribute names. Data rows
        with fewer than two fields (including blank lines) are skipped.
    """
    attributes = [] # Will store csv column names here
    data = []       # Will store the rows read from the file here

    # newline='' is what the csv module expects so it can handle line endings itself
    with open(filename, "r", newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar="'")

        dataBegan = False

        for row in reader:
            if not dataBegan:
                # Header section: a blank line yields an empty row
                if not row:
                    continue

                # The @data marker switches to payload mode
                if row[0].strip().lower() == "@data":
                    dataBegan = True
                    continue

                # Extracting @ATTRIBUTE's; the token-count guard avoids an
                # IndexError on whitespace-only lines or a bare keyword
                tokens = row[0].split()
                if len(tokens) > 1 and tokens[0].lower() == "@attribute":
                    attributes.append(tokens[1])
                continue

            # Reading only payload of the file
            if len(row) > 1:
                data.append(row)

                # Stop as soon as exactly `limit` rows were collected
                if 0 < limit <= len(data):
                    break

    return (data, attributes)

In [4]:
# Parse the whole WEKA export (limit=0 disables the row cap)
data, attr = loadWEKA('youTubeLocationIDWeka.csv', limit=0)

#### This is what the WEKA file looks like once loaded: an array of arrays.

In [5]:
# Peek at the first two parsed rows
data[:2]

[['1892013124614', '', '', '', '', 'East Contoy Island Mexico', '2c'],
 ['1892013125327', '', '', '', '', 'East Contoy Island Mexico', '2c']]

#### These are the column names that loadWEKA() extracted:

In [6]:
# Column names collected from the @ATTRIBUTE header lines
attr

['encounter', 'video', 'title', 'tags', 'description', 'location', 'class']

### Matching YT video titles with their location
Naive approach: loop through the title and description word by word, trying to match each word against the known locations:

In [7]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; nlpLocation() reuses this module-level object
nlp = spacy.load("en_core_web_sm")

def nlpLocation(string):
    """Look for locations in ``string`` using spaCy.

    Runs the module-level ``nlp`` pipeline over the text and keeps the entities
    labelled "GPE" (spaCy's geopolitical-entity label).

    Returns:
        List with the text of each matching entity, in the order spaCy
        reported them; empty when nothing matched.
    """
    entities = nlp(string).ents
    return [entity.text for entity in entities if entity.label_ == "GPE"]

In [10]:
def _naiveWordMatches(text, props):
    """Return the whitespace-separated words of ``text`` whose lowercase form maps to a non-empty value in ``props``."""
    return [word for word in text.split() if props.get(word.lower())]


def matchLocation(data, props, description=False, output=False):
    """Match video titles (and optionally descriptions) against known locations.

    Each row that has a video URL is scanned in two ways:

    * naive match: every whitespace-separated word is lowercased and looked up
      in ``props``. Because the text is compared one word at a time, keys made
      of several words can never match.
    * spaCy match: ``nlpLocation()`` is run on the same text with apostrophes
      removed. This always runs, regardless of the naive result.

    Summary counts are printed at the end; nothing is returned.

    Args:
        data: Rows as returned by ``loadWEKA()``; index 1 is read as the video
            URL, index 2 as the title and index 4 as the description.
        props: Mapping of lowercase location word to a (non-empty) class value.
        description: Also scan the description when True.
        output: Print every processed row with both match lists when True.
    """
    entries_processed = 0
    matched = 0
    spacy_found = 0

    for item in data:
        # Working with YT videos only: rows without a video URL are skipped
        if not item[1]:
            continue

        # The title is always scanned; the description only on request
        texts = [item[2]]
        if description:
            texts.append(item[4])

        # One list per scanned text: [[title_hits], [description_hits]]
        naive_match = [_naiveWordMatches(text, props) for text in texts]
        spacy_match = [nlpLocation(text.replace("'", "")) for text in texts]

        # Counting statistics: a row counts once if any scanned text had a hit
        entries_processed += 1
        if any(naive_match):
            matched += 1
        if any(spacy_match):
            spacy_found += 1

        if output:
            print(item)
            print("Naive match", naive_match)
            print("Spacy match", spacy_match)
            print()

    print("Processed ", entries_processed, " videos")
    print("Naively matched ", matched, " locations")
    print("Identified by spacy ", spacy_found, " locations")
    
# Run over the first 11800 rows, scanning descriptions too and printing every
# processed entry; pass a shorter slice of `data` to see fewer results.
matchLocation(data[:11800], props, description=True, output=True)

['01e44824-117c-42c4-90aa-09871b745e16', 'https://www.youtube.com/watch?v=J6JKpXsBWuk', 'Snorkeling in Thudufushi  Whale Shark Manta Sharks Turtles', 'MaldiveThudufushiAri AtollAtollo di AriWhale sharkSqualo balenaSharksSqualiMurenaMorayStingrayTrigoneAquila di mareRays fishPesce chirurgoSurgeonfishLion fishpesce leonepesce scorpioneTartarugheturtlesmantasnorkelinggopro hero 7 blackgopro hero 4 black', 'In this video you can see the Whale shark Sharks Manta Stingray Surgeonfish Lion fish Turtles Rays fish and much moreFilmed with GoPro Hero 7 Black and GoPro Hero 4 BlackIn questo video potrete vedere lo squalo balena diversi squali tartarughe pesci chirurgo mante trigone aquile di mare e molto altroFilmato con GoPro Hero 7 Black e GoPro hero 4 Black', 'Ari Atol Maldive', 'Maldives']
Naive match [[], []]
Spacy match [['Thudufushi'], ['Sharks Manta Stingray']]

['53fd808d-9759-47fc-ac18-63a4e3715afc', 'https://www.youtube.com/watch?v=sziYTviQa74', 'Requin baleine  Whale shark', '', '', '