In [4]:
from misc import loadProperties, loadWEKA
from customPipeline import Pipe

# Load the keyword -> class properties and the WEKA rows with their attribute names.
props = loadProperties('submitActionClass.properties')
data, attr = loadWEKA('youTubeLocationIDWeka.csv', limit=0)

# Location keywords to search for (the keys of props).
locations = list(props)
# Inverse lookup: class value -> location keyword.
revercedProps = dict((value, key) for key, value in props.items())

In [65]:
def preprocessWekaData(item_original):
    """Convert one positional WEKA row into a dict keyed by attribute name.

    Column names are read from the module-level ``attr`` sequence: the value
    at position ``i`` of ``item_original`` is stored under ``attr[i]``.
    Returns the new dict; the input row is not modified.
    """
    return {name: item_original[position] for position, name in enumerate(attr)}

def onlyYouTubeVideos(item_original):
    """Filter predicate: True when the item's 'video' field is non-empty."""
    return len(item_original['video']) > 0

def _containsLocation(text, loc):
    """Return True if the lowercased ``text`` contains an accepted hit of ``loc``.

    Every occurrence is examined, so a rejected "scuba" hit does not hide a
    later genuine "cuba" in the same field.
    """
    position = text.find(loc)
    while position > -1:
        # Avoid "Scuba": a "cuba" hit directly preceded by "s" does not count.
        isScuba = loc == "cuba" and position > 0 and text[position - 1] == "s"
        if not isScuba:
            return True
        position = text.find(loc, position + 1)
    return False


def _pickClass(current, candidate):
    """Return whichever of ``current`` / ``candidate`` the class heuristic prefers."""
    # If no class was identified before, take the candidate as-is.
    if not current:
        return candidate

    # Prefer code over name (codes start with a digit).
    if not current[0].isdigit() and candidate[0].isdigit():
        current = candidate

    if len(current) == len(candidate):
        # Same length: prefer the bigger last digit in the code.
        # Letter suffixes are deliberately not compared (that tie-break is disabled).
        if current[-1].isdigit() and candidate[-1].isdigit():
            if candidate[-1] > current[-1]:
                current = candidate
    else:
        # Different lengths: compare class "value" as the sum of character codes.
        if sum(ord(ch) for ch in candidate) > sum(ord(ch) for ch in current):
            current = candidate
    return current


def keywordMatchLocation(item_original):
    """Tag an item with the location keywords found in its text fields.

    Searches the 'title', 'description' and 'tags' fields (case-insensitively)
    for every keyword in the module-level ``locations`` list and looks up the
    class of each hit in the module-level ``props`` mapping.

    Returns a shallow copy of ``item_original`` with two extra keys:
      * 'locationmatch' - space-separated, de-duplicated matched words
        (empty string when nothing matched);
      * 'classmatch'    - the single class chosen by the preference heuristic
        (empty string when nothing matched).
    """
    item = item_original.copy()
    item['locationmatch'] = ''
    item['classmatch'] = ''

    matchedLocations = []
    for loc in locations:
        for key in ['title', 'description', 'tags']:
            if not _containsLocation(item[key].lower(), loc):
                continue
            matchedLocations.append(loc)
            item['classmatch'] = _pickClass(item['classmatch'], props[loc])

    # Removing duplicated words while keeping first-seen order.
    uniqueWords = []
    for word in ' '.join(matchedLocations).split():
        if word not in uniqueWords:
            uniqueWords.append(word)
    item['locationmatch'] = ' '.join(uniqueWords)

    return item

def onlyMatchedVideos(item_original):
    """Filter predicate: True when the item's 'classmatch' field is non-empty."""
    return len(item_original['classmatch']) > 0



### Jaccard similarity doesn't really show us the right stats.

Sometimes we match more location patterns than are found in the WEKA file, so the Jaccard similarity goes down.

Also, in the WEKA file some locations contain duplicate words, while our matcher removes duplicates.

In [66]:
def doStats(data):
    """Compare our matches against the WEKA labels and print summary stats.

    ``data`` is an iterable of item dicts carrying the keys 'classmatch',
    'class', 'locationmatch' and 'location'. Each item is annotated in place
    with an ``item['stats']`` dict holding:
      * 'classmatch' (True) - only present when our class equals the WEKA
        class, or the WEKA class is the string 'null';
      * 'intersection', 'union', 'intersectionPercent', 'jaccardSimilarity'.

    Prints the totals / averages and returns None. An empty ``data`` prints
    zeros instead of raising ZeroDivisionError.
    """
    classmatch = 0
    totalIntersection = 0
    totalIntersectionPercent = 0
    totalJaccardSimilarity = 0
    itemCount = 0

    # Iterate the argument, not the notebook-global `res`.
    for item in data:
        itemCount += 1
        stats = {}
        item['stats'] = stats
        if item['classmatch'] == item['class'] or item['class'] == 'null':
            stats['classmatch'] = True
            classmatch += 1

        Lours = item['locationmatch'].lower().split()
        Lweka = item['location'].lower().split()

        intersection = 0
        for word in Lours:
            # A word counts when WEKA has it too, or when every WEKA word is
            # already accounted for (extra matches are not penalised here).
            if word in Lweka or intersection >= len(Lweka):
                intersection += 1

        union = len(set(Lours + Lweka))
        stats['intersection'] = intersection
        stats['union'] = union
        # Items without any WEKA location words are scored as 2.
        stats['intersectionPercent'] = intersection / len(Lweka) if len(Lweka) > 0 else 2
        stats['jaccardSimilarity'] = intersection / union if union > 0 else 1

        totalIntersection += stats['intersection']
        totalIntersectionPercent += stats['intersectionPercent']
        totalJaccardSimilarity += stats['jaccardSimilarity']

    # Average only when there is something to average.
    if itemCount:
        totalIntersectionPercent /= itemCount
        totalJaccardSimilarity /= itemCount

    print("Exact location code match {} out of {} items.".format(classmatch, itemCount))
    print("totalIntersection", totalIntersection)
    print("totalIntersectionPercent", totalIntersectionPercent)
    print("totalJaccardSimilarity", totalJaccardSimilarity)

In [67]:
# Assemble the processing pipeline; stages run in the order they are added.
# NOTE(review): judging by the stage functions, addItemPipe takes a per-item
# transform and addDataPipe takes a keep/drop predicate - confirm in customPipeline.
pl = Pipe()
pl.addItemPipe(preprocessWekaData)    # positional row -> dict keyed by attribute name
pl.addDataPipe(onlyYouTubeVideos)     # predicate: 'video' field is non-empty
pl.addItemPipe(keywordMatchLocation)  # adds 'locationmatch' and 'classmatch'
pl.addDataPipe(onlyMatchedVideos)     # predicate: 'classmatch' is non-empty
res = pl(data)

# Annotate each result with stats and print the summary.
doStats(res)

preprocessWekaData: 100%|██████████| 38780/38780 [00:00<00:00, 292385.82it/s]
onlyYouTubeVideos: 100%|██████████| 38780/38780 [00:00<00:00, 1504616.93it/s]
keywordMatchLocation: 100%|██████████| 2580/2580 [00:00<00:00, 6591.15it/s]
onlyMatchedVideos: 100%|██████████| 2580/2580 [00:00<00:00, 1320476.43it/s]

Exact location code match 1722 out of 1795 items.
totalIntersection 2986
totalIntersectionPercent 1.0059543706061809
totalJaccardSimilarity 0.8830202944687631





In [8]:
# for item in res:
#     if item['class'] != item['classmatch']:
#         print(item)
#         print("location", item['location'])
#         print("locationmatch", item['locationmatch'])
#         print('class', item['class'])
#         print('classmatch', item['classmatch'])
#         print()

In [124]:
# Character-code sums of two sample names, as used by the class "value" heuristic.
val = sum(map(ord, "Thailand"))
print(val)

val = sum(map(ord, "Malaysia"))
print(val)

805
817
