In [None]:
import os
import sys

# Project root: the `trending_NER` directory inside the location named by the
# WORKSPACE environment variable (a KeyError is raised if it is not set).
PROJ_DIR = os.path.join(os.environ["WORKSPACE"], "trending_NER")

# Prepend the project root to the import path, but only once, so re-running
# this cell does not add duplicate entries.
if PROJ_DIR not in sys.path:
    sys.path.insert(0, PROJ_DIR)

# Bare expression: the notebook shows the resolved path as the cell output.
PROJ_DIR

In [None]:
# `IPython.display` is the public home of `display`/`HTML`; importing `display`
# from the private `IPython.core.display` module is deprecated and fails on
# recent IPython releases.
from IPython.display import display, HTML

# Inject CSS that sets the `.container` element to 98% of the page width.
display(HTML("<style>.container { width:98% !important; }</style>"))

# 1. Load and prepare data

In [None]:
import os
import json
import jsonlines

from tqdm.auto import tqdm
from collections import Counter
from sklearn.utils import shuffle

import exp_ssl.src.commons.utilities as utils

In [None]:
# Training file of each yearly split, relative to PROJ_DIR.
data_path = {
    str(year): 'data/{}/train.txt'.format(year)
    for year in range(2014, 2019)
}

In [None]:
# Column mapping handed to `utils.read_conll`
# (presumably column indices of the tokens and labels -- confirm against the helper).
colnames = {'tokens': 0, 'labels': 1}

# Read the training file of every split, keyed by split name.
datasets = {}
for split, relative_path in data_path.items():
    file_path = os.path.join(PROJ_DIR, relative_path)
    datasets[split] = utils.read_conll(file_path, colnames)

# 2. Extract random data

In [None]:
# Number of sentences kept per split, and a fixed seed so that a fresh
# "Restart & Run All" draws the same sample every time (the sample feeds
# the files written further down).
num_samples = 200
SAMPLING_SEED = 42

random_data = {}
for split, split_data in datasets.items():
    # Shuffle tokens and labels together so each sentence keeps its own labels.
    tokens, labels = shuffle(
        split_data['tokens'], split_data['labels'], random_state=SAMPLING_SEED
    )
    random_data[split] = {'tokens': tokens[:num_samples], 'labels': labels[:num_samples]}

# 3. Extract trending-related data

## 3.1 Define stopwords

In [None]:
# Tokens that must never be part of an n-gram: punctuation, digits, number
# words, common English function words, and frequent chat/filler words.
stopwords = [',', '!', '?', '.', '#', '\'', '..', '...', '@', ':', '-', '&', '\"', '<', '>', '+', '*', '\\', ';', "/", ')', '(', '%', '$', '%', '^',
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'zero',
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 
            'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 
            'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
            'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
            'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 
            'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 
            'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', 
            "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 
            'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
            "weren't", 'won', "won't", 'wouldn', "wouldn't", "'s", "'m", "n't", "'ve",
            'next', 'today', 'years', 'ago', 'feel', 'go', 'gonna', 'gt', 'gon', 'na', 'got', 'went', 'last', 'look', 'looks', 'like', 
            'get', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'morning', 'noon', 'afternoon', 'evening', 'night', 'day', 'week', 'month',
            'year', 'days', 'weeks', 'months', 'years', 'ago', 'hour', 'hours', 'minute', 'minutes', 'oh', 'well', 'really', 'come', 'u', 'ur', 
             'someone', 'every', 'may', 'cant', 'im', 'till', 'b', 'ha', 'yeah', 'time', 'til', "you're", 'even', 'times', 'sure', 'tonight', 'getting',
            'rt', 'lol', 'dont', 'wanna', 'would', 'much', 'back', 'yo', 'talking', 'looking', 'never', 'lo', 'que', 'eu', 'de', 'hari', 'ini', 'ada', 
             'el', 'e', 'w', 'already', 'many', 'people', 'please', 'en', 'x', 'say', 'anyone', 'else', 'anything', 'mau', 'te', 'amo', 'aja', 'deh', 
             'ever', 'good', 'apa', 'ga', 'aku', 'hell', 'yea', 'yeah', 'birthday', 'new', 'via', 'great', 'thank', 'thanks', 'say', 'says', 'see', 'going',
            'know', 'make', 'work', 'right', 'could', 'way', 'ways', 'help', 'using', 'use', 'first', 'always', 'show', 'think', 'tomorrow', 'better',
            'best', 'know', 'need', 'still', 'us', 'let', 'love', 'man', 'happy', 'read', 'try', 'take', 'said', 'support', 'live', 'news', 'hi', 'made',
            'want', 'wanna']

# Frozen snapshot of `stopwords` for constant-time membership tests.
# `filter_word` is called for every token, so scanning the list each time is
# wasted work. Re-run this cell after editing `stopwords` to refresh it.
_STOPWORD_SET = frozenset(stopwords)


def filter_word(word):
    """Return True when `word` is allowed to take part in an n-gram.

    A word is kept only if it consists solely of ASCII letters and its
    lower-cased form is not a stopword.

    Args:
        word: A single token (str).

    Returns:
        bool: True if the token passes both checks, False otherwise.
    """
    # bytes.isalpha() is True only for non-empty input made entirely of ASCII
    # letters, so digits, punctuation and any non-ASCII character reject the token.
    return word.encode('UTF-8').isalpha() and word.lower() not in _STOPWORD_SET

## 3.2 Compute n-grams

In [None]:
# For every split, count the bigrams whose two tokens both pass `filter_word`
# and store them as a list of (bigram, count) pairs, most frequent first.
for split in datasets:
    bigram_counts = Counter(
        " ".join(sentence[pos: pos + 2])
        for sentence in datasets[split]['tokens']
        for pos in range(len(sentence) - 1)
        if filter_word(sentence[pos]) and filter_word(sentence[pos + 1])
    )
    # With no argument, most_common() returns every entry ordered by count,
    # descending; equal counts keep first-seen order.
    datasets[split]['ngram'] = bigram_counts.most_common()

## 3.3 Compute trending n-grams

In [None]:
def compute_trending_ngram(old_split, new_split, threshold=0.1, smoothing=5):
    """Score each n-gram of `new_split` by its growth relative to `old_split`.

    The score is ``(new_count - old_count) / (old_count + smoothing)``.
    N-grams whose score exceeds `threshold` are stored, with their score, in
    ``datasets[new_split]['trending_ngram']`` (any previous value is replaced).

    Args:
        old_split: Key in the global `datasets` of the reference split.
        new_split: Key in the global `datasets` of the split being scored.
        threshold: Score an n-gram must exceed (exclusive) to be kept.
        smoothing: Constant added to the old count in the denominator; it keeps
            the division defined for n-grams absent from `old_split`.

    Returns:
        The global `datasets` dict, which is mutated in place.
    """
    # `datasets[...]['ngram']` is a list of (ngram, count) pairs. Testing a
    # string for membership in that list never matches, so the old counts
    # have to be looked up through a mapping instead.
    old_counts = dict(datasets[old_split]['ngram'])

    trending = {}
    for ngram, new_count in datasets[new_split]['ngram']:
        old_count = old_counts.get(ngram, 0)
        score = (new_count - old_count) / (old_count + smoothing)
        if score > threshold:
            trending[ngram] = score

    datasets[new_split]['trending_ngram'] = trending
    return datasets

In [None]:
def extract_trending_data(split, max_samples=200):
    """Select the sampled sentences of `split` that contain trending bigrams.

    Every sentence in ``random_data[split]`` is scanned for adjacent token
    pairs present in ``datasets[split]['trending_ngram']``. Each hit adds the
    bigram's trending score to both of its token positions. Sentences with a
    positive total score are kept, ordered by total score (highest first),
    and truncated to `max_samples`.

    Args:
        split: Key shared by the global `random_data` and `datasets` dicts.
        max_samples: Maximum number of sentences to return.

    Returns:
        dict with parallel lists ``'tokens'``, ``'labels'`` and ``'scores'``
        (per-token scores). All three lists are empty when no sentence
        contains a trending bigram.
    """
    trending_ngram = datasets[split]['trending_ngram']
    sampled = random_data[split]

    scored = []
    for i, tokens in enumerate(sampled['tokens']):
        scores = len(tokens) * [0]

        for j in range(len(tokens) - 1):
            bigram_score = trending_ngram.get(" ".join(tokens[j: j + 2]))
            if bigram_score is not None:
                # Credit both tokens that form the trending bigram.
                scores[j] += bigram_score
                scores[j + 1] += bigram_score

        if sum(scores) > 0:
            scored.append((tokens, sampled['labels'][i], scores))

    # sorted() is stable, so sentences with equal totals keep their sample order.
    top = sorted(scored, key=lambda item: sum(item[2]), reverse=True)[:max_samples]

    # Built with comprehensions rather than zip(*top) so that an empty
    # selection yields three empty lists instead of an IndexError.
    results = {
        'tokens': [item[0] for item in top],
        'labels': [item[1] for item in top],
        'scores': [item[2] for item in top],
    }

    assert all([len(results['tokens'][i]) == len(results['labels'][i]) for i in range(len(results['tokens']))])

    return results

## 3.4 Extract trending-related samples

### 3.4.1 2014

In [None]:
# 2014 is the earliest split, so there is no previous year to compare with:
# each bigram is scored as count / 5, i.e. as if its earlier count were zero.
datasets['2014']['trending_ngram'] = {
    bigram: count / 5
    for bigram, count in datasets['2014']['ngram']
    if count / 5 > 0.1
}

# Preview: number of trending bigrams and the first ten of them.
print(
    len(datasets['2014']['trending_ngram']),
    list(datasets['2014']['trending_ngram'])[:10],
    sep='\n',
)

In [None]:
# Keep the sampled 2014 sentences that contain trending bigrams,
# then display how many were selected.
trend_2014 = extract_trending_data(split='2014')
len(trend_2014['tokens'])

In [None]:
# Write the selected 2014 sentences and labels (scores are not exported).
write_path = 'data/2019/random/STT/2014/train.txt'
write_data = {field: trend_2014[field] for field in ('tokens', 'labels')}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)

### 3.4.2 2015

In [None]:
# Score the 2015 bigrams against the 2014 counts, then preview the result:
# number of trending bigrams and the first ten of them.
datasets = compute_trending_ngram(old_split='2014', new_split='2015')
print(
    len(datasets['2015']['trending_ngram']),
    list(datasets['2015']['trending_ngram'])[:10],
    sep='\n',
)

In [None]:
# Keep the sampled 2015 sentences that contain trending bigrams,
# then display how many were selected.
trend_2015 = extract_trending_data(split='2015')
len(trend_2015['tokens'])

In [None]:
# Write the selected 2015 sentences and labels (scores are not exported).
write_path = 'data/2019/random/STT/2015/train.txt'
write_data = {field: trend_2015[field] for field in ('tokens', 'labels')}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)

### 3.4.3 2016

In [None]:
# Score the 2016 bigrams against the 2015 counts, then preview the result:
# number of trending bigrams and the first ten of them.
datasets = compute_trending_ngram(old_split='2015', new_split='2016')
print(
    len(datasets['2016']['trending_ngram']),
    list(datasets['2016']['trending_ngram'])[:10],
    sep='\n',
)

In [None]:
# Keep the sampled 2016 sentences that contain trending bigrams,
# then display how many were selected.
trend_2016 = extract_trending_data(split='2016')
len(trend_2016['tokens'])

In [None]:
# Write the selected 2016 sentences and labels (scores are not exported).
write_path = 'data/2019/random/STT/2016/train.txt'
write_data = {field: trend_2016[field] for field in ('tokens', 'labels')}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)

### 3.4.4 2017

In [None]:
# Score the 2017 bigrams against the 2016 counts, then preview the result:
# number of trending bigrams and the first ten of them.
datasets = compute_trending_ngram(old_split='2016', new_split='2017')
print(
    len(datasets['2017']['trending_ngram']),
    list(datasets['2017']['trending_ngram'])[:10],
    sep='\n',
)

In [None]:
# Keep the sampled 2017 sentences that contain trending bigrams,
# then display how many were selected.
trend_2017 = extract_trending_data(split='2017')
len(trend_2017['tokens'])

In [None]:
# Write the selected 2017 sentences and labels (scores are not exported).
write_path = 'data/2019/random/STT/2017/train.txt'
write_data = {field: trend_2017[field] for field in ('tokens', 'labels')}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)

### 3.4.5 2018

In [None]:
# Score the 2018 bigrams against the 2017 counts, then preview the result:
# number of trending bigrams and the first ten of them.
datasets = compute_trending_ngram(old_split='2017', new_split='2018')
print(
    len(datasets['2018']['trending_ngram']),
    list(datasets['2018']['trending_ngram'])[:10],
    sep='\n',
)

In [None]:
# Keep the sampled 2018 sentences that contain trending bigrams,
# then display how many were selected.
trend_2018 = extract_trending_data(split='2018')
len(trend_2018['tokens'])

In [None]:
# Write the selected 2018 sentences and labels (scores are not exported).
write_path = 'data/2019/random/STT/2018/train.txt'
write_data = {field: trend_2018[field] for field in ('tokens', 'labels')}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)

## 3.5 Process all trending data

In [None]:
# Concatenate the per-year selections, in chronological order, into a single
# dataset with parallel 'tokens' and 'labels' lists.
trending_data = {
    field: (
        trend_2014[field]
        + trend_2015[field]
        + trend_2016[field]
        + trend_2017[field]
        + trend_2018[field]
    )
    for field in ('tokens', 'labels')
}

# Sanity check: both lists should report the same number of sentences.
print(len(trending_data['tokens']), len(trending_data['labels']), sep='\n')

In [None]:
# Output file for the combined, shuffled trending data.
write_path = 'data/2019/random/TF/train.txt'

# Fixed seed so that re-running the notebook writes the sentences in the
# same order every time instead of a new random order on each run.
OUTPUT_SHUFFLE_SEED = 42

# Shuffle tokens and labels together so each sentence keeps its own labels.
trending_data['tokens'], trending_data['labels'] = shuffle(
    trending_data['tokens'], trending_data['labels'], random_state=OUTPUT_SHUFFLE_SEED
)
write_data = {'tokens': trending_data['tokens'], 'labels': trending_data['labels']}
utils.write_conll(os.path.join(PROJ_DIR, write_path), write_data)