# Prepare test data for annotation

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#standard libs
import os, sys
from pathlib import Path
from pprint import pprint
import random
import json
import itertools
from datetime import datetime as dt
# ds libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
from tqdm.auto import tqdm as tqdm_auto
# custom path
# Step up one directory so the relative `data/...` and `models/...` paths used
# below resolve. Guarded so that re-running this cell does not climb one
# directory higher on every execution.
# NOTE(review): assumes the notebook's own directory contains no `data/` folder — confirm.
if not Path('data').is_dir():
    os.chdir('..')

## Load Data

In [2]:
# Input files to load, one path per dump: data/external/<dump id>-input.txt
FILES = [
    f'data/external/{dump_id}-input.txt'
    for dump_id in ('dc0130', 'dc0202', 'dc0206')
]

### Load

In [3]:
def load_test_file(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``; empty and
    whitespace-only lines are skipped. The number of parsed rows is printed.

    Args:
        filepath: Path of the file to read (one JSON document per line).

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default,
    # and stream line by line rather than holding the raw text and the split
    # list in memory at the same time.
    with open(filepath, encoding='utf-8') as f:
        test_data = [json.loads(line) for line in f if line.strip()]

    print('Loaded',len(test_data), 'rows')
    return test_data

In [4]:
# One list of parsed rows per entry of FILES, in the same order.
test_data = list(map(load_test_file, FILES))

Loaded 50297 rows
Loaded 50050 rows
Loaded 33022 rows


### Check if titles match

In [5]:
# Report how many channel titles each pair of loaded files has in common.
# The title set of every file is built once and reused for each pair.
title_sets = [set(t['title'] for t in rows) for rows in test_data]
# Pair indices follow the number of loaded files (previously hard-coded to 3);
# with fewer than two files there are no pairs and nothing is printed.
for i,j in itertools.combinations(range(len(title_sets)), r=2):
    titles1, titles2 = title_sets[i], title_sets[j]
    matches = titles1.intersection(titles2)
    print('Indices:',i,j)
    print('Unique titles:', len(titles1), len(titles2))
    print('Intersection:', len(matches), end='\n\n')

Indices: 0 1
Unique titles: 49810 49450
Intersection: 22153

Indices: 0 2
Unique titles: 49810 32699
Intersection: 12505

Indices: 1 2
Unique titles: 49450 32699
Intersection: 13458



### Join

In [6]:
# Concatenate the per-file row lists into one flat list of rows
# (sum with a [] start value performs list concatenation).
# NOTE(review): this rebinds test_data, so the cell is not idempotent — running it
# again on the already-flat list would try to add the rows themselves to a list.
test_data = sum(test_data, [])

len(test_data)

133369

In [8]:
import torch

In [9]:
# Serialized model file per language code.
TGCAT_FILES = dict(
    ru='models/trained/tgcat/ru_tgcat.pt',
    en='models/trained/tgcat/en_tgcat.pt',
)

# Modules loaded with torch.jit.load, keyed by the same language codes.
tgcat = {
    lang: torch.jit.load(model_path)
    for lang, model_path in TGCAT_FILES.items()
}

## Detect target languages

In [60]:
import fasttext
import re

In [61]:
# Language codes of the channels that are kept for labelling.
TARGET_LANGS = ['en', 'ru']

### Load a model

In [63]:
# fastText model used by predict_language() to label channel text.
LANG_DETECTOR_PATH = './models/external/lid.176.bin'
lang_detector = fasttext.load_model(LANG_DETECTOR_PATH)

In [64]:
def format_text(channel):
    """Flatten a channel into one line of text for language detection.

    The channel's ``description`` and every entry of ``recent_posts`` are
    joined with newlines, surrounding whitespace is stripped, and each
    remaining newline is replaced by a single space.
    """
    description = channel['description']
    posts_text = '\n'.join(channel['recent_posts'])
    combined = '\n'.join([description, posts_text])
    single_line = combined.strip().replace('\n', ' ')
    return single_line


def is_kz_lang(text):
    '''Tell whether `text` contains any kz/uz marker character.

    Returns True when at least one of the characters Ў ў Ғ ғ Ҳ ҳ Қ қ
    occurs anywhere in `text`, otherwise False.
    '''
    marker_pattern = re.compile("[ЎўҒғҲҳҚқ]")
    first_hit = marker_pattern.search(text)
    return first_hit is not None


def predict_language(formatted_text):
    """Return the language code predicted for `formatted_text`.

    The code is the first label returned by ``lang_detector.predict`` with the
    ``__label__`` marker removed. A ``ru`` prediction is changed to ``kz`` when
    ``is_kz_lang`` finds one of its marker characters in the text.
    """
    labels = lang_detector.predict(formatted_text)[0]
    top_code = labels[0].replace('__label__', '')
    # Only texts detected as 'ru' are re-checked for the kz/uz characters.
    if top_code != 'ru':
        return top_code
    return 'kz' if is_kz_lang(formatted_text) else top_code

In [65]:
# Spot-check the detector on one randomly chosen channel:
# the predicted code is printed first, then the text it was predicted from.
d = random.choice(test_data)
f = format_text(d)
l = predict_language(f)
print(l, f, sep='\n')

en
🇦🇺 Big Bash League 🇦🇺 🕹 Brisbane 🆚 Renegades (01:45PM)  Per Match cost :2000  Payment details:- Phone pay :6366045947  No Hi hello Demo we won't Replay ❌ Just do paymant and send me screenshot   Contact : 6366045947  After payment send screenshots on this  Number 💥🇧 ⭕️ ⭕️  🇲    🇧 ⭕️ ⭕️ 🇲 💥    Clean Sweep😍😍😍😘😘😘 👉 Otago Toss pass ✔️✔️ 👉 Wellington Match pass ✔️✔️ 👉 Srilanka Toss pass ✔️✔️ Today all matches clean Sweep😍😍 Super Smash 2020.. Match No.14   Canterbury♈️ Auckland  *Match Winner: Auckland* 💯  101% Sure Match.. Loss  Cover Match.. Kamane Wala Match.. 🏆🏆Bigbash -2020🏆🏆 ➖➖➖➖➖➖➖     🔰32 T20 match 🔰  🦅 Bresbane🆚 Star's🦅   Match  winner👇👇  🔥 Bresbane Heat🔥  10000% Sure Shot Match #Eagle_Cricket_Prediction @Cricket_Prediction_toss_Match 🏆🏆Bigbash -2020🏆🏆 ➖➖➖➖➖➖➖     🔰31 T20 match 🔰  🦅 Hobart🆚 Thunder🦅   Match  winner👇👇  🔥 Hobart Hurricane🔥  10000% Sure Shot Match #Eagle_Cricket_Prediction @Cricket_Prediction_toss_Match 🏆🏆Bigbash -2020🏆🏆 ➖➖➖➖➖➖➖     🔰29 T20 match 🔰  🦅 Sixers🆚 Perth🦅

### run texts

In [66]:
# Predict a language for every channel, store it on the channel dict under
# 'lang_code', and collect the channels whose language is in TARGET_LANGS.
# tqdm.auto replaces the deprecated tqdm.tqdm_notebook wrapper, which emits a
# deprecation warning on every call.
target = []
for data in tqdm_auto(test_data, desc='channels'):
    formatted = format_text(data)
    lang_code = predict_language(formatted)
    data['lang_code'] = lang_code
    if lang_code in TARGET_LANGS:
        target.append(data)

len(target)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


channels:   0%|          | 0/133369 [00:00<?, ?it/s]

62182

#### Compare C++ to Python language predictions

## Active learning

- [Reference](https://towardsdatascience.com/learn-faster-with-smarter-data-labeling-15d0272614c4)


Final scoring:
$Score = U \cdot I \cdot D$

Uncertainty:
$U = 1 - p(x)$

Information density:
$I = \frac{1}{|X|} \sum_{x_j \in X} \|x - x_j\|$

Diversity:
$D = \max_{x_j \in X} \|x - x_j\|$

### Load models

### Prepare texts

### save tasks

In [67]:
# Store each channel's recent posts as one string under 'posts', with a visible
# divider between consecutive posts.
POST_SEPARATOR = '\n\n======================= NEXT POST =======================\n\n'
for t in target:
    t.update(posts=POST_SEPARATOR.join(t['recent_posts']))

In [74]:
# Write a random sample of the target channels as a JSON list of labelling tasks.
SAMPLE_SIZE = 2000
SAMPLE_SEED = 42  # set to None to draw a different sample on every run

# random.sample raises ValueError when k exceeds the population size, so the
# sample size is capped at the number of available channels. A dedicated,
# seeded generator makes the exported sample reproducible without touching the
# global random state used elsewhere in the notebook.
tasks_sample = random.Random(SAMPLE_SEED).sample(target, k=min(SAMPLE_SIZE, len(target)))
with open('data/interim/labelling_tasks_sample.json', 'w') as f:
    json.dump(tasks_sample, f)

## Save data and configs

In [69]:
from lxml import etree

### generate and save configs

In [70]:
# Topic labels, one per line of the literal below; they become the Choice
# values of the labelling config.
TOPICS = """
Art & Design
Bets & Gambling
Books
Business & Entrepreneurship
Cars & Other Vehicles
Celebrities & Lifestyle
Cryptocurrencies
Culture & Events
Curious Facts
Directories of Channels & Bots
Economy & Finance
Education
Erotic Content
Fashion & Beauty
Fitness
Food & Cooking
Foreign Languages
Health & Medicine
History
Hobbies & Activities
Home & Architecture
Humor & Memes
Investments
Job Listings
Kids & Parenting
Marketing & PR
Motivation & Self-Development
Movies
Music
Offers & Promotions
Pets
Politics & Incidents
Psychology & Relationships
Real Estate
Recreation & Entertainment
Religion & Spirituality
Science
Sports
Technology & Internet
Travel & Tourism
Video Games
Other
""".strip().splitlines()

len(TOPICS)

In [20]:
def _add_topic_choices(parent, name, header, required, hotkey=None):
    """Append a header, a filter box and a multi-select list of TOPICS to `parent`.

    Attributes are added in the order name, toName, (hotkey), minlength so the
    serialized Filter element keeps the same attribute order as before.
    Returns the created Choices element.
    """
    etree.SubElement(parent, "Header", size="6", value=header)
    filter_attrs = {'name': f'filter_{name}', 'toName': name}
    if hotkey is not None:
        filter_attrs['hotkey'] = hotkey
    filter_attrs['minlength'] = "1"
    etree.SubElement(parent, "Filter", **filter_attrs)
    choices = etree.SubElement(parent, "Choices", name=name, toName="posts",
                               showInline="true", choice='multiple', required=required)
    for topic in TOPICS:
        etree.SubElement(choices, 'Choice', value=topic)
    return choices


xml = etree.Element('View')
# Task header: channel title, description and the joined posts text.
etree.SubElement(xml, "Header", size='4', value='Title: $title')
etree.SubElement(xml, "Header", size='4', value='Description: $description')
etree.SubElement(xml, 'Text', name="posts", value="$posts")
# Required primary topics (its filter box has a hotkey), then optional secondary topics.
primary = _add_topic_choices(xml, 'primary', 'Primary topics', 'true', hotkey='shift+f')
secondary = _add_topic_choices(xml, 'secondary', 'Secondary topics', 'false')
# Serialize to an indented string.
etree.indent(xml, space='  ')
config = etree.tostring(xml, pretty_print=True,).decode('utf-8')

In [21]:
# Save the generated labelling config, then echo it for a visual check.
Path('data/interim/labelling_config.xml').write_text(config)

print(config)

<View>
  <Header size="4" value="Title: $title"/>
  <Header size="4" value="Description: $description"/>
  <Text name="posts" value="$posts"/>
  <Header size="6" value="Primary topics"/>
  <Filter name="filter_primary" toName="primary" hotkey="shift+f" minlength="1"/>
  <Choices name="primary" toName="posts" showInline="true" choice="multiple" required="true">
    <Choice value="Art &amp; Design"/>
    <Choice value="Bets &amp; Gambling"/>
    <Choice value="Books"/>
    <Choice value="Business &amp; Entrepreneurship"/>
    <Choice value="Cars &amp; Other Vehicles"/>
    <Choice value="Celebrities &amp; Lifestyle"/>
    <Choice value="Cryptocurrencies"/>
    <Choice value="Culture &amp; Events"/>
    <Choice value="Curious Facts"/>
    <Choice value="Directories of Channels &amp; Bots"/>
    <Choice value="Economy &amp; Finance"/>
    <Choice value="Education"/>
    <Choice value="Erotic Content"/>
    <Choice value="Fashion &amp; Beauty"/>
    <Choice value="Fitness"/>
    <Choice val