# Prepare test data for annotation

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#standard libs
import os, sys
from pathlib import Path
from pprint import pprint
import random
import json
import itertools
from datetime import datetime as dt
# ds libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook
# custom path: switch the working directory to the parent folder; the relative
# 'data/...' paths and the `src` import used below are resolved from there.
# NOTE(review): presumably the parent folder is the project root — confirm.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir('..')

In [72]:
from collections import Counter
import re

In [2]:
from src.train.predict import prepare_texts, load_models, load_test_file, format_text, predict_language


In [35]:
def get_file_lang(fp):
    """Return the language code embedded in a test-file name.

    The code is the third dash-separated token of the file stem, e.g.
    'dc0415-input-ru.txt' -> 'ru' and 'dc0415-input-ar-translated.txt' -> 'ar'.

    Args:
        fp: path (str or Path) of the test file.

    Raises:
        IndexError: if the stem has fewer than three dash-separated tokens.
    """
    return Path(fp).stem.split('-')[2]

## New randomly selected tasks

In [36]:
# Test files to sample annotation tasks from. The language code is the third
# dash-separated token of each file stem (extracted by get_file_lang).
FILES = [
    'data/external/dc0415-input/original/dc0415-input-ru.txt',
    'data/external/dc0415-input/original/dc0415-input-en.txt',
    'data/external/dc0415-input/translated/dc0415-input-ar-translated.txt',
    'data/external/dc0415-input/translated/dc0415-input-fa-translated.txt',
    'data/external/dc0415-input/translated/dc0415-input-uz-translated.txt',
]

### Load

In [37]:
# Fixed seed so that a fresh "Restart & Run All" draws the same sample
# (and the same shuffles/choices further down).
SEED = 42
# Number of channels to draw from each language file.
SAMPLE_SIZE = 100

random.seed(SEED)

# language code -> list of randomly sampled rows from that language's file
test_data = {}
for f in FILES:
    rows = load_test_file(f, verbose=False)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of rows actually loaded.
    test_data[get_file_lang(f)] = random.sample(rows, min(SAMPLE_SIZE, len(rows)))

for lang, sampled in test_data.items():
    print(lang, len(sampled))

ru 100
en 100
ar 100
fa 100
uz 100


### Explore

In [42]:
# Pick a random language first, then one of the rows sampled for it.
l = random.choice(list(test_data))
d = random.choice(test_data[l])

print("Language " + l.upper())
pprint(d)

Language UZ
{'counters': {'audios': 129,
              'files': 92,
              'photos': 268,
              'posts': 3228,
              'videos': 71},
 'description': 'Assalomu alaykum ‚ùó Here you will have EngÔ∏è 1Ô∏è‚É£ Latest Tests '
                'Qu‚úÖ 2Ô∏è‚É£ Best Quizzesüí°‚úÖ 3Ô∏è‚É£ Most Necessary Guides üìë‚úÖ 4Ô∏è‚É£ '
                'Latest Information atsiya‚úÖ 5Ô∏è‚É£ Best Motivations üìàüóù‚úÖ Contact : '
                '@Faylasufginam Tel number: +998993433211',
 'recent_posts': [{'text': 'üé≤ ‚ÄúGRADE 11 UZBEK HISTORY CHANNEL: '
                           '@HISTORY_QUIZ_UZ Facts: @HISTORY_MEMORY Channel: '
                           '@GRANT_UZ Compiler: @ Jurist_19‚Äù test üñä 15 '
                           'questions ¬∑ ‚è± 30 seconds',
                   'type': 'text'},
                  {'text': 'üé≤ ‚ÄúüìäGRANT FOR HISTORIANS Channel @ History_Quiz_Uz '
                           'Compiler: @Dilqushim‚Äù test test 10 questions ¬∑ ‚è± '
          

### Convert to tasks

In [135]:
def update_dict(src_dict, dest_dict):
    """Return a new dict with ``src_dict``'s items overridden by ``dest_dict``'s.

    Neither argument is modified: the merge is done on a shallow copy of
    ``src_dict``, so records held in other containers keep their original
    content and cells calling this helper can be re-run safely.

    Args:
        src_dict: base mapping whose items are copied first.
        dest_dict: mapping whose items are added on top (wins on key clashes).

    Returns:
        A new ``dict`` containing the merged items.
    """
    merged = dict(src_dict)
    merged.update(dest_dict)
    return merged


def get_task_stats(tasks):
    """Print the number of tasks, then a tally of tasks per ``lang_code``.

    Args:
        tasks: sized iterable of dicts, each with a 'lang_code' key.
    """
    print('# tasks', len(tasks))
    lang_tally = Counter()
    for task in tasks:
        lang_tally[task['lang_code']] += 1
    pprint(lang_tally)

In [136]:
# Flatten the per-language samples into one task list, tagging every channel
# with the language code of the file it was sampled from.
tasks = [
    update_dict(channel, {'lang_code': lang_code})
    for lang_code, channel_list in test_data.items()
    for channel in channel_list
]



In [137]:
# Sanity check: total number of tasks and the count per language.
get_task_stats(tasks)

# tasks 500
Counter({'ru': 100, 'en': 100, 'ar': 100, 'fa': 100, 'uz': 100})


In [138]:
# Eyeball one randomly chosen task.
random.choice(tasks)

{'title': 'Tehran Times | Tehran timse',
 'description': 'üîªThe most up-to-date news of Iran and the world ÿ®ÿßÿ∑ Contact with reporters (regarding posts and content) @Ma_matrix',
 'subscribers': 78820,
 'counters': {'posts': 43545,
  'photos': 22951,
  'videos': 7894,
  'audios': 14,
  'files': 41},
 'recent_posts': [{'type': 'photo',
   'text': 'üåÄ The bell of Shams Al-Amara clock rang in Golestan Palace üîπ The old clock of Tehran Golestan Palace, which is mounted on the Shams Al-Amara tower, sounded again after many years. üîπ The sound of the bell of this old clock, which is the gift of Queen Victoria, Queen of England to Nasser al-Din Shah Qajar, wrapped up in the 12th district of Tehran this morning, Monday, April 13, 1400, and caused happiness. üîπ The reason for the failure of the bell was its loud ringing and disturbing the residents of Golestan Palace. At the order of Nasser al-Din Shah, they try to turn down the alarm, but completely destroy it and turn off the clock

### Save tasks

In [139]:
# Date stamp (YYYYMMDD) used in the names of the files written below.
today_str = dt.today().strftime('%Y%m%d')
print(today_str)

20210420


In [140]:
# Randomise the task order in place, then export the list as one JSON array.
random.shuffle(tasks)

tasks_path = Path(f'data/processed/labelling_tasks_{today_str}.json')
with tasks_path.open('w') as out_file:
    out_file.write(json.dumps(tasks))

## TG labelled data

Already-labelled data (input channels paired with their `tg_output` labels), exported for reference when labelling.

In [141]:
# Already-labelled reference data: for each language, an input file paired with
# its tg_output file. The language code is the suffix of the parent directory
# name ('category_en' -> 'en').
LABELLED_FILES = [
    {
        'input': 'data/external/submission1489/category_en/input.txt',
        'output': 'data/external/submission1489/category_en/tg_output.txt'
    },
    {
        'input': 'data/external/submission1489/category_ru/input.txt',
        'output': 'data/external/submission1489/category_ru/tg_output.txt'
    },
]

In [142]:
def get_lang(fp):
    """Return the language code taken from a file's parent directory name.

    The code is the second underscore-separated token of the parent
    directory's stem, e.g. '.../category_en/input.txt' -> 'en'.

    Args:
        fp: path (str or Path) of a file inside a 'category_<lang>' directory.

    Raises:
        IndexError: if the parent directory name contains no underscore.
    """
    return Path(fp).parent.stem.split('_')[1]

### Load files

In [143]:
# language code -> {'input': loaded input rows, 'output': loaded output rows}
labelled = {
    get_lang(pair['input']): {
        role: load_test_file(path) for role, path in pair.items()
    }
    for pair in LABELLED_FILES
}

Loaded 14775 rows
Loaded 14775 rows
Loaded 16726 rows
Loaded 16726 rows


In [144]:
# Show one random labelled example: the input row next to its output row.
l = random.choice(list(labelled.keys()))

# randrange excludes the upper bound. randint(0, n) may return n itself, which
# is one past the last valid index and would raise IndexError.
i = random.randrange(len(labelled[l]['input']))
inp = labelled[l]['input'][i]
out = labelled[l]['output'][i]

print('Lang', l.upper())
print('\nINPUT:')
pprint(inp)
print('\nOUTPUT:')
pprint(out)

Lang RU

INPUT:
{'description': '',
 'recent_posts': ['–ò–≥–æ—Ä—å –µ–±*–ª–æ —Ä–∞–∑–æ–±—å—ë—Ç –°–ê–ù–ï –ë–ò '
                  'https://www.youtube.com/watch?v=lgYoR2YTIUU',
                  '–ò–≥–æ—Ä—å –∏ –ú–∞—Ä–∏–Ω–∞ —Ä–∞—Å—Å—Ç–∞–Ω—É—Ç—Å—è ? '
                  'https://www.youtube.com/watch?v=efeurlPH1yA',
                  '–ú–µ–ª—Å—Ç—Ä–æ–π –ë–û–ò–¢–°–Ø –ê–•–ê–•–•–ê '
                  'https://www.youtube.com/watch?v=omVn4AMT6MU',
                  'MELLSTROY —É–¥–∞–ª–∏–ª –ò–ì–û–†–Æ –ö–ê–ù–ê–õ? '
                  'https://www.youtube.com/watch?v=f8bPe-8KPXM',
                  '–ò–ì–û–†–¨ –ü–û–°–ê–î–ò–¢ –ú–ï–õ–°–¢–†–û–Ø '
                  'https://www.youtube.com/watch?v=eeW_333eUC8',
                  '100 —Ä –ø—Ä–æ—Å—Ç–æ —Ç–∞–∫ ? –ó–∞–ª–µ—Ç–∞–π –ø–æ —Å—Å—ã–ª–∫–µ –Ω–∞ 1 win '
                  '‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏èhttps://1wiab.top/#j4bv   –†–µ–≥–∞–π—Å—è , –≤–≤–æ–¥–∏ –ø—Ä–æ–º–æ BASS '
                  '–ò –°–û–¢–ö–ê –£ –¢–ï–ë–Ø –í –ö–ê–†–ú–ê–ù–ï ( –≤—ã–≤–æ–¥–∏ —Å—Ä–∞–∑—É) \n'
 

### Format to dialogs

In [149]:
def format_input(inp):
    """Return a copy of ``inp`` whose posts are wrapped as dialog entries.

    Each item of ``inp['recent_posts']`` becomes
    ``{'text': <item>, 'author': 'xxx'}``. The input record is left untouched,
    so re-running a cell that calls this helper does not wrap the posts twice.

    Args:
        inp: dict with a 'recent_posts' list.

    Returns:
        A new dict with the same keys (in the same order) and the converted
        'recent_posts' list.

    Raises:
        KeyError: if ``inp`` has no 'recent_posts' key.
    """
    dialog_posts = [{'text': p, 'author': 'xxx'} for p in inp['recent_posts']]
    return {**inp, 'recent_posts': dialog_posts}

In [150]:
# Pair every labelled input row with its output row (same position in both
# files) and merge the two into a single reference task.
reference_tasks = [
    update_dict(format_input(channel), labels)
    for pair in labelled.values()
    for channel, labels in zip(pair['input'], pair['output'])
]

In [151]:
# Eyeball one randomly chosen reference task.
random.choice(reference_tasks)

{'title': '–ù–µ —Ñ–µ–π—Å–±—É–∫',
 'description': '–¥–ª—è —Ä–∞–¥–æ—Å—Ç–µ–π –∏ –Ω—ã—Ç—å—è, –∏–Ω—Å—Ç–∞–≥—Ä–∞–º https://www.instagram.com/farkashalina/?hl=ru',
 'recent_posts': [{'text': '–≤ –∫–∞–∫–æ–π-—Ç–æ –º–æ–º–µ–Ω—Ç —É –º–µ–Ω—è —Å–±–∏–ª—Å—è –ø—Ä–∏—Ü–µ–ª –ø–æ–ª–Ω–æ—Å—Ç—å—é. —è —Ç–∞–∫ –¥–æ–ª–≥–æ –¥—É–º–∞–ª–∞ –æ —Ä–µ–º–æ–Ω—Ç–µ, —á—Ç–æ –∫–∞—Ä—Ç–∏–Ω–∫–∞ –Ω–∞—á–∞–ª–∞ —Ä–∞—Å–ø–∞–¥–∞—Ç—å—Å—è –Ω–∞ –ø–∏–∫—Å–µ–ª–∏, —Å—Ç–∞–ª–æ –≤–æ–æ–±—â–µ –Ω–µ –ø–æ–Ω—è—Ç–Ω–æ, —á—Ç–æ —è —Ö–æ—á—É, –∫–∞–∫ –¥–æ–ª–∂–Ω–∞ –≤—ã–≥–ª—è–¥–µ—Ç—å –∫–≤–∞—Ä—Ç–∏—Ä–∞... –°–º–æ—Ç—Ä–µ–ª–∞ –∑–∞–Ω–æ–≤–æ —Ç—ã—Å—è—á–∏ –∫–∞—Ä—Ç–∏–Ω–æ–∫ ‚Äì –∏ –Ω–µ —Å–æ–±–∏—Ä–∞–ª–æ—Å—å —É –º–µ–Ω—è. –ü–æ–∫–∞ —è –Ω–µ –Ω–∞—à–ª–∞ –∏–¥–µ–∞–ª—å–Ω—ã–π –æ–±—Ä–∞–∑. –Ø –Ω–µ —à—É—á—É! –°—Ç–∏–ª–∏—Å—Ç–∏—á–µ—Å–∫–∏ –±—É–¥–µ—Ç –ø—Ä–∏–º–µ—Ä–Ω–æ —Ç–∞–∫, –Ω–æ –ø–æ—Å–≤–µ—Ç–ª–µ–µ )',
   'author': 'xxx'},
  {'text': '–ù—É, –∞ –≤–æ–æ–±—â–µ –ø—Ä–∏—Ö–æ–¥–∏—Ç–µ –∫ –≤ –∫–ª–∞–±—Ö–∞—É—Å –∫–æ –º–Ω–µ, –ø–æ–±–æ–ª—Ç–∞–µ–º. –¢–æ–ª—å–∫–æ –Ω–∞–¥–æ –≤—ã–±—Ä–∞—Ç—å ‚Äì —Å–µ–∫—Å –∏–ª–∏ —Ä–µ–º–

In [152]:
# Sanity check: total number of reference tasks and the count per language.
get_task_stats(reference_tasks)

# tasks 31501
Counter({'ru': 16726, 'en': 14775})


### Save 

In [153]:
# Randomise the reference-task order in place, then export as one JSON array.
random.shuffle(reference_tasks)

reference_path = Path(f'data/processed/labelled_tgdata_{today_str}.json')
with reference_path.open('w') as out_file:
    out_file.write(json.dumps(reference_tasks))

## Save data and configs

In [94]:
from lxml import etree

### Generate and save the labelling config

In [95]:
def clean_topic(orig):
    """Return the bare topic name from a raw topic line.

    The line is split on the separator pattern (one character of the class,
    e.g. '-', with a single space on each side); only the part before the first
    separator is kept, with surrounding whitespace trimmed. Dashes inside words
    are left alone because they have no spaces around them.
    """
    name, *_note = re.split(' [‚Äì-] ', orig)
    return name.strip()

In [96]:
# Raw topic list, one topic per line. Some lines carry an explanatory note
# after a space-delimited dash; clean_topic strips it in the next step.
TOPICS = """
    Art & Design
    Bets & Gambling ‚Äì includes sports bets
    Books
    Business & Entrepreneurship
    Cars & Other Vehicles
    Celebrities & Lifestyle
    Cryptocurrencies
    Culture & Events
    Curious Facts
    Directories of Channels & Bots
    Drug Sale
    Economy & Finance
    Education
    Erotic Content
    Fashion & Beauty
    Fitness
    Forgery ‚Äì includes fake documents, fake money, etc.
    Food & Cooking
    Foreign Language Learning
    Hacked Accounts & Software ‚Äì includes carding, passwords for subscription services, etc.
    Health & Medicine
    History
    Hobbies & Activities
    Home & Architecture
    Humor & Memes
    Investments
    Job Listings
    Kids & Parenting
    Marketing & PR
    Motivation & Self-development - includes inspirational quotes and poetry
    Movies
    Music
    Offers & Promotions ‚Äì includes products or services for sale, unless they fall under the newly added categories
    Personal Data ‚Äì includes doxxing, databases
    Pets
    Pirated Content ‚Äì films, music, books, but not software
    Politics & Incidents
    Prostitution
    Psychology & Relationships
    Real Estate
    Recreation & Entertainment
    Religion & Spirituality
    Science
    Spam & Fake Followers ‚Äì includes spam tools and services, boosting followers, likes, etc.
    Sports ‚Äì includes e-sports
    Technology & Internet
    Travel & Tourism
    Video Games
    Weapon Sale
    Other
""".strip().split('\n')

# Reduce every raw line to its bare topic name.
TOPICS = list(map(clean_topic, TOPICS))

len(TOPICS)

50

In [97]:
def _add_topic_section(view, name, title, required, hotkey=None):
    """Append one topic-selection section to ``view`` and return its Choices node.

    A section consists of a small header, a Filter box bound to the choices,
    and a multi-select Choices element holding one Choice per entry of TOPICS.

    Args:
        view: parent element the section is appended to.
        name: name of the Choices element; the filter is named 'filter_<name>'.
        title: text of the section header.
        required: 'true' or 'false', written to the Choices 'required' attribute.
        hotkey: optional hotkey for the filter box; omitted when None.
    """
    etree.SubElement(view, "Header", size="6", value=title)
    # Build the attributes in the order they should be serialised
    # (name, toName, [hotkey], minlength).
    filter_attrs = {'name': f'filter_{name}', 'toName': name}
    if hotkey is not None:
        filter_attrs['hotkey'] = hotkey
    filter_attrs['minlength'] = "1"
    etree.SubElement(view, "Filter", **filter_attrs)
    choices = etree.SubElement(view, "Choices", name=name, toName="posts",
                               showInline="true", choice='multiple', required=required)
    for topic in TOPICS:
        etree.SubElement(choices, 'Choice', value=topic)
    return choices


xml = etree.Element('View')
# headers: one per task field displayed above the posts
for field in ('$lang_code', '$title', '$description'):
    etree.SubElement(xml, "Header", size='4', value=field)
etree.SubElement(xml, 'Paragraphs', name="posts", value="$recent_posts", nameKey="type", textKey="text")
# primary (required) and secondary (optional) topic pickers share one layout
primary = _add_topic_section(xml, 'primary', "Primary topics", required='true', hotkey="shift+f")
secondary = _add_topic_section(xml, 'secondary', "Secondary topics", required='false')
# pretty string
etree.indent(xml, space='  ')
config = etree.tostring(xml, pretty_print=True).decode('utf-8')

In [98]:
# Persist the generated labelling config next to the task files and echo it.
config_path = Path(f'data/processed/labelling_config_{today_str}.xml')
with config_path.open('w') as out_file:
    out_file.write(config)

print(config)

<View>
  <Header size="4" value="$lang_code"/>
  <Header size="4" value="$title"/>
  <Header size="4" value="$description"/>
  <Paragraphs name="posts" value="$recent_posts" nameKey="type" textKey="text"/>
  <Header size="6" value="Primary topics"/>
  <Filter name="filter_primary" toName="primary" hotkey="shift+f" minlength="1"/>
  <Choices name="primary" toName="posts" showInline="true" choice="multiple" required="true">
    <Choice value="Art &amp; Design"/>
    <Choice value="Bets &amp; Gambling"/>
    <Choice value="Books"/>
    <Choice value="Business &amp; Entrepreneurship"/>
    <Choice value="Cars &amp; Other Vehicles"/>
    <Choice value="Celebrities &amp; Lifestyle"/>
    <Choice value="Cryptocurrencies"/>
    <Choice value="Culture &amp; Events"/>
    <Choice value="Curious Facts"/>
    <Choice value="Directories of Channels &amp; Bots"/>
    <Choice value="Drug Sale"/>
    <Choice value="Economy &amp; Finance"/>
    <Choice value="Education"/>
    <Choice value="Erotic Cont