# Decode 'language' feature

In [10]:
import sys
sys.path.append('..')

from utils.cuda_cluster import client
from utils.util import plot_graphs
import core.config as conf
from utils.dataset import read_data, factorize_small_cardinality

import numpy as np
from tqdm import tqdm
from datetime import datetime 
import matplotlib.pyplot as plt

import tensorflow as tf
from transformers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [8]:
# Display the Dask client imported from utils.cuda_cluster (scheduler/worker summary).
client

0,1
Client  Scheduler: tcp://127.0.0.1:39629  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.47 GB


In [90]:
# Seed the NumPy and TensorFlow global random generators with one shared value.
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [91]:
# Multilingual, case-preserving BERT tokenizer; downloaded files are cached in 'bert_ckpt'.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    cache_dir='bert_ckpt',
    do_lower_case=False,
)

Downloading: 100%|██████████| 996k/996k [00:01<00:00, 845kB/s] 
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 40.2kB/s]
Downloading: 100%|██████████| 1.96M/1.96M [00:01<00:00, 1.52MB/s]


## 1. Load data

In [249]:
# Load a single raw shard; set shard_pattern to '*' to read the entire dataset.
shard_pattern = 'part-00197'
data_path = conf.raw_data_path + shard_pattern
ori_df = read_data(data_path)

number of rows: 3034025


In [250]:
# Optional filter kept for reference (currently disabled):
# ori_df = ori_df[ori_df['tweet_type'] == 'TopLevel']

# Keep only the tokenized tweet text and the (hashed) language code.
feature_columns = ['text_ tokens', 'language']
df = ori_df[feature_columns]

In [251]:
# Preview the first rows of the two selected columns.
df.head()

Unnamed: 0_level_0,text_ tokens,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,101\t12489\t112\t187\t20442\t55768\t85377\t101...,488B32D24BD4BB44172EB981C1BCA6FA
2,101\t56898\t137\t32916\t11359\t11359\t131\t199...,E7F038DE3EAD397AEC9193686C911677
3,101\t1413\t18427\t111434\t46856\t20503\t58264\...,9FCF19233EAD65EA6E32C2E6DC03A444
4,101\t56658\t11127\t18137\t66014\t30340\t12396\...,488B32D24BD4BB44172EB981C1BCA6FA
5,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,313ECD3A1E5BB07406E4249475C2D6D6


In [254]:
# Constant helper column; it is counted per language in the groupby that follows.
df['cnt'] = 1

## Get unique languages

In [270]:
# Materialise the lazy frame once and reuse it for both aggregations;
# calling df.compute() separately for each aggregation evaluates the whole
# frame twice.
df_local = df.compute()

# Number of tweets per language.
languages_cnt = df_local.groupby('language')[['cnt']].count().reset_index()
# One example tweet per language (the last row of each group), decoded later on.
languages_tokens = df_local.groupby('language')[['text_ tokens']].nth(-1).reset_index()
# One row per language: example tokens plus tweet count.
languages = languages_tokens.merge(languages_cnt, on=['language'], how='left')

# Release the materialised copy; only the small per-language table is needed from here on.
del df_local


In [273]:
# Rank languages by tweet count (most frequent first) and use the rank as the index.
languages = languages.sort_values('cnt', ascending=False)
# Derive the id range from the frame itself instead of a hard-coded count:
# a fixed length only works when the loaded data contains exactly that many
# languages and raises a length-mismatch error otherwise.
languages['id'] = range(len(languages))
languages = languages.set_index('id')

In [274]:
# Inspect the per-language table: example tokens and tweet counts, sorted by count descending.
languages

Unnamed: 0_level_0,language,text_ tokens,cnt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,488B32D24BD4BB44172EB981C1BCA6FA,101\t18740\t11003\t98514\t10529\t12229\t18103\...,1096401
1,E7F038DE3EAD397AEC9193686C911677,101\t1972\t18825\t3425\t100\t14120\t131\t120\t...,520269
2,B0FA488F2911701DD8EC5B1EA5E322D8,101\t77603\t10165\t85912\t10125\t11639\t75366\...,253290
3,B8B04128918BBF54E2E178BFF1ABA833,101\t100\t108\t16062\t11281\t10115\t11274\t102...,242549
4,313ECD3A1E5BB07406E4249475C2D6D6,101\t77603\t10133\t136\t14120\t131\t120\t120\t...,194984
...,...,...,...
61,CDE47D81F953D800F760F1DE8AA754BA,101\t100\t14120\t131\t120\t120\t188\t119\t1117...,28
62,9D831A0F3603A54732CCBDBF291D17B7,101\t100\t14120\t131\t120\t120\t188\t119\t1117...,20
63,5F152815982885A996841493F2757D91,101\t100\t216\t137\t86545\t11090\t168\t12464\t102,7
64,82C9890E4A7FC1F8730A3443C761143E,101\t20384\t762\t58753\t92857\t11884\t100\t100...,1


In [275]:
# Split each example's tab-separated token ids into a list of id strings.
tokens = [token_str.split('\t') for token_str in languages['text_ tokens'].to_array()]

# Attach a sequential id to every language row.
n_languages = len(languages)
languages['language_id'] = range(n_languages)

# Turn every example back into readable text, dropping special tokens.
decoded_tokens = [
    tokenizer.decode(tokens[row], skip_special_tokens=True)
    for row in range(n_languages)
]

In [276]:
# Store the decoded example text next to each language row.
languages['decoded_tweet_tokens'] = decoded_tokens

In [277]:
# Inspect the table again, now including language_id and the decoded example tweet.
languages

Unnamed: 0_level_0,language,text_ tokens,cnt,language_id,decoded_tweet_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,488B32D24BD4BB44172EB981C1BCA6FA,101\t18740\t11003\t98514\t10529\t12229\t18103\...,1096401,0,Only three managers have presided over 9 - 0 w...
1,E7F038DE3EAD397AEC9193686C911677,101\t1972\t18825\t3425\t100\t14120\t131\t120\t...,520269,1,よく 寝 https : / / t. co / SrGk5Z8naN
2,B0FA488F2911701DD8EC5B1EA5E322D8,101\t77603\t10165\t85912\t10125\t11639\t75366\...,253290,2,"Ayer terminé el rewatch de gilmore girls, por ..."
3,B8B04128918BBF54E2E178BFF1ABA833,101\t100\t108\t16062\t11281\t10115\t11274\t102...,242549,3,# AliceInBorderland atingiu mais de 18 milhões...
4,313ECD3A1E5BB07406E4249475C2D6D6,101\t77603\t10133\t136\t14120\t131\t120\t120\t...,194984,4,Ayo? https : / / t. co / fxlP7Ukhsc
...,...,...,...,...,...
61,CDE47D81F953D800F760F1DE8AA754BA,101\t100\t14120\t131\t120\t120\t188\t119\t1117...,28,61,https : / / t. co / gZnIpoj76j
62,9D831A0F3603A54732CCBDBF291D17B7,101\t100\t14120\t131\t120\t120\t188\t119\t1117...,20,62,https : / / t. co / 2zgL6xUiIS
63,5F152815982885A996841493F2757D91,101\t100\t216\t137\t86545\t11090\t168\t12464\t102,7,63,¶ @ TXT _ members
64,82C9890E4A7FC1F8730A3443C761143E,101\t20384\t762\t58753\t92857\t11884\t100\t100...,1,64,بۇ ئاياللار ، بىز چوقۇم بۇ يولىدا ، ئۇيغۇر ،. ...


In [284]:
# Print the decoded example tweets for language ids 10-19, each followed by a blank line.
for lang_idx in range(10, 20):
    print(lang_idx, decoded_tokens[lang_idx], end='\n\n')

10 INI GUE MAU UP AU GAJADI KAYAKNYA

11 RT @ PankajPuniaINC : खबरदार जो किसी विदेशी ने हमारे राजा जी की तारीफ की तो! ये हमारा आंतरिक मामला है!

12 ну или « чувак, которому кот откусил жопу »

13 Parang naiiyak c Nate?

14 RT @ catk8o : prima dei hyyh yoonkook i gay non esistevano

15 خب خوشگلا جزء مورد علاقهتون در قرآن کدومه ؟

16 RT @ FKrursel : Guter Ansatz um # Pflege mehr Kompetenzen zu geben.... Onkologische Pflegevisite auch ein Mehrwert für Patienten und deren Zug

17 RT @ polvroidx : pamiętajcie, że REWOLUCJA JEST KOBIETĄ # StrajkKobiet https : / / t. co / 6Ka14USPz9

18 کشمیرکانفرنس میں مقبوضہ کشمیرکے مستقل اورپرامن حل کیلئےتبادلہ خیال ہوا ۔ مسئلہ کشمیرکو حل کیے بغیرجنوبی ایشیاءمیں امن کاقیام ناممکن ہے ۔ مسئلہ کشمیرکا فوری حل حکومت کی اولین ترجیح ہے ۔ اقوام متحدہ اورعالمی برادری سےاپیل ہےکہ وادی کشمیر میں جاری انسانیت سوزمظالم کو رکوانےمیں کرداراداکرے https : / / t. co / nIft6F4mCS

19 Joël tweette zonet zijn bitcoin sleutel. https : / / t. co / PfddaLfyhX



In [246]:
# Hand-assigned language names (written in Korean), one per language_id.
# The trailing comment on each entry gives the index and an English translation.
# '' and '?' entries appear to be languages that were not identified — TODO confirm.
# NOTE(review): this list has 66 entries (0-65) while the notebook prints
# n_languages == 65, and some labels do not match the decoded samples printed
# earlier in the notebook (e.g. sample 16 is German text but is labelled
# English here). The labels were presumably written against a different
# run/ordering — verify before using them as a mapping.
language_type = [
    '영어', # 0: English
    '일본어', # 1: Japanese
    '스페인어', # 2: Spanish
    '포루투갈어', # 3: Portuguese
    '', # 4: (unlabelled)
    '터키어', # 5: Turkish
    '태국어', # 6: Thai
    '아랍어', # 7: Arabic
    '한국어', # 8: Korean
    '프랑스어', # 9: French
    '인도네시아어', # 10: Indonesian
    '힌디어', # 11: Hindi
    '러시아어', # 12: Russian
    '타갈로그어', # 13: Tagalog
    '이탈리아어', # 14: Italian
    '', # 15: (unlabelled)
    '영어', # 16: English
    '말라얄람어', # 17: Malayalam
    '우크라이나어', # 18: Ukrainian
    '?', # 19: (unknown)
    '히브리어', # 20: Hebrew
    '덴마크어', # 21: Danish
    '?', # 22: (unknown)
    '?', # 23: (unknown)
    '?', # 24: (unknown)
    '페르시아어', # 25: Persian
    '조지아어', # 26: Georgian
    '벵골어', # 27: Bengali
    '구자라트어', # 28: Gujarati
    '말라얄람어', # 29: Malayalam
    '힌디어', # 30: Hindi
    '네덜란드어', # 31: Dutch
    '?', # 32: (unknown)
    '말레이어', # 33: Malay
    '?페르시아어', # 34: Persian (marked uncertain with '?')
    '타밀어', # 35: Tamil
    '한국어', # 36: Korean
    '?', # 37: (unknown)
    '불가리아어', # 38: Bulgarian
    '?', # 39: (unknown)
    '아랍어', # 40: Arabic
    '?', # 41: (unknown)
    '태국어', # 42: Thai
    '칸나다어', # 43: Kannada
    '중국어', # 44: Chinese
    '스페인어', # 45: Spanish
    '힌디어', # 46: Hindi
    '포루투갈어', # 47: Portuguese
    '아프리칸스어', # 48: Afrikaans
    '세르비아어', # 49: Serbian
    '힌디어', # 50: Hindi
    '펀잡어', # 51: Punjabi
    '?', # 52: (unknown)
    '슬로베니아어', # 53: Slovenian
    '텔루구어', # 54: Telugu
    '스페인어', # 55: Spanish
    '폴란드어', # 56: Polish
    '신디어', # 57: Sindhi
    '베트남어', # 58: Vietnamese
    '프랑스어 ', # 59: French (string has a trailing space)
    '중국어', # 60: Chinese
    '스웨덴어', # 61: Swedish
    '?', # 62: (unknown)
    '?', # 63: (unknown)
    '파슈토어', # 64: Pashto
    '', # 65: (unlabelled)
    


]

In [247]:
# Number of rows in the per-language table, i.e. distinct languages in the loaded data.
n_languages

65