# Statistical approach to language detection

In [1]:
import re
import sys
import json
import copy
import unicodedata
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

In [2]:
# Load the sentence corpus; the first CSV column becomes the row index.
df = pd.read_csv(
    "data/sentences_10k_balanced.csv",
    sep=",",
    encoding="utf8",
    index_col=0,
)
# Show a random sample of rows as a quick sanity check of the loaded data.
df.sample(n=10)

Unnamed: 0,id,lan_code,sentence
7996272,8413615,deu,Der Klimawandel begünstigt die Neozoenausbreit...
3817542,4065402,hun,Ezt a filmet már legalább háromszor láttam.
3882481,4133904,mkd,Цел ден ми помина во чистење снег.
4857951,5183605,jbo,lo zdani be la tom cu se pagbu su'o zabna purdi
156913,162836,jpn,私の父は国内線のパイロットです。
6301864,6683826,ita,Tom probabilmente non è ancora affamato.
3390375,3610161,spa,"El anciano probó el bálsamo con su lengua, y l..."
9894332,10343976,tlh,jatlhtaHvIS QIch wab Ho'DoS Sar nov qolbe'.
334155,349660,cmn,他在房子的周围看了看。
9497032,9943078,deu,Warum ist Tom nur so klug!


In [3]:
# Report how many rows the loaded corpus contains.
print(f"Number of rows: {len(df.index)}")

Number of rows: 1788802


In [4]:
# Translation table for str.translate(): every code point whose Unicode
# general category starts with "P" (punctuation) is mapped to None, i.e. deleted.
tbl = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith('P')
)
# Additionally delete code point 19968 (U+4E00).
# NOTE(review): the reason for dropping this single character is not visible
# in this notebook -- confirm it is intentional.
tbl[19968] = None

# Character class of full-width / CJK punctuation marks. These look like they
# are already covered by `tbl`; the extra pass is kept as a safeguard.
chinese_punctuation = "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\uff01]"

# A run of digits forming a whole whitespace-delimited token, together with the
# whitespace in front of it. The whitespace *after* the number is only looked
# at (not consumed), so the neighbouring words stay separated and several
# numbers in a row are all matched.
_standalone_number_re = re.compile(r"(?:^|\s)\d+(?=\s|$)")


def remove_punctuation(text):
    """Normalize a sentence for n-gram counting.

    The text is lower-cased, every character listed in `tbl` is deleted,
    stand-alone numbers (digit-only tokens) are dropped, the characters in
    `chinese_punctuation` are deleted, and surrounding whitespace is trimmed.
    Digits embedded in a longer token (e.g. "abc123") are kept.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence. It never starts or ends with whitespace, so
        callers may safely use its first character as an n-gram.
    """
    text = text.lower().translate(tbl)
    text = _standalone_number_re.sub("", text)
    text = re.sub(chinese_punctuation, "", text)
    # Deleting punctuation or a leading number can expose whitespace at the
    # edges, so trim once at the very end.
    return text.strip()

In [5]:
# Clean every sentence with remove_punctuation before counting n-grams.
# Note: this overwrites the raw `sentence` column in place.
df['sentence'] = df['sentence'].apply(remove_punctuation)

In [6]:
# Spot-check a random sample of the cleaned sentences.
df.sample(10)

Unnamed: 0,id,lan_code,sentence
3544481,3771771,tur,keşke daha iyi şartlar altında görüşebilseydik
635381,665262,srp,on uvek nosi tamnu odeću
10313927,10766530,pes,او صفت‌های خوب متعددی دارد
5056846,5398759,tat,яман үрнәк йогышлы
4719442,5032142,lat,lingua arabica maximi momenti est
3506273,3731612,eng,where are those prisoners
3721503,3961683,ita,continuò a leggere il libro come se non fosse ...
9114359,9556722,kab,ur teɛjil ara ɣer tamuqra
365896,382326,isl,hann ræktaði tómataplöntur úr fræjum
4820363,5142378,rus,у тома есть выбор


In [7]:
# One n-gram counter per language code; unseen n-grams implicitly start at 0.
counts = {lang_code: defaultdict(int) for lang_code in df["lan_code"].unique()}

In [8]:
# long runtime

# Count every 1-, 2- and 3-character window of each sentence under the
# sentence's language. Windows are visited in the order: all single
# characters, then all pairs, then all triples.
for row in df.itertuples(index=False):
    language, sentence = row[1], row[2]

    for width in (1, 2, 3):
        for start in range(len(sentence) - width + 1):
            ngram = sentence[start:start + width]

            # A window that touches a space spans a word boundary; skip it.
            if " " not in ngram:
                counts[language][ngram] += 1

In [11]:
# Number of languages that received an n-gram counter.
# NOTE(review): this cell's execution count (11) is out of sequence with its
# neighbours (8 and 9) -- re-run the notebook top to bottom before sharing.
print(len(counts))

48


In [9]:
# Look at the first rows of the cleaned frame.
df.head(10)

Unnamed: 0,id,lan_code,sentence
0,1,cmn,我們試試看
1,2,cmn,我该去睡觉了
2,3,cmn,你在干什麼啊
4,5,cmn,今天是６月１８号也是muiriel的生日
6,7,cmn,muiriel现在20岁了
7,8,cmn,密码是muiriel
9,10,cmn,我不知道
10,11,cmn,我不知道應該說什麼才好
11,12,cmn,這個永遠完不了了
12,13,cmn,我只是不知道應該說什麼而已


In [10]:
# Persist the raw per-language n-gram counts as JSON.
with open('data/counts_lang_wise_expanded.json', 'w') as file:
    json.dump(counts, file)

In [15]:
def counts2probability(counts: dict):
    """Convert per-language n-gram counts into per-symbol language probabilities.

    Two normalizations are applied:

    1. Within each language, every n-gram count is divided by the total count
       of all n-grams *of the same length* in that language. This gives the
       relative frequency of the n-gram among its same-length peers.
    2. For each n-gram, those relative frequencies are rescaled across
       languages so that they sum to 1.

    N-grams of any length are supported. Entries with a non-positive count
    carry no evidence and are skipped. The input mapping is not modified.

    Args:
        counts: Mapping ``language -> {ngram: count}``.

    Returns:
        Mapping ``ngram -> {language: probability}`` where, for every n-gram,
        the probabilities over the languages that contain it sum to 1.
    """
    probabilities = {}

    for lang, lang_counts in counts.items():
        # Total observations per n-gram length in this language, gathered in a
        # single pass so that any length works, not only 1 to 3.
        totals_by_length = {}
        for symbol_key, symbol_count in lang_counts.items():
            if symbol_count > 0:
                length = len(symbol_key)
                totals_by_length[length] = totals_by_length.get(length, 0) + symbol_count

        for symbol_key, symbol_count in lang_counts.items():
            if symbol_count <= 0:
                continue

            relative_frequency = symbol_count / totals_by_length[len(symbol_key)]
            probabilities.setdefault(symbol_key, {})[lang] = relative_frequency

    # Rescale each n-gram's per-language frequencies to sum to 1. Only values
    # of existing keys are reassigned, so mutating while iterating is safe.
    for lang_shares in probabilities.values():
        total_sym_share = sum(lang_shares.values())

        for lang in lang_shares:
            lang_shares[lang] = lang_shares[lang] / total_sym_share

    return probabilities

In [16]:
# Turn the raw per-language counts into per-symbol language probabilities.
probabilities = counts2probability(counts)

In [17]:
# Number of distinct n-grams (symbols) seen across all languages.
print(len(probabilities))

759319


In [18]:
# Persist the per-symbol language probabilities as JSON.
with open('data/probabilities_expanded.json', 'w') as file:
    json.dump(probabilities, file)

In [41]:
def detect_language_statistically(probabilities: dict, sentence: str):
    """Score candidate languages for a sentence from n-gram probabilities.

    The sentence is cleaned with ``remove_punctuation`` and split into every
    1-, 2- and 3-character window that does not contain a space. The candidate
    set starts as the languages of the first known n-gram. Each further known
    n-gram removes the languages that do not contain it and multiplies the
    scores of the remaining ones by that n-gram's probability. The final
    scores are rescaled to sum to 1.

    Scores are accumulated as sums of logarithms, so long sentences do not
    underflow to zero. N-grams missing from ``probabilities`` carry no
    evidence and are skipped. ``probabilities`` is never modified.

    Args:
        probabilities: Mapping ``ngram -> {language: probability}``.
        sentence: The raw text to classify.

    Returns:
        Mapping ``language -> normalized score`` for the languages that
        contain every known n-gram of the sentence. The mapping is empty when
        the cleaned sentence has no known n-gram or no language survives.
    """
    sentence = remove_punctuation(sentence)

    # All windows of width 1, 2 and 3 that stay inside a single word.
    symbols = [
        sentence[start:start + width]
        for width in (1, 2, 3)
        for start in range(len(sentence) - width + 1)
        if " " not in sentence[start:start + width]
    ]

    log_scores = None  # None until the first known n-gram has been seen

    for symbol in tqdm(symbols, desc="Calculating language"):
        symbol_probs = probabilities.get(symbol)
        if not symbol_probs:
            # Unseen n-gram: no evidence for or against any language.
            continue

        if log_scores is None:
            previous = dict.fromkeys(symbol_probs, 0.0)
        else:
            previous = log_scores

        # Keep only languages that contain this n-gram and add its log-probability.
        # A fresh dict is built each time, so `probabilities` is left untouched.
        log_scores = {
            lang: score + float(np.log(symbol_probs[lang]))
            for lang, score in previous.items()
            if symbol_probs.get(lang, 0.0) > 0.0
        }

    if not log_scores:
        return {}

    # Subtract the best score before exponentiating so that the largest weight
    # is exactly 1 and the normalizing sum can never be zero.
    best_score = max(log_scores.values())
    weights = {
        lang: float(np.exp(score - best_score))
        for lang, score in log_scores.items()
    }
    weight_sum = sum(weights.values())

    return {lang: weight / weight_sum for lang, weight in weights.items()}
        

In [42]:
# Load the lookup table used below to turn a language code into the name
# that is printed for the detected language.
with open("data/lan_to_language.json", "r") as json_file:
    lan2lang = json.load(json_file)

In [48]:
# Demo: classify a single word and show the winning language plus all scores.
# NOTE(review): the recorded output below labels "ragazzo" (an Italian word) as
# Hungarian, ahead of Italian -- the scoring deserves a closer look.
sentence = "ragazzo"

prob_lang = detect_language_statistically(probabilities, sentence)

# The language with the highest score wins; scores are printed in descending order.
print(f"Detected language for '{sentence}': {lan2lang[max(prob_lang, key=prob_lang.get)]}")
print(f"All languages' probabilities: {dict(sorted(prob_lang.items(), key=lambda item: item[1], reverse=True))}")

Calculating language: 100%|██████████| 17/17 [00:00<00:00, 8502.64it/s]

Detected language for 'ragazzo': Hungarian
All languages' probabilities: {'hun': 0.8178772781185304, 'ita': 0.18212272188143572, 'swe': 3.037131505662526e-14, 'ina': 3.726890418765009e-15, 'ces': 1.0632877669017078e-17}



