# Statistical approach to language detection

In [143]:
import re
import sys
import json
import copy
import unicodedata
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

In [144]:
df = pd.read_csv("data/sentences_10k_balanced.csv", delimiter=",", encoding='utf8', index_col=0)
df.sample(10)

Unnamed: 0,id,lan_code,sentence
6910919,7301935,spa,Tom sale a caminar todos los días.
6487226,6872642,tok,jan li kama lukin e mun luka tu lon poka pi su...
6665805,7054378,lat,"Qui gratum dat ave, responsum fertque suave."
3280966,3494004,ind,Tom hampir dipecat dari pekerjaan.
8609110,9042616,nld,De dokter zal hem onderzoeken.
2946051,3122632,ita,Questo piatto si sposa molto bene con il sake.
8135194,8554319,por,Não consigo entender muitas de suas ações.
3439847,3662405,hin,मैं यूनान से हूँ।
3784454,4029465,tur,Benim zamanımda gerçek oyuncular vardı.
7596749,8004586,bel,Ён у цябе на стале.


In [145]:
print(f"Number of rows: {len(df.index)}")

Number of rows: 2023529


In [146]:
tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
tbl[19968] = None

chinese_punctuation = "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\uff01]"

def remove_punctuation(text):
    text = text.strip().lower().translate(tbl)
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", "", text)
    return re.sub(chinese_punctuation, "", text)

In [147]:
df['sentence'] = df['sentence'].apply(remove_punctuation)

In [148]:
df.sample(10)

Unnamed: 0,id,lan_code,sentence
717324,751593,pol,wniósł kilka poprawek
7834972,8248641,ces,včera jsem nevečeřel
1249835,1320266,tlh,rav lam yilamhachohmoh
6911739,7302770,rus,я не храню твоих фотографий
4426361,4717937,jpn,ごめんって言ったじゃん
10222187,10673670,dan,en rose er en smuk blomst
8263830,8685755,hin,उसका वज़न बढ़ रहा है
2123970,2251775,dan,det er helt forståeligt
696030,729159,por,a rainha elizabeth faleceu no ano de
7388558,7791169,ukr,я доволі прогресивний


In [149]:
counts = {}

for lang in df["lan_code"].unique():
    counts[lang] = defaultdict(int)

In [150]:
# long runtime

for row in df.itertuples(index=False):
    language = row[1]
    sentence = row[2]
    
    for letter in sentence:
        if letter == " ":
            continue
        
        counts[language][letter] += 1
    
    for idx in range(0, len(sentence) - 1):
        letters = sentence[idx:idx+2]
        
        if " " in letters:
            continue
        
        counts[language][letters] += 1
    
    for idx in range(0, len(sentence) - 2):
        letters = sentence[idx:idx+3]
        
        if " " in letters:
            continue
        
        counts[language][letters] += 1

In [151]:
print(len(counts))

48


In [152]:
df.head(10)

Unnamed: 0,id,lan_code,sentence
0,1,cmn,我們試試看
1,2,cmn,我该去睡觉了
2,3,cmn,你在干什麼啊
3,4,cmn,這是什麼啊
4,5,cmn,今天是６月１８号也是muiriel的生日
5,6,cmn,生日快乐muiriel
6,7,cmn,muiriel现在20岁了
7,8,cmn,密码是muiriel
8,9,cmn,我很快就會回來
9,10,cmn,我不知道


In [162]:
with open('data/counts_lang_wise_expanded_10k.json', 'w') as file:
    json.dump(counts, file)

In [154]:
# with open('data/counts_lang_wise_expanded.json', 'r') as file:
#     counts = json.load(file)

In [155]:
def counts2probability(counts: dict):
    probabilities = {}
    
    for lang, lang_counts in counts.items():
        total_lang_count = np.sum([lang_count for lang_count in lang_counts.values()])
        one_sym_count = np.sum([1 for symbol in lang_counts.keys() if len(symbol) == 1])
        sym_conut_compensation = [1, one_sym_count, one_sym_count**2]
        
        one_sym_lang_count = np.sum([lang_count for symbol, lang_count in lang_counts.items() if len(symbol) == 1])
        two_sym_lang_count = np.sum([lang_count for symbol, lang_count in lang_counts.items() if len(symbol) == 2])
        three_sym_lang_count = np.sum([lang_count for symbol, lang_count in lang_counts.items() if len(symbol) == 3])
        sym_len_compensation = [one_sym_lang_count, two_sym_lang_count, three_sym_lang_count]
        
        for symbol_key, symbol_count in lang_counts.items():
            if symbol_key not in probabilities.keys():
                probabilities[symbol_key] = {}
            
            probabilities[symbol_key][lang] = symbol_count / sym_len_compensation[len(symbol_key) - 1]

    for symbol_key, symbol_count in probabilities.items():
        total_sym_count = np.sum([sym_count for sym_count in symbol_count.values()])
        
        for lang, lang_counts in symbol_count.items():
            probabilities[symbol_key][lang] = lang_counts / total_sym_count
            
    return probabilities            

In [156]:
probabilities = counts2probability(counts)

In [157]:
print(len(probabilities))

846463


In [158]:
with open('data/probabilities_expanded_10k_second_method.json', 'w') as file:
    json.dump(probabilities, file)

In [159]:
def detect_language_statistically(probabilities: dict, sentence: str):
    sentence = remove_punctuation(sentence)
    
    symbols = list()
    
    for letter in sentence[1:]:
        if letter != " ":
            symbols.append(letter)
        
    for idx in range(0, len(sentence) - 1):
        if " " not in sentence[idx:idx+2]:
            symbols.append(sentence[idx:idx+2])
            
    for idx in range(0, len(sentence) - 2):
        if " " not in sentence[idx:idx+3]:
            symbols.append(sentence[idx:idx+3])
    
    lang_probability = probabilities[sentence[0]]
        
    for symbol in tqdm(symbols, desc="Calculating language"):
        lang_probability_temp = copy.deepcopy(lang_probability)
        
        for lang_key in lang_probability.keys():
            if lang_key not in probabilities[symbol].keys():
                lang_probability_temp.pop(lang_key, None)
        
        lang_probability = copy.deepcopy(lang_probability_temp)
        del lang_probability_temp
        
        for lang, probability in probabilities[symbol].items():
            if lang not in lang_probability.keys():
                continue
            
            lang_probability[lang] = lang_probability[lang] * probability
        
    prob_sum = np.sum([value for value in lang_probability.values()])
    
    for key, value in lang_probability.items():
        lang_probability[key] = value/prob_sum
        
    return lang_probability
        

In [160]:
with open("data/lan_to_language.json", "r") as json_file:
    lan2lang = json.load(json_file)

### Testing

In [169]:
sentence = "siemka"

prob_lang = detect_language_statistically(probabilities, sentence)

print(f"Detected language for '{sentence}': {lan2lang[max(prob_lang, key=prob_lang.get)]}\n")

print("All languages' probabilities:")

for lan, probability in dict(sorted(prob_lang.items(), key=lambda item: item[1], reverse=True)).items():
    print(f"{lan2lang[lan]}: {probability}")

Calculating language: 100%|██████████| 14/14 [00:00<00:00, 4665.89it/s]

Detected language for 'siemka': Polish

All languages' probabilities:
Polish: 0.9999991360791213
Esperanto: 4.992771164215207e-07
Czech: 2.3273961538653609e-07
Danish: 8.151018084378243e-08
Hungarian: 3.6908542725947506e-08
Swedish: 1.3485423385814715e-08
Mandarin Chinese: 9.685812208056373e-24





### Benchmark

In [2]:
# TODO...