In [2]:
import re
import sys
import json
import string
import random
import unicodedata
import numpy as np
import pandas as pd
from collections import defaultdict

In [3]:
# Load the sentence corpus; the first CSV column is used as the DataFrame index.
df = pd.read_csv(
    "data/sentences_50k.csv",
    sep=",",
    encoding="utf8",
    index_col=0,
)
df.sample(10)

Unnamed: 0,id,lan_code,sentence
1806871,1911676,fra,Elle a l'air d'un garçon.
6241602,6622819,deu,"„Vati, Tom sagte, dass der Mensch vom Affen ab..."
4128622,4395888,mkd,Дали гледа накај мене Том?
9152798,9595621,kab,Zzenz meqqar imsismeḍ-nni.
4968159,5303231,deu,Tom wäre beinahe von einem Fahrrad angefahren ...
5883927,6257273,eng,I don't regret doing what I did.
1876418,1985301,heb,אני יודע מה הוא עשה.
654951,685718,nld,Ik heb dat liedje al gehoord.
4218997,4493156,por,Tom nem tentou beijar a Mary.
5246918,5597834,ita,Nessuno ci riesce a sentire.


# Statistical approach to language detection

In [4]:
# TODO: 
# Load and interpret symbols as numbers (not necessary)
# Get all 1/2/3 combinations of letters/numbers and count them
# Make probability of it
# Create dictionary and save for later use

In [5]:
# Report how many rows were loaded (length of the DataFrame index).
print(f"Number of rows: {len(df.index)}")

Number of rows: 9453483


In [6]:
# Translation table for str.translate: every code point whose Unicode general
# category starts with "P" (punctuation) maps to None, i.e. gets deleted.
# The range ends at sys.maxunicode + 1 so the last code point is scanned too.
tbl = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith('P')
)
# NOTE(review): 19968 is U+4E00 ('一'), a CJK ideograph rather than a
# punctuation mark; it is deleted as well. Kept as-is - confirm this is intended.
tbl[19968] = None

chinese_punctuation = "[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\uff01]"

def remove_punctuation(text):
    """Normalise a sentence for n-gram counting.

    Steps, in order:
      1. strip surrounding whitespace and lower-case the text;
      2. delete every character listed in ``tbl``;
      3. delete whitespace-delimited numbers;
      4. delete the characters listed in ``chinese_punctuation``.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned sentence.
    """
    text = text.strip().lower().translate(tbl)
    # Raw string so that \d and \s reach the regex engine unchanged.
    # A number in the middle of the text is removed together with the
    # whitespace in front of it only; the whitespace after it is checked with
    # a lookahead and kept, so the neighbouring words stay separated
    # ("a 12 b" -> "a b") and consecutive numbers are all matched.
    text = re.sub(r"^\d+\s|\s\d+(?=\s)|\s\d+$", "", text)
    return re.sub(chinese_punctuation, "", text)

In [7]:
# Clean every sentence with remove_punctuation, overwriting the column.
df["sentence"] = df["sentence"].map(remove_punctuation)

In [8]:
# Spot-check ten random rows; no seed is passed, so the selection differs between runs.
df.sample(10)

Unnamed: 0,id,lan_code,sentence
3754408,3996484,fra,que ditesvous daller prendre un verre
9560966,10007626,rus,том сорвал с неё яблоко
5617424,5971770,rus,удивительно но я согласен с томом
1331182,1408695,tur,elimden geldiğince kısa sürede orada olacağım
5140910,5489915,rus,вы думаете я этого хочу
1860205,1968242,deu,ich übe keinerlei bezahlte tätigkeit aus
4071958,4335275,mkd,ајде во град
6950813,7342266,ber,ččet tajilbantnwen
3929690,4183496,tur,burada çalışmak istemiyorum
5463763,5816756,tur,umarım serseri bir kurşunla vurulursun


In [9]:
def sentence_to_unicode_list(sentence):
    """Return the Unicode code points of ``sentence`` as an int32 NumPy array.

    Each character (spaces included) is converted with ``ord`` in order.
    """
    code_points = list(map(ord, sentence))
    return np.array(code_points).astype(np.int32)

In [10]:
# Store the code-point array of each sentence in a new "unicode" column.
df["unicode"] = df["sentence"].map(sentence_to_unicode_list)

In [11]:
# Spot-check ten random rows; no seed is passed, so the selection differs between runs.
df.sample(10)

Unnamed: 0,id,lan_code,sentence,unicode
3836994,4086113,ita,lui sta per risolverlo,"[108, 117, 105, 32, 115, 116, 97, 32, 112, 101..."
1420715,1503177,spa,su meta es ser abogado,"[115, 117, 32, 109, 101, 116, 97, 32, 101, 115..."
6324588,6706896,spa,a quién espera,"[97, 32, 113, 117, 105, 233, 110, 32, 101, 115..."
1184625,1248886,por,ela está escrevendo um livro agora,"[101, 108, 97, 32, 101, 115, 116, 225, 32, 101..."
5619082,5973438,fin,ihan totta,"[105, 104, 97, 110, 32, 116, 111, 116, 116, 97]"
6018652,6397339,tur,tom marynin üç sandviç yediğine şaşırdı,"[116, 111, 109, 32, 109, 97, 114, 121, 110, 10..."
9698358,10146499,deu,wie viel weißt du,"[119, 105, 101, 32, 118, 105, 101, 108, 32, 11..."
8391386,8820972,ber,ttxemmiment ttxemmiment almi i dufant yiwet n ...,"[116, 116, 120, 101, 109, 109, 105, 109, 101, ..."
6186253,6566243,epo,por povi esti senriproĉa ano de ŝafaro oni ant...,"[112, 111, 114, 32, 112, 111, 118, 105, 32, 10..."
1279100,1351566,pol,koszty życia poszybowały w górę w ciągu ostatn...,"[107, 111, 115, 122, 116, 121, 32, 380, 121, 9..."


In [12]:
# TODO:
# drop all punctuation - ok
# do not take spaces into account - ok
# some optimization (?)

In [13]:
# One counter per language code found in the data; unseen keys start at 0.
counts = {lang: defaultdict(int) for lang in df["lan_code"].unique()}

In [14]:
# ~6 minutes runtime if all data is passed

def _iter_ngrams(sentence, max_n=3):
    """Yield every substring of length 1..``max_n`` that contains no space.

    All n-grams of length 1 are produced first (left to right), then those of
    length 2, and so on. Sentences shorter than ``n`` produce no n-grams of
    that length.
    """
    for size in range(1, max_n + 1):
        for start in range(len(sentence) - size + 1):
            gram = sentence[start:start + size]
            # N-grams spanning a word boundary are not counted.
            if " " not in gram:
                yield gram


# Tally the 1-, 2- and 3-character n-grams of every sentence under its
# language code. Columns are addressed by name so the loop does not depend on
# the column order of the DataFrame.
for language, sentence in zip(df["lan_code"], df["sentence"]):
    language_counts = counts[language]
    for gram in _iter_ngrams(sentence):
        language_counts[gram] += 1

In [15]:
# Show the first ten rows of the processed DataFrame.
df.head(10)

Unnamed: 0,id,lan_code,sentence,unicode
0,1,cmn,我們試試看,"[25105, 20497, 35430, 35430, 30475]"
1,2,cmn,我该去睡觉了,"[25105, 35813, 21435, 30561, 35273, 20102]"
2,3,cmn,你在干什麼啊,"[20320, 22312, 24178, 20160, 40636, 21834]"
3,4,cmn,這是什麼啊,"[36889, 26159, 20160, 40636, 21834]"
4,5,cmn,今天是６月１８号也是muiriel的生日,"[20170, 22825, 26159, 65302, 26376, 65297, 653..."
5,6,cmn,生日快乐muiriel,"[29983, 26085, 24555, 20048, 109, 117, 105, 11..."
6,7,cmn,muiriel现在20岁了,"[109, 117, 105, 114, 105, 101, 108, 29616, 223..."
7,8,cmn,密码是muiriel,"[23494, 30721, 26159, 109, 117, 105, 114, 105,..."
8,9,cmn,我很快就會回來,"[25105, 24456, 24555, 23601, 26371, 22238, 20358]"
9,10,cmn,我不知道,"[25105, 19981, 30693, 36947]"


In [16]:
# NOTE(review): prints the complete nested counts dictionary, not a preview;
# with many sentences counted this output becomes very large.
print(counts)

{'cmn': defaultdict(<class 'int'>, {'我': 41, '們': 3, '試': 2, '看': 4, '我們': 1, '們試': 1, '試試': 1, '試看': 1, '我們試': 1, '們試試': 1, '試試看': 1, '该': 1, '去': 3, '睡': 2, '觉': 1, '了': 10, '我该': 1, '该去': 1, '去睡': 2, '睡觉': 1, '觉了': 1, '我该去': 1, '该去睡': 1, '去睡觉': 1, '睡觉了': 1, '你': 12, '在': 5, '干': 1, '什': 8, '麼': 7, '啊': 2, '你在': 1, '在干': 1, '干什': 1, '什麼': 7, '麼啊': 2, '你在干': 1, '在干什': 1, '干什麼': 1, '什麼啊': 2, '這': 6, '是': 29, '這是': 1, '是什': 1, '這是什': 1, '是什麼': 1, '今': 1, '天': 5, '６': 1, '月': 1, '１': 1, '８': 1, '号': 1, '也': 3, 'm': 4, 'u': 4, 'i': 8, 'r': 4, 'e': 4, 'l': 4, '的': 30, '生': 7, '日': 2, '今天': 1, '天是': 1, '是６': 1, '６月': 1, '月１': 1, '１８': 1, '８号': 1, '号也': 1, '也是': 2, '是m': 2, 'mu': 4, 'ui': 4, 'ir': 4, 'ri': 4, 'ie': 4, 'el': 4, 'l的': 1, '的生': 1, '生日': 2, '今天是': 1, '天是６': 1, '是６月': 1, '６月１': 1, '月１８': 1, '１８号': 1, '８号也': 1, '号也是': 1, '也是m': 1, '是mu': 2, 'mui': 4, 'uir': 4, 'iri': 4, 'rie': 4, 'iel': 4, 'el的': 1, 'l的生': 1, '的生日': 1, '快': 4, '乐': 1, '日快': 1, '快乐': 1, '乐m': 1, '生日快': 1, '日快乐': 1,

In [None]:
# Persist the per-language n-gram counts as JSON.
with open('data/counts_lang_wise.json', mode='w') as counts_file:
    json.dump(counts, counts_file)

In [78]:
def counts2probability(counts: dict):
    """Turn per-language n-gram counts into per-n-gram language shares.

    Parameters
    ----------
    counts : dict
        Mapping ``lang -> {symbol: count}``.

    Returns
    -------
    dict
        Mapping ``symbol -> {lang: share}`` where ``share`` is the count of the
        symbol in that language divided by its total count over all languages,
        so the shares of one symbol sum to 1.
    """
    # Pivot the nesting from lang -> symbol to symbol -> lang.
    by_symbol = {}
    for language, symbol_counts in counts.items():
        for symbol, occurrences in symbol_counts.items():
            by_symbol.setdefault(symbol, {})[language] = occurrences

    # Normalise each symbol's counts by its total across languages.
    shares = {}
    for symbol, language_counts in by_symbol.items():
        total = np.sum(list(language_counts.values()))
        shares[symbol] = {
            language: occurrences / total
            for language, occurrences in language_counts.items()
        }

    return shares

In [79]:
# Convert the raw counts into the symbol -> {language: share} lookup table.
probabilities = counts2probability(counts)

In [80]:
# NOTE(review): prints the complete probabilities dictionary, not a preview;
# with many n-grams this output becomes very large.
print(probabilities)

{'我': {'cmn': 1.0}, '們': {'cmn': 1.0}, '試': {'cmn': 1.0}, '看': {'cmn': 1.0}, '我們': {'cmn': 1.0}, '們試': {'cmn': 1.0}, '試試': {'cmn': 1.0}, '試看': {'cmn': 1.0}, '我們試': {'cmn': 1.0}, '們試試': {'cmn': 1.0}, '試試看': {'cmn': 1.0}, '该': {'cmn': 1.0}, '去': {'cmn': 1.0}, '睡': {'cmn': 1.0}, '觉': {'cmn': 1.0}, '了': {'cmn': 1.0}, '我该': {'cmn': 1.0}, '该去': {'cmn': 1.0}, '去睡': {'cmn': 1.0}, '睡觉': {'cmn': 1.0}, '觉了': {'cmn': 1.0}, '我该去': {'cmn': 1.0}, '该去睡': {'cmn': 1.0}, '去睡觉': {'cmn': 1.0}, '睡觉了': {'cmn': 1.0}, '你': {'cmn': 1.0}, '在': {'cmn': 1.0}, '干': {'cmn': 1.0}, '什': {'cmn': 1.0}, '麼': {'cmn': 1.0}, '啊': {'cmn': 1.0}, '你在': {'cmn': 1.0}, '在干': {'cmn': 1.0}, '干什': {'cmn': 1.0}, '什麼': {'cmn': 1.0}, '麼啊': {'cmn': 1.0}, '你在干': {'cmn': 1.0}, '在干什': {'cmn': 1.0}, '干什麼': {'cmn': 1.0}, '什麼啊': {'cmn': 1.0}, '這': {'cmn': 1.0}, '是': {'cmn': 1.0}, '這是': {'cmn': 1.0}, '是什': {'cmn': 1.0}, '這是什': {'cmn': 1.0}, '是什麼': {'cmn': 1.0}, '今': {'cmn': 1.0}, '天': {'cmn': 1.0}, '６': {'cmn': 1.0}, '月': {'cmn': 1.0}, '１': {'

In [81]:
# Persist the symbol -> {language: share} table as JSON.
with open('data/probabilities.json', mode='w') as probabilities_file:
    json.dump(probabilities, probabilities_file)

In [None]:
# TODO: 
# iterate over 1, 2, 3 symbols
# check probabilities for their occurrence
# add to lang_probability all symbols and their lang probabilities
# exclude languages which are not present in all symbols
# multiply and return

In [85]:
def detect_language_statistically(probabilities: dict, sentence: str):
    """Score candidate languages for ``sentence`` from n-gram statistics.

    The sentence is cleaned with ``remove_punctuation`` and split into all
    1-, 2- and 3-character n-grams that contain no space. For every n-gram
    found in ``probabilities`` the per-language values are multiplied
    together; a language that is missing from any of those n-grams is dropped
    from the candidates. The surviving scores are normalised to sum to 1.

    Parameters
    ----------
    probabilities : dict
        Mapping ``ngram -> {lang: probability}``. It is only read, never
        modified.
    sentence : str
        Text to classify.

    Returns
    -------
    dict
        Mapping ``lang -> normalised score`` (plain floats). Empty when the
        cleaned sentence contains no known n-gram or when no language covers
        all of its known n-grams.
    """
    sentence = remove_punctuation(sentence)

    # Scores are accumulated as sums of logarithms: the product of many values
    # below 1 underflows to 0.0 for long sentences, which would make the final
    # normalisation divide by zero.
    log_scores = None

    for size in (1, 2, 3):
        for start in range(len(sentence) - size + 1):
            gram = sentence[start:start + size]

            if " " in gram:
                continue

            gram_probabilities = probabilities.get(gram)
            if not gram_probabilities:
                # An n-gram that is absent from the table says nothing about
                # any language, so it is skipped instead of raising KeyError.
                continue

            if log_scores is None:
                # Build a fresh dict so the caller's table is never aliased.
                log_scores = {
                    lang: float(np.log(probability))
                    for lang, probability in gram_probabilities.items()
                    if probability > 0
                }
            else:
                # Keep only the languages that also contain this n-gram.
                log_scores = {
                    lang: score + float(np.log(gram_probabilities[lang]))
                    for lang, score in log_scores.items()
                    if gram_probabilities.get(lang, 0) > 0
                }

    if not log_scores:
        return {}

    # Shift by the best score before exponentiating so that at least one
    # weight equals 1.0 and the normalising sum cannot be zero.
    best = max(log_scores.values())
    weights = {lang: float(np.exp(score - best)) for lang, score in log_scores.items()}
    total = sum(weights.values())

    return {lang: weight / total for lang, weight in weights.items()}
        

In [86]:
# Try the detector on a single short word.
sentence = "die"

print(detect_language_statistically(probabilities, sentence))

{'deu': 1.0}
