### Анастасия Костяницына

### БКЛ-151

In [391]:
import wikipedia
import codecs
import collections
import sys
import numpy as np
import os
import json
import re
import requests
import warnings
warnings.filterwarnings("ignore")

In [392]:
from collections import defaultdict
from string import punctuation, digits

punctuation = set(punctuation + '«»—…“”\n\t' + digits)
table = str.maketrans({ch: None for ch in punctuation})

#### Разработка алгоритмов

In [393]:
def get_texts_for_lang(lang, n=10): # функция для скачивания статей из википедии
    wikipedia.set_lang(lang)
    wiki_content = []
    pages = wikipedia.random(n)
    for page_name in pages:
        try:
            page = wikipedia.page(page_name)
        except wikipedia.exceptions.WikipediaException:
            print('Skipping page {}'.format(page_name))
            continue

        wiki_content.append('{}\n{}'.format(page.title, page.content.replace('==', '')))

    return wiki_content

Cкачиваем по 100 статей для каждого языка.

In [394]:
code2lang = wikipedia.languages()

Так как в code2lang всего 432 языка, этот процесс занимает очень много времени, поэтому ради эксперемента будем использовать лишь некоторые. 

In [395]:
langs = ['ru', 'mk', 'bg', 'ky', 'en', 'mn', 'fr', 'be', 'uk', 'lez', 'mhr', 'kk']

In [396]:
wiki_texts = {}

for lang in langs:
    try:
        wiki_texts[lang] = get_texts_for_lang(lang, 150)
    except Exception as e:
        print('ERROR ON - ', lang, e)
        continue
    print(lang, len(wiki_texts[lang]))

Skipping page Иммануэль
Skipping page Ткачук, Михаил
Skipping page Философов
Skipping page Улица Ильича
Skipping page Извольский
Skipping page Межиречка
Skipping page Мухинский сельсовет
Skipping page Константиновская волость
Skipping page Рёч
Skipping page Озолс
ru 140
mk 150
Skipping page Трентън (пояснение)
Skipping page Полемон
Skipping page Обсада на Никея
Skipping page Иван Стоянов
Skipping page Кирил Стоянов (пояснение)
Skipping page Брей
bg 144
ky 150
Skipping page Dema
Skipping page Bridal Veil Falls (Banff)
Skipping page Bishop Smith
Skipping page All Things to All Men (song)
Skipping page Charles H. Jones
Skipping page CCGA
Skipping page Darj-e Sofla
en 143
mn 150
Skipping page Centre de contrôle
Skipping page Felici
Skipping page Callahan
Skipping page Église Saint-Jean-l'Évangéliste de Taulis
fr 146
Skipping page Мазкі
Skipping page Аптычная вось
Skipping page Мелхія (значэнні)
Skipping page Багатыр
Skipping page Падлужжа (Глускі раён)
Skipping page Благавешчанск (значэнні

In [397]:
test_texts = {}

for lang in langs:
    try:
        test_texts[lang] = get_texts_for_lang(lang, 100)
    except Exception as e:
        print('ERROR ON - ', lang, e)
        continue
    print(lang, len(test_texts[lang]))

Skipping page Беньямин
Skipping page Бартошек (значения)
Skipping page Мысовское (сельское поселение)
Skipping page Лейбман
Skipping page Кочегуровка
ru 95
mk 100
Skipping page Жидов гроб
Skipping page Ломница
bg 98
ky 100
Skipping page L. nitida
Skipping page Gaber
Skipping page Is That So?
Skipping page Wilson Charles
Skipping page This Is Your Sword
Skipping page Sogpelcé
Skipping page Peter Hayman
en 93
mn 100
Skipping page Église Saint-Louis de Saint-Louis-lès-Bitche
Skipping page Kelmendi
Skipping page I Got You
Skipping page Ladino
Skipping page Anthony Powell
Skipping page Marakwet
Skipping page Sadovo
fr 93
Skipping page Зачэпічы
Skipping page Нільс (значэнні)
Skipping page Рыгор Сцяпанавіч Пірагоў
Skipping page Рэдзькі
be 96
Skipping page Помбал
Skipping page Верби
Skipping page Віана
Skipping page Лисун
uk 96
lez 100
Skipping page Рвезылык (ыҥ-влак)
Skipping page Пыжанъю (ыҥ-влак)
Skipping page Квадрат (ыҥ-влак)
Skipping page Ялнер
mhr 96
kk 100


### Первый метод: частотные слова

Формирование частотного списка

In [399]:
def key_words(wiki_texts, lang):
    
    freqs = collections.defaultdict(lambda: 0)
    
    length = 0

    try:
        
        corpus = wiki_texts[lang]
        
        for article in corpus:
            
            words = tokenize(article.replace('\n', '').lower())
            length += len(words)
            
            for word in words:
         
                freqs[word] += 1

        return freqs, length
            
    except:
        return freqs, 0

In [400]:
def tokenize(text):
    text = re.sub(r'[^\w\s]','',text).replace('\n', '')
    text = re.sub(r'[\s]{2,}',' ',text)
    return text.split(' ')


word_langs = defaultdict(dict)
lang_word = defaultdict(set)
corpus_length = {}


for lang in wiki_texts:
    
    freqs, length = key_words(wiki_texts, lang)
    
    a = defaultdict(set)

    for word in freqs:
        
        lang_word[lang].add(word)
        word_langs[word][lang] = freqs[word]
   
    corpus_length[lang] = length



word_langs: ключ - слово, значение - словарь для каждого языка, в котором встретилось слово и частотностью его употребления в тексте на этом языке.

In [401]:
corpus_length

{'be': 24253,
 'bg': 41927,
 'en': 74212,
 'fr': 72362,
 'kk': 15648,
 'ky': 22409,
 'lez': 21595,
 'mhr': 16090,
 'mk': 44745,
 'mn': 34273,
 'ru': 75300,
 'uk': 37334}

Получаем вероятности слов для кажого языка

In [402]:
import math

for word in word_langs:
    
    for lang in word_langs[word]:
        
        word_langs[word][lang] = word_langs[word][lang] / corpus_length[lang]

In [403]:
word_langs['я']

{'be': 8.246402506906362e-05,
 'bg': 0.0005485725189019009,
 'kk': 6.390593047034765e-05,
 'ky': 4.462492748449284e-05,
 'lez': 0.015790692289881916,
 'mhr': 0.00018645121193287757,
 'ru': 0.00021248339973439575,
 'uk': 0.00034820806771307654}

Предсказание языка

In [404]:
def predict_language(text, lang_word, word_langs):
    
    punctuation = '(\.|,|\?|!|\(|\)|\*|\'|\"|:|;|>|<|/|—|»|«|=|\{|\}|\[|\]|\-|_|\+|\&|\*|\^|\%|\$|@|\#|”)'
    text = re.sub(punctuation, ' ', text)
    words = set(tokenize(text.replace('\n', ',').lower()))
    
    lang_pred = defaultdict(lambda: 0)
    
    for lang in lang_word:

        intersect = words & lang_word[lang]
        
        for word in intersect:
            
            lang_pred[lang] += word_langs[word][lang]     

    return max(lang_pred.items(), key=lambda x: x[1])[0]

In [405]:
predict_language('что это: за! язык такой?', lang_word, word_langs)

'mk'

Тест

In [406]:
from sklearn.metrics import classification_report, confusion_matrix

In [407]:
true_labels = []
predicted_labels = []

for lang in test_texts:
    for text in test_texts[lang]:
        true_labels.append(lang)
        predicted_labels.append(predict_language(text, lang_word,  word_langs))

In [408]:
print(classification_report(true_labels, predicted_labels))

             precision    recall  f1-score   support

         be       0.99      0.92      0.95        96
         bg       0.66      1.00      0.79        98
         en       0.79      1.00      0.88        93
         fr       0.96      1.00      0.98        93
         kk       0.98      0.94      0.96       100
         ky       1.00      0.90      0.95       100
        lez       0.99      0.86      0.92       100
        mhr       0.94      0.98      0.96        96
         mk       0.90      1.00      0.95       100
         mn       1.00      0.94      0.97       100
         ru       0.85      0.56      0.68        95
         uk       1.00      0.77      0.87        96

avg / total       0.92      0.91      0.91      1167



In [409]:
labels = list(set(lang_word))
print(confusion_matrix(true_labels, predicted_labels))

[[ 88   2   3   1   0   0   0   0   1   0   1   0]
 [  0  98   0   0   0   0   0   0   0   0   0   0]
 [  0   0  93   0   0   0   0   0   0   0   0   0]
 [  0   0   0  93   0   0   0   0   0   0   0   0]
 [  1   1   1   1  94   0   0   0   1   0   1   0]
 [  0   2   2   0   1  90   1   2   1   0   1   0]
 [  0   0   7   1   0   0  86   3   0   0   3   0]
 [  0   0   2   0   0   0   0  94   0   0   0   0]
 [  0   0   0   0   0   0   0   0 100   0   0   0]
 [  0   0   4   0   1   0   0   1   0  94   0   0]
 [  0  33   4   1   0   0   0   0   4   0  53   0]
 [  0  13   2   0   0   0   0   0   4   0   3  74]]


### Второй метод: частотные символьные n-граммы

In [410]:
from itertools import islice, tee

def make_ngrams(text):
    N = 3 # задаем длину n-граммы
    ngrams = zip(*(islice(seq, index, None) for index, seq in enumerate(tee(text, N))))
    ngrams = [''.join(x) for x in ngrams]
    return ngrams

Подготовка списка частотных ngram для каждого языка 

In [411]:
def ngram_count(wiki_texts, lang):
    
    freqs = collections.defaultdict(lambda: 0)
    
    try:
        
        corpus = wiki_texts[lang]
        
        for article in corpus:
            
            article = re.sub(r'[^\w\s]', '', article).replace('\n', '')
            article = re.sub(r'[\s]{2,}', ' ', article)
            
            for ngram in make_ngrams(article.replace('\n', '').lower()):
                
                freqs[ngram] += 1
        
        freqs = sorted(freqs, key=lambda n: freqs[n], reverse=True)[:300]
        return freqs
        
    except:
        return freqs

In [412]:

all_ngram = defaultdict(list)

for lang in wiki_texts:
    
    freqs = ngram_count(wiki_texts, lang)
    all_ngram[lang] = freqs
    
    

Функция расстояния для двух веторов ngram

In [413]:
def distance(a, b):
    
    distance = 0
    x = {}
    y = {}
    
    for index, item in enumerate(a):
        
        x[item] = index

    for index, item in enumerate(b):
        
        y[item] = index
        
    for item in a:
        
        if item in y:
            
            if x[item] != y[item]:
                distance += abs(x[item] - y[item])
                
        else:
            distance += 299
            
    return distance

Предсказание языка

In [414]:
def predict_lang_ngram(text, all_ngram):
    
    punctuation = '(\.|,|\?|!|\(|\)|\*|\'|\"|:|;|>|<|/|—|»|«|=|\{|\}|\[|\]|\-|_|\+|\&|\*|\^|\%|\$|@|\#|”)'
    text = re.sub(punctuation, ' ', text)
    
    freqs = collections.defaultdict(lambda: 0)
    
    for ngram in make_ngrams(text.replace('\n', ' ').lower()):
        
        freqs[ngram] += 1
         
    freqs = sorted(freqs, key=lambda n: freqs[n], reverse=True)[:300]

    lang_pred = defaultdict(lambda: 0)

    
    for lang in all_ngram:
        lang_pred[lang] = distance(all_ngram[lang], freqs)
    
    return min(lang_pred.items(), key=lambda x: x[1])[0]

In [415]:
predict_lang_ngram('что это: за! язык такой?', all_ngram)

'ru'

Тест

In [416]:
true_labels = []
predicted_labels = []

for lang in test_texts:
    for text in test_texts[lang]:
        true_labels.append(lang)
        predicted_labels.append(predict_lang_ngram(text, all_ngram))

In [417]:
print(classification_report(true_labels, predicted_labels))

             precision    recall  f1-score   support

         be       1.00      0.99      0.99        96
         bg       0.98      0.99      0.98        98
         en       0.92      1.00      0.96        93
         fr       1.00      1.00      1.00        93
         kk       0.99      0.99      0.99       100
         ky       0.99      0.94      0.96       100
        lez       0.98      0.95      0.96       100
        mhr       0.98      0.97      0.97        96
         mk       0.98      0.99      0.99       100
         mn       1.00      0.99      0.99       100
         ru       0.98      0.99      0.98        95
         uk       1.00      1.00      1.00        96

avg / total       0.98      0.98      0.98      1167



In [418]:
print(confusion_matrix(true_labels, predicted_labels, labels = labels))

[[94  0  0  0  1  0  0  0  0  0  0  0]
 [ 0 99  1  0  0  0  0  0  0  0  0  0]
 [ 0  0 97  0  1  0  0  0  0  0  0  0]
 [ 1  2  1 94  0  0  0  0  0  1  1  0]
 [ 0  0  0  0 93  0  0  0  0  0  0  0]
 [ 0  0  0  0  1 99  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 93  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 96  0  0  0  0]
 [ 1  0  0  1  3  0  0  0 95  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 99  1  0]
 [ 0  0  0  0  1  0  0  0  2  0 93  0]
 [ 0  0  0  0  1  0  0  0  0  0  0 95]]


Как видно из результатов, второй способ работает намного лучше