### 用朴素贝叶斯完成语种检测

In [2]:
# 查看数据
# 来自twitter数据，包含English、French、German、Spanish、Italian和Dutch
with open('./data.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]

In [3]:
dataset[:5]

[('1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme',
  'nl'),
 ('1 millón de afectados ante las inundaciones en sri lanka unicef está distribuyendo ayuda de emergencia srilanka',
  'es'),
 ('1 millón de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paracaídas qué harías tú porunmillondefans',
  'es'),
 ('1 satellite galileo sottoposto ai test presso lesaestec nl galileo navigation space in inglese',
  'it'),
 ('10 der welt sind bei', 'de')]

In [5]:
# 将原数据分成训练集和测试集，使用sklearn自带的分割函数
from sklearn.model_selection import train_test_split
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [8]:
len(x_test)

2267

In [9]:
# 使用正则表达式去掉噪声数据
import re

def remove_noise(document):
    noise_pattern = re.compile("|".join(["http\S+", "\@\w+", "\#\w+"]))
    clean_text = re.sub(noise_pattern, "", document)
    return clean_text.strip()

remove_noise("Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html")

'Trump images are now more popular than cat gifs.'

下一步是在降噪数据上抽取出有用的特征，我们抽取1-gram和2-gram的统计特征

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
    lowercase=True,     # lowercase the text
    analyzer='char_wb', # tokenise by character ngrams
    ngram_range=(1,2),  # use ngrams of size 1 and 2
    max_features=1000,  # keep the most common 1000 ngrams
    preprocessor=remove_noise
)
vec.fit(x_train)

def get_features(x):
    vec.transform(x)

把分类器MultinomialNB给import进来训练

In [13]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

MultinomialNB()

查看准确率如何

In [15]:
classifier.score(vec.transform(x_test), y_test)

0.9770621967357741

能在1500句话上，训练得到准确率97.7%的分类器，效果还是不错的。
如果加大语料，准确率会非常高。

**规范化，写成一个class**

In [17]:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

class LanguageDetector():
    
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(ngram_range=(1,2), max_features=1000, preprocessor=self._remove_noise)
    
    def _remove_noise(self, document):
        noise_pattern = re.compile("|".join(["http\S+", "\@\w+", "\#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text
    
    def features(self, X):
        return self.vectorizer.transform(X)
    
    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)
    
    def predict(self, x):
        return self.classifier.predict(self.features([x]))
    
    def score(self, X, y):
        return self.classifier.score(self.features(X), y)

In [18]:
in_f = open('data.csv')
lines = in_f.readlines()
in_f.close()
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))

['en']
0.9770621967357741
