In [2]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed even if reading raises.
with open('data.csv', encoding='utf-8') as in_f:
    lines = in_f.readlines()

# Each record is "(text)<separator>(language code)": the last two characters
# of a stripped line are the language code, and the character before them
# (presumably a comma, given the .csv name) is dropped as the separator.
# The corpus covers six languages: English, French, German, Spanish,
# Italian and Dutch.
dataset = [(record[:-3], record[-2:]) for record in map(str.strip, lines)]

dataset[:5]

[('1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme',
  'nl'),
 ('1 millón de afectados ante las inundaciones en sri lanka unicef está distribuyendo ayuda de emergencia srilanka',
  'es'),
 ('1 millón de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paracaídas qué harías tú porunmillondefans',
  'es'),
 ('1 satellite galileo sottoposto ai test presso lesaestec nl galileo navigation space in inglese',
  'it'),
 ('10 der welt sind bei', 'de')]

In [3]:
from sklearn.model_selection import train_test_split

# Split into training and test sets: unzip the (text, label) pairs into two
# parallel tuples, then split with a fixed seed so the split is repeatable.
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
)

In [6]:
# Peek at the first three training texts.
x_train[:3]

['io non ho ipad ma mi sa che è fatta un po meglio sfrutta meglio la superficie',
 'wp deutsche version auch schon da wordpress 304 important security update',
 'vielleicht ist er ja auch vor weihnachten artig wunschliste']

In [8]:
# Peek at the language labels of the first three training texts.
y_train[:3]

['it', 'de', 'de']

In [10]:
import re

# Matches URLs ("http" followed by non-whitespace), @mentions and #hashtags.
# Written as a raw string so "\S" and "\w" reach the regex engine untouched
# instead of being parsed as (invalid) string escapes, and compiled once
# rather than on every call.
_NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")


def remove_noise(document):
    """Remove URLs, @mentions and #hashtags from a text using a regex.

    Args:
        document: The raw text to clean.

    Returns:
        ``document`` with every match of the noise pattern deleted and
        leading/trailing whitespace stripped. Whitespace that surrounded a
        removed token inside the text is left as is.
    """
    return _NOISE_PATTERN.sub('', document).strip()

# Quick check: the @mention, the #hashtag and the URL should all be removed.
remove_noise("Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html")

'Trump images are now more popular than cat gifs.'

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

def _lowercase_and_remove_noise(document):
    """Lower-case ``document`` and then strip URLs, mentions and hashtags.

    Lower-casing happens first so that the noise pattern (which looks for a
    lower-case "http") also catches upper-case URLs.
    """
    return remove_noise(document.lower())


# Extract count features over character 1-grams and 2-grams.
vec = CountVectorizer(
    # Kept to state the intent, but note that CountVectorizer ignores this
    # flag whenever a custom ``preprocessor`` is supplied: the preprocessor
    # replaces the built-in stage that would have applied it. Lower-casing is
    # therefore done inside the preprocessor below.
    lowercase=True,
    analyzer='char_wb', # tokenise by character ngrams
    ngram_range=(1,2),  # use ngrams of size 1 and 2
    max_features=1000,  # keep the most common 1000 ngrams
    preprocessor=_lowercase_and_remove_noise,
)
vec.fit(x_train)

def get_features(x):
    """Turn raw texts into n-gram count features with the fitted ``vec``.

    Args:
        x: An iterable of text documents, as accepted by ``vec.transform``.

    Returns:
        Whatever ``vec.transform(x)`` produces (for the CountVectorizer
        fitted in this notebook, a sparse document-by-ngram count matrix).
    """
    # The result must be returned explicitly; without ``return`` the function
    # would compute the features and then hand back None.
    return vec.transform(x)

In [13]:
# Training: fit a multinomial naive Bayes model on the n-gram counts of the
# training texts.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X=vec.transform(x_train), y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Score the trained classifier on the held-out test split.
classifier.score(vec.transform(x_test), y_test)

0.9770621967357741

In [18]:
# Show the feature vector of the first test text as a dense matrix
# (the vectorizer's output is sparse, hence the .todense()).
vec.transform(x_test)[0].todense()

matrix([[12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
          0,  0,  0,  0,  2,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,
          1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,
          1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

# 整体流程

In [19]:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector:
    """Language identifier: n-gram count features fed into a classifier.

    Texts are lower-cased, stripped of URLs, @mentions and #hashtags, turned
    into counts of character 1-grams and 2-grams (the 1000 most frequent are
    kept), and passed to the wrapped classifier.
    """

    # Matches URLs ("http" followed by non-whitespace), @mentions and
    # #hashtags. A raw string keeps "\S" / "\w" as regex escapes rather than
    # invalid string escapes; compiling here avoids recompiling per document.
    _NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")

    def __init__(self, classifier=None):
        """Create an unfitted detector.

        Args:
            classifier: An estimator exposing ``fit``, ``predict`` and
                ``score``. When omitted, a new ``MultinomialNB`` is created
                for this instance.
        """
        # Build the default per instance. An estimator constructed in the
        # signature would be created once, at definition time, and then be
        # shared (and re-fitted) by every detector that relies on the default.
        self.classifier = MultinomialNB() if classifier is None else classifier
        self.vectorizer = CountVectorizer(
            # Character n-grams within word boundaries, matching the
            # step-by-step vectorizer this class packages up.
            # NOTE(review): the previous version omitted this and so fell
            # back to word n-grams - confirm char n-grams are intended.
            analyzer='char_wb',
            ngram_range=(1, 2),
            max_features=1000,
            # A custom preprocessor replaces CountVectorizer's own
            # lower-casing stage, so _preprocess lower-cases explicitly.
            preprocessor=self._preprocess,
        )

    def _remove_noise(self, document):
        """Return ``document`` with URLs, @mentions and #hashtags deleted."""
        return self._NOISE_PATTERN.sub("", document)

    def _preprocess(self, document):
        """Lower-case ``document`` and then remove noise tokens from it.

        Lower-casing first lets the noise pattern, which looks for a
        lower-case "http", also catch upper-case URLs.
        """
        return self._remove_noise(document.lower())

    def features(self, X):
        """Transform an iterable of texts with the fitted vectorizer."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Fit the vectorizer and the classifier on texts ``X``, labels ``y``.

        Returns:
            The detector itself, so calls can be chained.
        """
        # fit_transform learns the vocabulary and produces the training
        # features in a single pass over X.
        train_features = self.vectorizer.fit_transform(X)
        self.classifier.fit(train_features, y)
        return self

    def predict(self, x):
        """Predict the language of a single text ``x``.

        Returns:
            The classifier's prediction for a one-document batch, i.e. a
            sequence holding one label.
        """
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Return the classifier's ``score`` for texts ``X`` and labels ``y``."""
        return self.classifier.score(self.features(X), y)

In [20]:
# Read the corpus; the context manager closes the file even if reading fails.
with open('data.csv', encoding='utf-8') as in_f:
    lines = in_f.readlines()

# The last two characters of each stripped line are the language code; the
# character before them is the separator and is dropped from the text.
dataset = [(record[:-3], record[-2:]) for record in map(str.strip, lines)]

# Unzip into parallel texts/labels and make a seeded train/test split.
x, y = list(zip(*dataset))
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Train the detector, try it on one sentence, then score it on the test split.
language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))

['en']
0.9770621967357741
