In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import re

In [None]:
# Read the zipped train / test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv.zip', 'test.csv.zip')
)

In [None]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((3159633, 2), (2784634, 2))

In [None]:
# Eyeball the first 60 raw test sentences.
test_df['sentence'].head(60)

0                                            תודה לכם .
1     Precisamos de compaixão para começar , e auto-...
2                               這個增長相當大 ， 並且它將引發經濟的增長 。
3                    시애틀에서 자란 제가 처음 가난을 보게 되던 때를 기억해요 .
4                                       これをロボットに組み込みました
5                            所以他們拿了紅綠藍 ， 不是只拿訂單上的部分貨物 。
6                        Van , aki felelősségteljesen .
7     Geen enkele , want Disney is niet van plan om ...
8     Я хотел вас оставить сегодня с идеей , что есл...
9     Najuzbudljivija je Harlem dječja zona , koja v...
10    החוכמה של זקני השבט האלו היא לא סתם אוסף של סי...
11    Ma credo che con una campagna politica adeguat...
12        Dan itu adalah sesuatu yang mudah dilupakan .
13    Povestea ei te inspiră , deși este necunoscută...
14    I to jest właśnie moja wizja waszej przyszłości .
15    Ceva ca Regula 30 , de exemplu , pare să funcț...
16    És , ha ez igaz , és azt hiszem az , akkor az ...
17    ( Video ) ( Musik ) Nicholas : Mein Name i

In [None]:
# Apostrophe handling for individual languages
def del_apos_en(text):
    """Replace every ' &apos;' (leading space included) in ``text`` with "'"."""
    entity = re.compile(' &apos;')
    return entity.sub("'", text)

def del_apos_fr(text):
    """Normalise spaced-out apostrophes to a plain "'".

    Two forms are collapsed:
      * ' ’ '     (typographic apostrophe surrounded by spaces)
      * '&apos; ' (HTML entity followed by a space)

    Args:
        text: sentence to clean.

    Returns:
        The sentence with both forms replaced by "'".
    """
    # The result of this first substitution used to be thrown away, so the
    # typographic apostrophe was never normalised; chain the two steps.
    text = re.sub(' ’ ', "'", text)
    return re.sub('&apos; ', "'", text)

def del_apos_fr_ca(text):
    """Replace every ' ’ ' (typographic apostrophe between spaces) with "'"."""
    spaced_apostrophe = re.compile(' ’ ')
    return spaced_apostrophe.sub("'", text)

def del_apos_uk(text):
    """Replace a space-delimited apostrophe entity with a plain "'".

    Args:
        text: sentence to clean.

    Returns:
        The sentence with ' &apos; ' (and the semicolon-less ' &apos ')
        replaced by "'".
    """
    # The original pattern ' &apos ' had no semicolon, unlike every sibling
    # helper, so it could not match the entity form '&apos;'.  The semicolon
    # is optional here so the old spelling is still handled.
    return re.sub(' &apos;? ', "'", text)

def del_apos_de(text):
    """Replace every ' &apos; ' (entity between spaces) in ``text`` with "'"."""
    spaced_entity = re.compile(' &apos; ')
    return spaced_entity.sub("'", text)

In [None]:
# The French apostrophe rule is applied to every test sentence, regardless of language.
test_df.loc[:, 'sentence'] = test_df['sentence'].map(del_apos_fr)

In [None]:
# Apply the English apostrophe clean-up to rows labelled 'en'.
is_en = train_df['language'] == 'en'
train_df.loc[is_en, 'sentence'] = train_df.loc[is_en, 'sentence'].apply(del_apos_en)

In [None]:
# Apply the French apostrophe clean-up to rows labelled 'fr'.
is_fr = train_df['language'] == 'fr'
train_df.loc[is_fr, 'sentence'] = train_df.loc[is_fr, 'sentence'].apply(del_apos_fr)

In [None]:
# Apply the Canadian-French apostrophe clean-up to rows labelled 'fr-ca'.
is_fr_ca = train_df['language'] == 'fr-ca'
train_df.loc[is_fr_ca, 'sentence'] = train_df.loc[is_fr_ca, 'sentence'].apply(del_apos_fr_ca)

In [None]:
# Apply the Ukrainian apostrophe clean-up to rows labelled 'uk'.
is_uk = train_df['language'] == 'uk'
train_df.loc[is_uk, 'sentence'] = train_df.loc[is_uk, 'sentence'].apply(del_apos_uk)

In [None]:
# Apply the German apostrophe clean-up to rows labelled 'de'.
is_de = train_df['language'] == 'de'
train_df.loc[is_de, 'sentence'] = train_df.loc[is_de, 'sentence'].apply(del_apos_de)

In [None]:
# Strip quotes, HTML-entity remnants and assorted symbols
def reg(text):
  """Replace each unwanted token in ``text`` with a single space.

  Tokens are removed one after another, in list order, so a later token can
  still match text exposed by an earlier replacement.

  Args:
      text: sentence to clean.

  Returns:
      The sentence with every listed token replaced by ' '.  Runs of spaces
      are not collapsed here.
  """
  # Entity names come BEFORE the bare '&'.  In the original order '&' was
  # stripped first, so '&quot', '&lt', '&gt' and '&amp' could never match and
  # 'lt' / 'gt' / 'amp' were left behind as junk tokens.
  # NOTE(review): '一' is the CJK character for "one", not a dash - confirm
  # that stripping it from Chinese/Japanese text is intended.
  prep = ['&quot', '&lt', '&gt', '&amp',
          '"', '»', '«', '&', '- ', '♫ ', '։', '？', '！', '/', '#',
          'apos', 'quot', '一', '—', '）', '（']
  new_text = text
  for token in prep:
    # Plain literal replacement: the tokens are strings, not regex patterns.
    new_text = new_text.replace(token, " ")
  return new_text

In [None]:
# Strip the unwanted symbols from every test sentence.
test_df.loc[:, 'sentence'] = test_df['sentence'].map(reg)

In [None]:
# Strip the unwanted symbols from every training sentence.
train_df.loc[:, 'sentence'] = train_df['sentence'].map(reg)

In [None]:
# Drop digits and the listed punctuation marks, keeping the text between them
regex = re.compile(u"[^0-9.,!?+=，();:%$}{]+")

def del_num(text, regex=regex):
    """Join the runs of ``text`` matched by ``regex`` with single spaces.

    With the default pattern a run is a maximal stretch of characters that are
    neither digits nor one of ``. , ! ? + = ， ( ) ; : % $ } {``.
    """
    kept_runs = (match.group(0) for match in regex.finditer(text))
    return " ".join(kept_runs)

In [None]:
# Remove digits and punctuation from every training sentence.
train_df.loc[:, 'sentence'] = train_df['sentence'].map(del_num)

In [None]:
# Remove digits and punctuation from every test sentence.
test_df.loc[:, 'sentence'] = test_df['sentence'].map(del_num)

In [None]:
# Remove abbreviations
def del_abb(text):
    """Delete whole words made of two or more ASCII capitals (A-Z) from ``text``.

    Surrounding whitespace is left in place.
    """
    all_caps_word = re.compile(r"\b[A-Z]{2,}\b")
    return all_caps_word.sub("", text)


In [None]:
# Remove all-caps abbreviations from every training sentence.
train_df.loc[:, 'sentence'] = train_df['sentence'].map(del_abb)

In [None]:
# Remove all-caps abbreviations from every test sentence.
test_df.loc[:, 'sentence'] = test_df['sentence'].map(del_abb)

In [None]:
# Collapse redundant whitespace
def del_prep(text):
  """Return ``text`` with whitespace runs collapsed to one space and the ends trimmed."""
  words = text.split()
  return " ".join(words)

In [None]:
# Collapse whitespace in every training sentence.
train_df.loc[:, 'sentence'] = train_df['sentence'].map(del_prep)

In [None]:
# Collapse whitespace in every test sentence.
test_df.loc[:, 'sentence'] = test_df['sentence'].map(del_prep)

In [None]:
# Lower-case every sentence in both frames
for frame in (train_df, test_df):
    frame['sentence'] = frame['sentence'].str.lower()

In [None]:
# Language codes whose sentences get Latin letters stripped further down.
# NOTE(review): 'ru' is assigned as a label elsewhere in this notebook but is
# not listed here - confirm whether that omission is intentional.
not_latin = [
    'ar', 'he', 'ja', 'zh-tw', 'ko',
    'fa', 'bg', 'th', 'zh-cn', 'ku',
    'el', 'uk', 'ka', 'ta', 'zh',
    'my', 'mn', 'mk', 'hy', 'hi',
    'bn', 'mr', 'be', 'ur', 'kk',
]

In [None]:
# Trim leading and trailing whitespace in both frames
for frame in (train_df, test_df):
    frame['sentence'] = frame['sentence'].str.strip()

In [None]:
# Force the label 'ru' on rows that consist of exactly this word list.
ru_word_list_rows = train_df['sentence'].eq('это что мы не как на вы они но из то он так для аплодисменты')
train_df.loc[ru_word_list_rows, 'language'] = 'ru'

In [None]:
# Relabel rows that consist of exactly this word list.
# The words (гэта, што, калі, яны, ён, каб, дзякуй ...) are Belarusian, so the
# row is labelled 'be'; the original assigned 'uk' (Ukrainian), which looks
# like a copy-paste slip from the neighbouring Ukrainian cells.
# NOTE(review): confirm against the dataset's label set before retraining.
be_word_list_rows = train_df['sentence'] == 'гэта што калі мы не яны як але ён каб дзякуй больш пра на вы'
train_df.loc[be_word_list_rows, 'language'] = 'be'

In [None]:
# Force the label 'uk' on rows that consist of exactly this word list.
uk_word_list_rows = train_df['sentence'].eq('що це тому ми чи дуже які щоб дякую якщо було вона від мене ось')
train_df.loc[uk_word_list_rows, 'language'] = 'uk'

In [None]:
# Force the label 'zh-tw' on rows that consist of exactly this phrase list.
zh_tw_phrase_rows = train_df['sentence'].eq('笑聲 掌聲 謝謝 所以 現在 事實上 當然 因此 謝謝大家 對吧 但是 鼓掌 謝謝各位 他說 我說')
train_df.loc[zh_tw_phrase_rows, 'language'] = 'zh-tw'

In [None]:
# Relabel rows that consist of exactly this phrase list.
# The phrases are written in Simplified Chinese characters (笑声, 谢谢, 现在),
# unlike the Traditional-character list relabelled to 'zh-tw' elsewhere, so
# this row is labelled 'zh-cn'; the original assigned 'zh-tw' to both.
# NOTE(review): the label set also contains plain 'zh' - confirm 'zh-cn' is
# the intended target before retraining.
zh_cn_phrase_rows = train_df['sentence'] == '笑声 掌声 谢谢 现在 所以 事实上 当然 鼓掌 但是 因此 那么 非常感谢 谢谢大家 是的 好吧'
train_df.loc[zh_cn_phrase_rows, 'language'] = 'zh-cn'

In [None]:
# Force the label 'kk' on rows that consist of exactly this word list.
kk_word_list_rows = train_df['sentence'].eq('бұл мен біз бір үшін ол деп және емес бар керек бірақ олар қол осы')
train_df.loc[kk_word_list_rows, 'language'] = 'kk'

In [None]:
# Force the label 'uk' on rows that consist of exactly this word list.
uk_word_list_rows_2 = train_df['sentence'].eq('що це ми не як на вони та але ви до про він оплески коли')
train_df.loc[uk_word_list_rows_2, 'language'] = 'uk'

In [None]:
# Force the label 'ru' on rows that are exactly this phrase.
# NOTE(review): 'хвала вам' reads like Serbian/Macedonian "thank you" rather
# than Russian - confirm that 'ru' is the intended label.
hvala_vam_rows = train_df['sentence'].eq('хвала вам')
train_df.loc[hvala_vam_rows, 'language'] = 'ru'

In [None]:
# Force the label 'mk' on rows that are exactly this word.
aplauz_rows = train_df['sentence'].eq('аплауз')
train_df.loc[aplauz_rows, 'language'] = 'mk'

In [None]:
# Force the label 'en' on rows that consist of exactly this word list.
# reg() replaces the literal 'apos' with a space and del_prep() then collapses
# the gap, so in a top-to-bottom run the cleaned row no longer contains
# 'apos' and the original single-string comparison could never match.
# Both spellings are accepted so the cell works whatever the cleaning order.
en_word_list_rows = train_df['sentence'].isin([
    'the and of it that you to apos we this is in they so are',
    'the and of it that you to we this is in they so are',
])
train_df.loc[en_word_list_rows, 'language'] = 'en'

In [None]:
# Force the label 'en' on rows that are exactly this word.
taps_rows = train_df['sentence'].eq('taps')
train_df.loc[taps_rows, 'language'] = 'en'

In [None]:
# Remove English (Latin-letter) words from sentences of non-Latin languages
def del_en(text):
    """Delete every run of lowercase ASCII letters (a-z) from ``text``.

    Surrounding whitespace is left in place, so the result may contain extra
    or leading/trailing spaces.
    """
    return re.sub("[a-z]+", "", text)

# One boolean mask over all listed languages replaces the per-language loop,
# which re-filtered the whole frame twice per language.  Each row has exactly
# one label, so the rows touched and the values written are the same.
non_latin_rows = train_df['language'].isin(not_latin)
train_df.loc[non_latin_rows, 'sentence'] = train_df.loc[non_latin_rows, 'sentence'].apply(del_en)

In [None]:
# Drop rows whose sentence is empty after cleaning
# np.where() yields POSITIONS while DataFrame.drop() expects index LABELS; the
# two only coincide on a fresh RangeIndex, so re-running the cell (or running
# it after any other row removal) dropped the wrong rows or raised KeyError.
# A boolean mask has no such dependency.  Whitespace-only sentences, which
# del_en() can leave behind, are treated as empty as well.
is_blank = train_df['sentence'].str.strip() == ''
# Original index labels are kept (no reset), as with drop(); .copy() avoids
# chained-assignment warnings if the frame is modified later.
train_df = train_df.loc[~is_blank].copy()

In [None]:
# Number of training rows per language label, most frequent first.
rows_per_language = train_df['language'].value_counts()
lang_count = rows_per_language.reset_index()

lang_count.head()

Unnamed: 0,index,language
0,uk,231121
1,ru,157016
2,zh-tw,154006
3,en,133134
4,ar,110028


In [None]:
# Column 0 is used as the model input and column 1 as the target
# (presumably the sentence and language columns, in that order).
x = train_df.iloc[:, 0].values
target_labels = train_df.iloc[:, 1].values

# Encode the target labels as integers.
label_encoder = LabelEncoder()
label_encoder.fit(target_labels)
y = label_encoder.transform(target_labels)

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Character n-gram counts (1 to 4 characters) feeding an SGD classifier with
# logistic loss and balanced class weights.
# NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed the
# old spelling; it is kept here for the scikit-learn version this file targets.
char_ngram_counts = CountVectorizer(analyzer='char', ngram_range=(1, 4))
sgd_classifier = SGDClassifier(loss='log', class_weight='balanced', random_state=42)

pipe_word = Pipeline(steps=[
    ('vectorizer', char_ngram_counts),
    ('model', sgd_classifier),
])

In [None]:
# Fit the vectorizer and classifier on the training split.
pipe_word.fit(X=x_train, y=y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 4), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabula...
                 SGDClassifier(alpha=0.0001, average=False,
                               class_weight='balanced', early_stopping=False,
                               epsilon=0.1, eta0=0.0, fit_intercept=True,
                               l1_ratio=0.15, learning_rate='optimal',
                               loss='log', max_iter=1

In [None]:
# Predict encoded labels for the held-out split.
predictions_w = pipe_word.predict(X=x_test)

In [None]:
# Balanced accuracy on the held-out split.
balanced_accuracy_score(y_true=y_test, y_pred=predictions_w)

0.93494009958509

In [None]:
# Predict encoded labels for the cleaned test sentences.
test_sentences = test_df['sentence'].values
submit_predictions = pipe_word.predict(test_sentences)

In [None]:
# Map the encoded predictions back to their original label strings.
test_df['language'] = label_encoder.inverse_transform(submit_predictions)

In [None]:
# Write the 'index' and 'language' columns to the submission file, without the DataFrame index.
submission = test_df.loc[:, ["index", "language"]]
submission.to_csv("res_lan1.csv", index=False)