# Тестовое задание ЦБ РФ

In [4]:
import sklearn
import pandas as pd
import pymorphy2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [5]:
positive = pd.read_csv('positive.csv', sep=';', header=None)
negative = pd.read_csv('negative.csv', sep=';', header=None)
dataset = pd.concat([positive, negative])
dataset = dataset[[3, 4]]
dataset.columns = ['text', 'label']
dataset.head()

Unnamed: 0,text,label
0,"@first_timee хоть я и школота, но поверь, у на...",1
1,"Да, все-таки он немного похож на него. Но мой ...",1
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",1
4,@irina_dyshkant Вот что значит страшилка :D\nН...,1


In [6]:
morph = pymorphy2.MorphAnalyzer()

In [7]:
def text_cleaner(text):
    text = text.lower()
    alph = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    # удаляем все кроме русских букв
    cleaned_text = ''
    for char in text:
        if (char.isalpha() and char[0] in alph) or (char == ' '):
            cleaned_text += char       
    result = []
    # лемматизация
    for word in cleaned_text.split():        
        result.append(morph.parse(word)[0].normal_form)                              
    return ' '.join(result)

dataset['text'] = dataset['text'].apply(text_cleaner)
dataset.to_csv('cleaned_data.csv')

In [8]:
dataset.head()

Unnamed: 0,text,label
0,хоть я и школотый но поверь у мы то же самый о...,1
1,да всетаки он немного похожий на он но мой мал...,1
2,ну ты идиотка я испугаться за ты,1
3,кто то в угол сидеть и погибать от голод а мы ...,1
4,вот что значит страшилка но блинпосмотреть вес...,1


In [5]:
dataset = pd.read_csv('cleaned_data.csv', index_col=0).dropna()

In [6]:
ngram = [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)]

In [7]:
for ngram_scheme in ngram:
    print('N-gram:', ngram_scheme)

    count_vectorizer = CountVectorizer(analyzer = "word", ngram_range=ngram_scheme) 
    tfidf_vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=ngram_scheme)
    vectorizers = [count_vectorizer, tfidf_vectorizer]
    vectorizers_names = ['Vectorizer', 'TF-IDF']

    for i in range(len(vectorizers)):
        print(vectorizers_names[i])
        vectorizer = vectorizers[i]
        X = vectorizer.fit_transform(dataset['text'])
        y = dataset['label']
        cv = ShuffleSplit(test_size=0.1, random_state=42)
        clf = MultinomialNB()
        NB_result = cross_val_score(clf, X, y, cv=cv).mean()

        print('Naive:', NB_result.mean())
        print('-----------------------------------')

N-gram: (1, 1)
Vectorizer
Naive: 0.7254595952916281
-----------------------------------
TF-IDF
Naive: 0.7216373495569368
-----------------------------------
N-gram: (1, 2)
Vectorizer
Naive: 0.7435303972137725
-----------------------------------
TF-IDF
Naive: 0.7460653352731119
-----------------------------------
N-gram: (1, 3)
Vectorizer
Naive: 0.7433099678173082
-----------------------------------
TF-IDF
Naive: 0.749407044923511
-----------------------------------
N-gram: (2, 2)
Vectorizer
Naive: 0.6876868139135035
-----------------------------------
TF-IDF
Naive: 0.698152801657629
-----------------------------------
N-gram: (3, 3)
Vectorizer
Naive: 0.5962218401446017
-----------------------------------
TF-IDF
Naive: 0.6366662258078738
-----------------------------------


In [8]:
for ngram_scheme in ngram:
    print('N-gram:', ngram_scheme)

    count_vectorizer = CountVectorizer(analyzer = "word", ngram_range=ngram_scheme) 
    tfidf_vectorizer = TfidfVectorizer(analyzer = "word", ngram_range=ngram_scheme)
    vectorizers = [count_vectorizer, tfidf_vectorizer]
    vectorizers_names = ['Vectorizer', 'TF-IDF']
    for i in range(len(vectorizers)):
        print(vectorizers_names[i])
        vectorizer = vectorizers[i]
        X = vectorizer.fit_transform(dataset['text'])
        y = dataset['label']

        cv = ShuffleSplit(test_size=0.1, random_state=42)

        clf = SGDClassifier()
        parameters = {
            'loss': ('log', 'hinge'),
            'penalty': ['none', 'l1', 'l2'],
            'alpha': [0.001, 0.0001, 0.00001, 0.000001]
        }
        gs_clf = GridSearchCV(clf, parameters, cv=cv)
        gs_clf = gs_clf.fit(X, y)
        L_result = gs_clf.best_score_

        print('SGD:', L_result)
        print('Good parameters:', gs_clf.best_params_)
        print('-----------------------------------')

N-gram: (1, 1)
Vectorizer




SGD: 0.7193316580699202
Good parameters: {'alpha': 0.0001, 'loss': 'hinge', 'penalty': 'l2'}
-----------------------------------
TF-IDF
SGD: 0.7297800114623286
Good parameters: {'alpha': 1e-05, 'loss': 'hinge', 'penalty': 'l2'}
-----------------------------------
N-gram: (1, 2)
Vectorizer
SGD: 0.7475378036414936
Good parameters: {'alpha': 1e-05, 'loss': 'log', 'penalty': 'l2'}
-----------------------------------
TF-IDF
SGD: 0.7553939073314817
Good parameters: {'alpha': 1e-06, 'loss': 'log', 'penalty': 'l2'}
-----------------------------------
N-gram: (1, 3)
Vectorizer
SGD: 0.7553939073314817
Good parameters: {'alpha': 1e-05, 'loss': 'log', 'penalty': 'l2'}
-----------------------------------
TF-IDF
SGD: 0.7592381960058193
Good parameters: {'alpha': 1e-06, 'loss': 'log', 'penalty': 'l2'}
-----------------------------------
N-gram: (2, 2)
Vectorizer
SGD: 0.6972887184234889
Good parameters: {'alpha': 1e-05, 'loss': 'log', 'penalty': 'l2'}
-----------------------------------
TF-IDF
SGD: 0.