# Введение в обработку естественного языка
## Урок 6. Классификация текста. Анализ тональности текста
Классификация текстов

In [None]:
import pandas as pd

# Read the tab-separated train and test splits from the working directory.
train_df, test_df = (
    pd.read_csv(path, sep="\t") for path in ("train.tsv", "test.tsv")
)

# Report the number of rows in each split.
print('Train size = {}'.format(train_df.shape[0]))
print('Test size = {}'.format(test_df.shape[0]))

In [None]:
# Display the first three rows of the training frame.
train_df.head(3)

In [None]:
# Keyword baseline: a review is predicted positive when it contains more
# distinct positive marker words than negative ones (substring match).
positive_words = ('love', 'great', 'best', 'wonderful')
negative_words = ('worst', 'awful', '1/10', 'crap')


def _count_marker_words(reviews, marker_words):
    """Return, per review, how many of *marker_words* occur as substrings."""
    return reviews.apply(
        lambda text: sum(word in text for word in marker_words)
    )


positives_count = _count_marker_words(test_df['review'], positive_words)
negatives_count = _count_marker_words(test_df['review'], negative_words)
is_positive = positives_count > negatives_count

# Share of test rows where the heuristic agrees with the label.
correct_count = (is_positive == test_df['is_positive']).values.sum()
accuracy = correct_count / len(test_df)

print('Test accuracy = {:.2%}'.format(accuracy))
if accuracy > 0.71:
    # Show the reward picture once the accuracy threshold is beaten.
    from IPython.display import Image, display
    display(Image('https://s3.amazonaws.com/achgen360/t/rmmoZsub.png', width=500))

In [None]:
# Compiled pattern for the literal '<br />' tag found in the raw reviews.
pattern = re.compile('<br />')

# Preview the substitution on one training review (row position 3).
sample_review = train_df['review'].iloc[3]
print(pattern.sub(' ', sample_review))

In [None]:
# Replace every '<br />' tag with a single space in both splits.
train_df['review'] = train_df['review'].apply(lambda text: pattern.sub(' ', text))
test_df['review'] = test_df['review'].apply(lambda text: pattern.sub(' ', text))

In [None]:
def replase_words(text, dict_):
    """Replace space-separated words of *text* using a lookup table.

    The text is split on single spaces and each piece is stripped of
    surrounding whitespace. A piece that is a key of *dict_* is replaced by
    the mapped value; any other piece is kept as is.

    Args:
        text: Input string.
        dict_: Mapping from a word to its replacement string.

    Returns:
        The rebuilt string. Every piece (including the empty pieces produced
        by consecutive spaces) is prefixed with one space, so the result
        always starts with a space.
    """
    pieces = (piece.strip() for piece in text.split(' '))
    # join() builds the result in one pass instead of repeated `+=`.
    return ''.join(' ' + dict_.get(piece, piece) for piece in pieces)

In [None]:
def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order:
      1. remove ``@`` together with any word characters that follow it;
      2. replace words via ``emoticon_dict``, ``apostrophe_dict`` and
         ``short_word_dict`` (defined outside this block; contents not
         visible here);
      3. replace every character that is neither a word character nor
         whitespace with a space;
      4. replace every character outside ``[a-zA-Z0-9_]`` with a space.

    Args:
        text: Raw review text.

    Returns:
        The cleaned string.
    """
    # Raw strings keep `\w` / `\s` as regex escapes instead of relying on
    # Python passing unknown string escapes through (a SyntaxWarning on
    # recent interpreters).
    text = re.sub(r"@\w*", "", text)
    text = replase_words(text, emoticon_dict)
    text = replase_words(text, apostrophe_dict)
    text = replase_words(text, short_word_dict)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"[^a-zA-Z0-9_]", " ", text)
    return text

In [None]:
# Run the text cleaner over every review of both splits.
train_df['review'] = train_df['review'].apply(clean_text)
test_df['review'] = test_df['review'].apply(clean_text)

In [None]:
# Display the training frame after cleaning.
train_df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy demonstration of the bag-of-words representation on two sentences.
vectorizer = CountVectorizer()

dummy_data = ['The movie was excellent',
              'the movie was awful']

dummy_matrix = vectorizer.fit_transform(dummy_data)

# Dense document-term count matrix (one row per sentence).
print(dummy_matrix.toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
try:
    feature_names = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
# Fit a fresh bag-of-words vocabulary on the training reviews.
vectorizer = CountVectorizer()
vectorizer.fit(train_df['review'].values)

In [None]:
# Vectorize a single training review (row position 3) and display the result.
vectorizer.transform([train_df['review'].iloc[3]])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Toy pipeline: bag-of-words features followed by logistic regression,
# trained on two labelled sentences to inspect the learned weights.
dummy_data = ['The movie was excellent',
              'the movie was awful']
dummy_labels = [1, 0]

vectorizer = CountVectorizer()
classifier = LogisticRegression()

model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

model.fit(dummy_data, dummy_labels)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
try:
    feature_names = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
# One coefficient per vocabulary entry, in the same order as the names above.
print(classifier.coef_)

In [None]:
# Refit the same pipeline (vectorizer + classifier) on the real training data.
model.fit(train_df['review'], train_df['is_positive'])

In [None]:
from sklearn.metrics import accuracy_score

def eval_model(model, test_df):
    """Print the accuracy of *model* on a labelled frame.

    Args:
        model: Object with a ``predict`` method that accepts the ``review``
            column.
        test_df: Frame with a ``review`` column (inputs) and an
            ``is_positive`` column (ground-truth labels).

    Returns:
        None. The accuracy is printed as a percentage.
    """
    predicted_labels = model.predict(test_df['review'])
    true_labels = test_df['is_positive']
    score = accuracy_score(true_labels, predicted_labels)
    print('Test accuracy = {:.2%}'.format(score))

In [None]:
# Report test accuracy of the pipeline fitted on the cleaned reviews.
eval_model(model, test_df)

In [None]:
pip install eli5==0.13.0

In [None]:
import eli5
# Show the 50 highest-weighted features of the classifier, using the
# vectorizer to map feature indices to names.
eli5.show_weights(classifier, vec = vectorizer, top = 50)

In [None]:
# Print the true label of the test review at position 1, then show eli5's
# explanation of the classifier's prediction for the 'positive' target.
print('Positive' if test_df['is_positive'].iloc[1] else 'Negative')
eli5.show_prediction(classifier, test_df['review'].iloc[1], vec=vectorizer, 
                     targets=['positive'], target_names=['negative', 'positive'])

In [None]:
# Same explanation as for position 1, for the test review at position 6.
print('Positive' if test_df['is_positive'].iloc[6] else 'Negative')
eli5.show_prediction(classifier, test_df['review'].iloc[6], vec=vectorizer, 
                     targets=['positive'], target_names=['negative', 'positive'])

In [None]:
import numpy as np

# Pick one misclassified test review at random and explain the prediction.
preds = model.predict(test_df['review'])
is_wrong = preds != test_df['is_positive']
wrong_positions = np.asarray(is_wrong).nonzero()[0]
incorrect_pred_index = np.random.choice(wrong_positions)

eli5.show_prediction(
    classifier,
    test_df['review'].iloc[incorrect_pred_index],
    vec=vectorizer,
    targets=['positive'],
    target_names=['negative', 'positive'],
)

### Проверьте, повысилось ли качество стандартных подходов при использовании лемматизации по сравнению с вариантом без неё

In [None]:
# Download the large English spaCy model via a shell command.
# NOTE(review): `!python` is resolved through the shell PATH, which may not
# be the interpreter the kernel runs on — confirm the model lands in the
# kernel's environment.
!python -m spacy download en_core_web_lg

In [None]:
# Load the large English spaCy pipeline with the "ner" component disabled.
# NOTE(review): tagging_text() further down reads token.ent_type_, which is
# presumably filled in by the "ner" component — confirm that NER is enabled
# again (or the pipeline reloaded) before the tagging cells run.
nlp = spacy.load("en_core_web_lg", disable=["ner"])

In [None]:
def lemmatize_text(text):
    """Return *text* with each token replaced by its stripped lemma.

    The text is run through the module-level ``nlp`` pipeline and the lemmas
    are joined back together with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_.strip())
    return " ".join(lemmas)

In [None]:
# Work on independent copies: a plain assignment only binds a second name to
# the same DataFrame, so lemmatizing 'review' would overwrite the text in
# train_df / test_df as well.
train_lem_df = train_df.copy()
test_lem_df = test_df.copy()

In [None]:
# Lemmatize every training review (progress_apply shows a progress bar).
train_lem_df['review'] = train_lem_df['review'].progress_apply(lemmatize_text)

In [None]:
# Lemmatize every test review (progress_apply shows a progress bar).
test_lem_df['review'] = test_lem_df['review'].progress_apply(lemmatize_text)

In [None]:
# Display the first five rows of the lemmatized training frame.
train_lem_df.head(5)

In [None]:
# Cache the lemmatized frames on disk so the slow spaCy pass can be skipped.
for pickle_path, frame in (('train_docs.pkl', train_lem_df),
                           ('test_docs.pkl', test_lem_df)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(frame, f)

In [None]:
# Reload the lemmatized frames from the pickle files written by the previous
# cell. After this, the names refer to newly unpickled objects.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open('train_docs.pkl', 'rb') as f:
    train_lem_df = pickle.load(f)
    
with open('test_docs.pkl', 'rb') as f:
    test_lem_df = pickle.load(f)

In [None]:
# Display the reloaded lemmatized training frame.
train_lem_df

In [None]:
# Unigram bag-of-words + logistic regression on the lemmatized reviews.
vectorizer = CountVectorizer()
classifier = LogisticRegression()
pipeline_steps = [('vectorizer', vectorizer), ('classifier', classifier)]
model = Pipeline(steps=pipeline_steps)

model.fit(X=train_lem_df['review'], y=train_lem_df['is_positive'])

eval_model(model=model, test_df=test_lem_df)

In [None]:
# Unigram + bigram bag-of-words (capped at 20000 features) + logistic
# regression on the lemmatized reviews.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    analyzer='word',
)
classifier = LogisticRegression()
pipeline_steps = [('vectorizer', vectorizer), ('classifier', classifier)]
model = Pipeline(steps=pipeline_steps)

model.fit(X=train_lem_df['review'], y=train_lem_df['is_positive'])

eval_model(model=model, test_df=test_lem_df)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [None]:
def tagging_text(text):
    """Replace entity tokens with their entity type and collapse repeats.

    The text is run through the module-level ``nlp`` pipeline. A token with a
    non-empty ``ent_type_`` is replaced by that label; any other token keeps
    its text. Runs of identical consecutive items are then reduced to one
    item, and the items are joined with single spaces.

    NOTE(review): ``ent_type_`` is presumably only populated when the
    pipeline's NER component runs; the ``spacy.load`` call visible in this
    file disables "ner" — confirm the pipeline in use here has it enabled.

    Args:
        text: Input string.

    Returns:
        The transformed string ("" when the document has no tokens).
    """
    doc = nlp(text)
    tokens = [(token.ent_type_ or token.text).strip() for token in doc]

    # Keep an item unless it repeats its predecessor. The first item has no
    # predecessor and is always kept (starting the scan at index 1 would
    # drop it).
    collapsed = [
        current
        for position, current in enumerate(tokens)
        if position == 0 or current != tokens[position - 1]
    ]
    return " ".join(collapsed)

In [None]:
# Work on independent copies: a plain assignment only binds a second name to
# the same DataFrame, so tagging 'review' would overwrite the lemmatized text
# and the later "lemmatized vs tagged" comparison would use identical data.
train_tag_df = train_lem_df.copy()
test_tag_df = test_lem_df.copy()

In [None]:
# Apply entity tagging to every training review (with a progress bar).
train_tag_df['review'] = train_tag_df['review'].progress_apply(tagging_text)

In [None]:
# Apply entity tagging to every test review (with a progress bar).
test_tag_df['review'] = test_tag_df['review'].progress_apply(tagging_text)

In [None]:
# Display the tagged review stored under index label 0.
train_tag_df['review'][0]

In [None]:
# Cache the tagged frames on disk ...
with open('train_tags.pkl', 'wb') as f:
    pickle.dump(train_tag_df, f)

with open('test_tags.pkl', 'wb') as f:
    pickle.dump(test_tag_df, f)

# ... and read them back. NOTE: pickle.load executes arbitrary code from the
# file; only load pickles produced by this notebook.
with open('train_tags.pkl', 'rb') as f:
    train_tag_df = pickle.load(f)

with open('test_tags.pkl', 'rb') as f:
    # The closing parenthesis was missing here, which made the whole cell a
    # SyntaxError so none of the statements above ran.
    test_tag_df = pickle.load(f)

In [None]:
# Unigram bag-of-words + logistic regression on the tagged reviews.
vectorizer = CountVectorizer()
classifier = LogisticRegression()
pipeline_steps = [('vectorizer', vectorizer), ('classifier', classifier)]
model = Pipeline(steps=pipeline_steps)

model.fit(X=train_tag_df['review'], y=train_tag_df['is_positive'])

eval_model(model=model, test_df=test_tag_df)

In [None]:
# Unigram + bigram bag-of-words (capped at 20000 features) + logistic
# regression on the tagged reviews.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    analyzer='word',
)
classifier = LogisticRegression()
pipeline_steps = [('vectorizer', vectorizer), ('classifier', classifier)]
model = Pipeline(steps=pipeline_steps)

model.fit(X=train_tag_df['review'], y=train_tag_df['is_positive'])

eval_model(model=model, test_df=test_tag_df)

### Запустите классификатор и модели на нейронных сетях

In [None]:
import matplotlib.pyplot as plt 
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout, Conv1D, BatchNormalization, MaxPooling1D#, GlobalAveragePooling

In [None]:
from collections import Counter

# Count lowercased, whitespace-separated tokens over the tagged train reviews.
words_counter = Counter(
    token
    for review in train_tag_df.review
    for token in review.lower().split()
)

# Index 0 is the empty/padding entry and index 1 is '<unk>'; every word seen
# at least 10 times gets the next free index, most frequent first.
word2idx = {'': 0, '<unk>': 1}
for word, count in words_counter.most_common():
    if count >= 10:
        word2idx[word] = len(word2idx)
    else:
        # most_common() is sorted by count, so nothing further qualifies.
        break

print('Words count', len(word2idx))

In [None]:
def convert(texts, word2idx, max_text_len):
    """Encode texts as a left-padded integer matrix of vocabulary indices.

    Each text is split on whitespace; words present in *word2idx* are mapped
    to their index and all other words are skipped. Only the last
    *max_text_len* indices of a text are kept, and shorter rows are padded
    with zeros on the left.

    NOTE(review): the lookup is case-sensitive, while the vocabulary cell in
    this file lowercases text before counting, so words containing uppercase
    characters will not match. Unknown words are dropped rather than mapped
    to the '<unk>' entry. Confirm both are intended.

    Args:
        texts: Sized iterable of strings (e.g. a list or a pandas Series).
        word2idx: Mapping from word to integer index.
        max_text_len: Number of columns of the result.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(texts), max_text_len)``.
    """
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    data = np.zeros((len(texts), max_text_len), dtype=int)

    for row, text in enumerate(texts):
        ids = [word2idx[word] for word in text.split() if word in word2idx]
        # Keep the trailing window. Slicing from an explicit start also
        # handles max_text_len == 0, where `ids[-0:]` would be the whole list.
        ids = ids[max(len(ids) - max_text_len, 0):]
        if ids:
            data[row, max_text_len - len(ids):] = ids
    return data

In [None]:
# Encode the tagged reviews as index matrices with 1000 columns each.
X_train = convert(train_tag_df.review, word2idx, 1000)
X_test = convert(test_tag_df.review, word2idx, 1000)

In [None]:
# Feed-forward classifier over token indices: embedding (one row per
# vocabulary entry, 256 dims) -> global max pooling -> two ReLU dense layers
# with dropout 0.2 -> a single sigmoid output unit.
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=256, input_shape=(X_train.shape[1],)),
    GlobalMaxPooling1D(),
    Dense(units=256, activation='relu'),
    Dropout(0.2),
    Dense(units=128, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])


# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy
# as the reported metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train for 5 epochs with batches of 1024 on the tagged data.
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics come from the same data that model.evaluate uses
# afterwards — confirm this is intended.
model.fit(X_train, train_tag_df.is_positive, batch_size=1024, epochs=5, 
          validation_data=(X_test, test_tag_df.is_positive))

In [None]:
# Loss and accuracy of the trained network on the tagged test split.
model.evaluate(X_test, test_tag_df.is_positive, batch_size=1024)

In [None]:
# Encode the lemmatized reviews as index matrices with 1000 columns each.
# NOTE(review): word2idx is built from train_tag_df.review in the vocabulary
# cell of this file and is reused here unchanged for the lemmatized frames —
# confirm the vocabulary should not be rebuilt from train_lem_df.
X_train = convert(train_lem_df.review, word2idx, 1000)
X_test = convert(test_lem_df.review, word2idx, 1000)

In [None]:
# Same architecture as for the tagged data, re-created so training starts
# from fresh weights: embedding -> global max pooling -> two ReLU dense
# layers with dropout 0.2 -> a single sigmoid output unit.
model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=256, input_shape=(X_train.shape[1],)),
    GlobalMaxPooling1D(),
    Dense(units=256, activation='relu'),
    Dropout(0.2),
    Dense(units=128, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy
# as the reported metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train for 5 epochs with batches of 1024 on the lemmatized data.
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# validation metrics come from the same data that model.evaluate uses
# afterwards — confirm this is intended.
model.fit(X_train, train_lem_df.is_positive, batch_size=1024, epochs=5, 
          validation_data=(X_test, test_lem_df.is_positive))

In [None]:
# Loss and accuracy of the trained network on the lemmatized test split.
model.evaluate(X_test, test_lem_df.is_positive, batch_size=1024)

Модель, обученная на токенизированном датасете, показывает лучшее качество, чем модель, обученная только на лемматизированном.