In [None]:
import ast
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import utils
from keras.layers import Dense, Activation, Dropout, GlobalMaxPooling1D, Conv1D, Embedding, SpatialDropout1D, LSTM
from keras.models import Sequential
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [None]:
# Load the "from_big" corpus (one text per line).
# Lines are stripped BEFORE de-duplication so that entries differing only in
# surrounding whitespace collapse into one; dict.fromkeys() removes duplicates
# while keeping the file order, whereas iterating a set() yields a
# hash-dependent order that changes between kernel restarts.
with open("from_big_merged.txt", "r", encoding="utf8") as f:
    stripped_lines = (line.strip() for line in f.read().splitlines())
    # Keep only lines longer than 10 characters after stripping.
    from_big = list(dict.fromkeys(line for line in stripped_lines if len(line) > 10))

In [3]:
# Load the "not_from_big" corpus (one text per line).
# The result is bound to `not_from_big`, the name the dataset-building cell reads.
# Lines are stripped BEFORE de-duplication so that entries differing only in
# surrounding whitespace collapse into one; dict.fromkeys() removes duplicates
# while keeping the file order (set() iteration order is hash-dependent).
with open("not_from_big_merged.txt", "r", encoding="utf8") as f:
    stripped_lines = (line.strip() for line in f.read().splitlines())
    # Keep only lines longer than 10 characters after stripping.
    not_from_big = list(dict.fromkeys(line for line in stripped_lines if len(line) > 10))

In [4]:
# Sanity check: how many distinct "from_big" lines passed the length filter.
print(len(from_big))

8488


In [None]:
import random

# Label convention: 0 = text from the "from_big" corpus, 1 = text from "not_from_big".
dataset = [(sample, 0) for sample in from_big]
dataset += [(sample, 1) for sample in not_from_big]
print(len(dataset))

# De-duplicate first and shuffle afterwards: converting to a set AFTER shuffling
# would throw the shuffled order away again. sorted() gives a canonical starting
# order, and the fixed seed makes the final row order repeatable across runs.
# NOTE: (text, 0) and (text, 1) are different tuples, so a text that occurs in
# both corpora stays in the dataset once per label.
dataset = sorted(set(dataset))
random.Random(42).shuffle(dataset)
print(len(dataset))


In [None]:
# Turn the list of (text, label) pairs into two parallel columns,
# then wrap them in a DataFrame with 'data' and 'target' columns.
columns = {}
for sample, label in dataset:
    columns.setdefault('data', []).append(sample)
    columns.setdefault('target', []).append(label)

df = pd.DataFrame(columns)

In [None]:
import re
from nltk.corpus import stopwords

# Punctuation that is replaced by a space. Raw string: '\[' and '\|' are regex
# escapes, not Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything outside this allow-list is deleted. 'ё' is listed explicitly because
# it lies outside the а-я code-point range (U+0430..U+044F).
BAD_SYMBOLS_RE = re.compile(r'[^0-9а-яё #+_]')
# Russian stop words from NLTK, as a set for O(1) membership tests.
STOPWORDS = set(stopwords.words('russian'))

def clean_text(text: str) -> str:
    """
        Normalise one raw text before vectorisation.

        Steps, in order: lowercase; fold 'ё' into 'е'; replace the punctuation
        matched by REPLACE_BY_SPACE_RE with spaces; delete every character
        matched by BAD_SYMBOLS_RE; drop tokens listed in STOPWORDS; re-join the
        remaining tokens with single spaces.

        text: a string

        return: the cleaned string (may be empty)
    """
    text = text.lower()
    # Fold 'ё' into 'е' so both spellings of a word map to the same token
    # (and the letter cannot be lost to the symbol filter below).
    text = text.replace('ё', 'е')
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    kept_words = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(kept_words)

# Clean every text, then strip digit runs.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would leave the digits untouched.
df['data'] = df['data'].apply(clean_text)
df['data'] = df['data'].str.replace(r'\d+', '', regex=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# selection of rows the same on every run over the same DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    df['data'],
    df['target'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Baseline 1: default TF-IDF features fed to logistic regression.
vec = TfidfVectorizer()
clf = LogisticRegression()

# fit() returns the fitted pipeline itself, so build and train in one expression.
sgd = make_pipeline(vec, clf).fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Accuracy is symmetric in its two arguments; they are passed as (y_true, y_pred).
baseline_accuracy = accuracy_score(y_test, y_pred)
print('accuracy %s' % baseline_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Baseline 2: default TF-IDF features fed to a linear SVM.
vec = TfidfVectorizer()
clf = LinearSVC()

# fit() returns the fitted pipeline itself, so build and train in one expression.
sgd = make_pipeline(vec, clf).fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Accuracy is symmetric in its two arguments; they are passed as (y_true, y_pred).
baseline_accuracy = accuracy_score(y_test, y_pred)
print('accuracy %s' % baseline_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Character n-gram (2-4 chars, within word boundaries) TF-IDF + linear SVM.
# No `tokenizer=` is passed: scikit-learn only uses a tokenizer when
# analyzer == 'word' and emits a UserWarning if one is supplied otherwise.
sgd = Pipeline([
    ('vect', TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4),
                             sublinear_tf=True, max_df=0.95)),
    ('clf', LinearSVC(C=0.6)),
])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Character n-gram (1-4 chars, within word boundaries) TF-IDF + logistic regression.
# No `tokenizer=` is passed: scikit-learn only uses a tokenizer when
# analyzer == 'word' and emits a UserWarning if one is supplied otherwise.
sgd = Pipeline([
    ('vect', TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                             sublinear_tf=True, max_df=0.95)),
    ('clf', LogisticRegression(C=0.9)),
])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Spot-check the current pipeline on two hand-written texts.
# The pipeline was fitted on clean_text()-normalised text, so the samples go
# through the same normalisation before being scored. (The training data also
# had digit runs stripped; these samples contain no digits.)
sample_texts = [
    "у нас в Москве всё хорошо. В кольце живется лучше, чем за кольцом, но в целом все довольны",
    "денег нет, мушщина альфонс, бездельничает и деньги прожигает",
]
for sample_text in sample_texts:
    print(sgd.predict([clean_text(sample_text)]))

In [None]:
import eli5

# Explain the pipeline currently bound to `sgd` (the one used for the sample
# predictions) by pulling its own fitted steps, rather than the bare `clf` / `vec`
# globals, which still point at an earlier baseline model.
# Class names follow the dataset labels: 0 = from_big, 1 = not_from_big.
eli5.show_weights(sgd.named_steps['clf'], vec=sgd.named_steps['vect'], top=50,
                  target_names=['from_big', 'not_from_big'])

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    history: an object whose ``.history`` dict holds per-epoch lists under
             'loss' and 'val_loss', plus accuracy under either
             'acc' / 'val_acc' or 'accuracy' / 'val_accuracy'.
    """
    logs = history.history
    # The accuracy key depends on how the metric was named at compile time.
    train_acc = logs.get('acc') or logs.get('accuracy')
    valid_acc = logs.get('val_acc') or logs.get('val_accuracy')
    train_loss, valid_loss = logs['loss'], logs['val_loss']
    epoch_numbers = range(1, len(train_acc) + 1)

    # One entry per panel:
    # (train curve, train label, validation curve, validation label, title).
    panels = (
        (train_acc, 'Training acc', valid_acc, 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, 'Training loss', valid_loss, 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_curve, train_label, valid_curve, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epoch_numbers, train_curve, 'b', label=train_label)
        plt.plot(epoch_numbers, valid_curve, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [None]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, GlobalMaxPooling1D, Conv1D, Embedding, SpatialDropout1D, LSTM
from keras.preprocessing import text, sequence
from keras import utils

# Split with the same call (test_size=0.20, random_state=42) as the scikit-learn
# baselines, so the neural models are trained and scored on the same rows.
# A positional 80/20 slice would instead depend on the row order of `df`.
train_posts, test_posts, train_tags, test_tags = train_test_split(
    df['data'], df['target'], test_size=0.20, random_state=42)

# Kept for backward compatibility: number of training rows.
train_len = len(train_posts)

max_words = 15000  # passed to the tokenizer as num_words
maxlen = 600       # target length handed to pad_sequences

# Word-level tokenizer whose vocabulary is learned from the training posts only.
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)

# Texts -> lists of integer word ids.
train_seqs, test_seqs = (tokenize.texts_to_sequences(posts)
                         for posts in (train_posts, test_posts))

print('Pad sequences (samples x time)')
# Lists of ids -> 2-D arrays with `maxlen` columns.
x_train, x_test = (sequence.pad_sequences(seqs, maxlen=maxlen)
                   for seqs in (train_seqs, test_seqs))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)



# Encode the class labels as integers, then one-hot them for the softmax output.
# NOTE: this rebinds y_train / y_test, replacing the label Series produced by the
# earlier train_test_split cell.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)

num_classes = y_train.max() + 1
y_train, y_test = (utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test))

# Shared hyper-parameters. NOTE: the fit cells pass epochs=5 explicitly and do
# not read `epochs`.
embedding_dim, batch_size, epochs = 100, 32, 10

In [None]:
# Build the model: embedding -> 1-D convolution -> global max pooling -> dense -> softmax.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Conv1D(filters=128,
                 kernel_size=3,
                 activation='relu'))
# Global max pooling keeps one value per filter.
model.add(GlobalMaxPooling1D())

# Vanilla hidden layer with 256 units.
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Project onto one output unit per class and activate with softmax.
# The width follows num_classes so it always matches the one-hot targets.
model.add(Dense(int(num_classes)))
model.add(Activation('softmax'))

# Targets are one-hot vectors and the output is a softmax distribution, so the
# matching loss is categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 5 epochs, reserving 20% of the training arrays for validation.
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=batch_size,
    verbose=1,
)
# Alternative: validate on the test set instead -> validation_data=(x_test, y_test)

In [None]:
# Report accuracy on the training data and on the test data, then draw the
# learning curves of the last fit.
for report_template, inputs, targets in (
    ("Training Accuracy: {:.4f}", x_train, y_train),
    ("Testing Accuracy:  {:.4f}", x_test, y_test),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(report_template.format(accuracy))
plot_history(history)

In [None]:
# Build the model: embedding -> 1-D convolution -> global max pooling -> dense -> softmax.
# Variant with a smaller hidden layer (128 units) and stronger dropout (0.5).
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Conv1D(filters=128,
                 kernel_size=3,
                 activation='relu'))
# Global max pooling keeps one value per filter.
model.add(GlobalMaxPooling1D())

# Vanilla hidden layer with 128 units.
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Project onto one output unit per class and activate with softmax.
# The width follows num_classes so it always matches the one-hot targets.
model.add(Dense(int(num_classes)))
model.add(Activation('softmax'))

# Targets are one-hot vectors and the output is a softmax distribution, so the
# matching loss is categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 5 epochs, reserving 20% of the training arrays for validation.
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=batch_size,
    verbose=1,
)
# Alternative: validate on the test set instead -> validation_data=(x_test, y_test)

In [None]:
# Report accuracy on the training data and on the test data, then draw the
# learning curves of the last fit.
for report_template, inputs, targets in (
    ("Training Accuracy: {:.4f}", x_train, y_train),
    ("Testing Accuracy:  {:.4f}", x_test, y_test),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(report_template.format(accuracy))
plot_history(history)

In [None]:
from keras.callbacks import EarlyStopping

# LSTM classifier: embedding -> spatial dropout -> LSTM(100) -> softmax.
# The embedding size and sequence length come from the shared settings
# (embedding_dim, maxlen) so they cannot drift from the padded inputs.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per class, matching the one-hot targets.
model.add(Dense(int(num_classes), activation='softmax'))
# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:
# Train for up to 5 epochs with 20% of the training arrays held out for validation.
# patience must be smaller than the number of epochs for early stopping to be able
# to trigger at all; 2 stops training after two consecutive epochs without a
# val_loss improvement of at least min_delta.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, min_delta=0.0001)

history = model.fit(x_train,
                    y_train,
                    epochs=5,
                    batch_size=64,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[early_stopping])

In [None]:
# Report accuracy on the training data and on the test data, then draw the
# learning curves of the last fit.
for report_template, inputs, targets in (
    ("Training Accuracy: {:.4f}", x_train, y_train),
    ("Testing Accuracy:  {:.4f}", x_test, y_test),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(report_template.format(accuracy))
plot_history(history)

In [None]:
from keras.callbacks import EarlyStopping

# LSTM classifier, more heavily regularised variant:
# embedding -> spatial dropout (0.5) -> LSTM(64) -> softmax.
# The embedding size and sequence length come from the shared settings
# (embedding_dim, maxlen) so they cannot drift from the padded inputs.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5))
# One output unit per class, matching the one-hot targets.
model.add(Dense(int(num_classes), activation='softmax'))
# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:
# Train for up to 5 epochs with 20% of the training arrays held out for validation.
# patience must be smaller than the number of epochs for early stopping to be able
# to trigger at all; 2 stops training after two consecutive epochs without a
# val_loss improvement of at least min_delta.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, min_delta=0.0001)

history = model.fit(x_train,
                    y_train,
                    epochs=5,
                    batch_size=64,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[early_stopping])

In [None]:
# Report accuracy on the training data and on the test data, then draw the
# learning curves of the last fit.
for report_template, inputs, targets in (
    ("Training Accuracy: {:.4f}", x_train, y_train),
    ("Testing Accuracy:  {:.4f}", x_test, y_test),
):
    loss, accuracy = model.evaluate(inputs, targets, verbose=False)
    print(report_template.format(accuracy))
plot_history(history)