In [1]:
# Main libraries
import numpy as np
import pandas as pd

# import keras as ks
import keras as ks
import nltk
import re
import codecs


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from keras.preprocessing.text import Tokenizer

%matplotlib inline

In [2]:
def load_data_from_arrays(strings, labels, train_test_split=0.9):
    """Split two parallel sequences into train and test parts (no shuffling).

    The leading ``test_size`` items become the test set and everything after
    them becomes the training set, where
    ``test_size = len(strings) - round(len(strings) * train_test_split)``.
    The size of every part is printed.

    Args:
        strings: sliceable sequence of inputs.
        labels: sliceable sequence of targets, sliced with the same bounds.
        train_test_split: fraction of items assigned to the training set.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    total = len(strings)
    test_size = int(total - round(total * train_test_split))
    print("Test size: {}".format(test_size))

    # The head of each sequence is held out; the tail is used for training.
    x_test, x_train = strings[:test_size], strings[test_size:]
    y_test, y_train = labels[:test_size], labels[test_size:]

    print("\nTraining set:")
    print("\t - x_train: {}".format(len(x_train)))
    print("\t - y_train: {}".format(len(y_train)))

    print("\nTesting set:")
    print("\t - x_test: {}".format(len(x_test)))
    print("\t - y_test: {}".format(len(y_test)))

    return x_train, y_train, x_test, y_test

In [3]:
# Training inputs (`title_clean`) and labels (`target`).
# NOTE(review): `clean_train_df` / `clean_test_df` are not created in the visible cells.
descriptions, categories = clean_train_df['title_clean'], clean_train_df['target']

# Titles of the set that predictions are produced for later on.
descriptions_test = clean_test_df['title_clean']

In [4]:
# Same two columns, taken from the "encoded" variant of the training frame.
# NOTE(review): `clean_train_encoded_df` is not created in the visible cells.
descriptions_encoded, categories_encoded = (
    clean_train_encoded_df['title_clean'],
    clean_train_encoded_df['target'],
)

In [5]:
# `title_clean` column of the "encoded" test frame.
# NOTE(review): `clean_test_encoded_df` is not created in the visible cells.
descriptions_test_encoded = clean_test_encoded_df['title_clean']

In [6]:
# Preview the first five training titles.
descriptions[:5]

In [7]:
# Preview the first five training targets.
categories[:5]

In [8]:
# Preview the first five titles of the "encoded" training frame.
descriptions_encoded[:5]

In [9]:
# Preview the first five targets of the "encoded" training frame.
categories_encoded[:5]

In [10]:
# Preview the first five test titles.
descriptions_test[:5]

In [11]:
# Preview the first five titles of the "encoded" test frame.
descriptions_test_encoded[:5]

In [12]:
# Build one shared vocabulary (word -> integer id) from the training titles.
train_texts = descriptions.tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Encode every title as a sequence of integers by replacing words with their vocabulary ids.
textSequences = tokenizer.texts_to_sequences(train_texts)

In [14]:
# # создаем единый словарь (слово -> число) для преобразования
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(descriptions_encoded.tolist())

# # Преобразуем все описания в числовые последовательности, заменяя слова на числа по словарю.
# textSequences = tokenizer.texts_to_sequences(descriptions_encoded.tolist())

In [13]:
# Split the encoded titles 80/20; load_data_from_arrays takes the test part
# from the start of the sequences and does not shuffle.
X_train, y_train, X_test, y_test = load_data_from_arrays(textSequences, categories, train_test_split=0.8)

In [15]:
# X_train, y_train, X_test, y_test = load_data_from_arrays(textSequences, categories_encoded, train_test_split=0.8)

In [16]:
# Number of distinct words the tokenizer has indexed.
vocabulary = tokenizer.word_index
total_words = len(vocabulary)
print('В словаре {} слов'.format(total_words))

In [17]:
from tensorflow.keras.utils import to_categorical
# Number of most frequently used words to keep; also used as the width of the
# bag-of-words input of the dense model.
num_words = 5000

In [18]:
print(u'Преобразуем описания заявок в векторы чисел...')
# The tokenizer is re-created with a `num_words` cap and is not fitted in this
# cell; it is used only to turn the existing id sequences into binary rows.
tokenizer = Tokenizer(num_words=num_words)
X_train, X_test = (
    tokenizer.sequences_to_matrix(split, mode='binary')
    for split in (X_train, X_test)
)

In [19]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets only, then
# apply the same mapping to both splits.
encoder = LabelEncoder().fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

# Class count is derived from the largest encoded training label.
num_classes = np.max(y_train) + 1
print('Количество категорий для классификации: {}'.format(num_classes))

In [20]:
# Report the feature-matrix shapes before touching the labels.
print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

print(u'Преобразуем категории в матрицу двоичных чисел '
      u'(для использования categorical_crossentropy)')
# Replace the integer class ids with one-hot rows of width `num_classes`.
# Note: this overwrites y_train / y_test, so the cell is not meant to be re-run.
y_train, y_test = (
    to_categorical(encoded, num_classes) for encoded in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

In [21]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Dropout

# Number of training epochs.
epochs = 7

print(u'Собираем модель...')
# Bag-of-words classifier: one hidden ReLU layer with dropout, softmax output.
model = Sequential()
model.add(Dense(512, input_shape=(num_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.2))
# Output width follows the number of encoded classes instead of a hard-coded 2,
# so it always matches the one-hot labels built with `num_classes`.
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [22]:
# Train the dense model; the returned history object is kept for later inspection.
fit_options = {'batch_size': 32, 'epochs': epochs, 'verbose': 1}
history = model.fit(X_train, y_train, **fit_options)

In [23]:
# Architecture goes to a JSON file, weights go to an HDF5 file next to it.
model_json = model.to_json()
with open("model_normal.json", "w") as architecture_file:
    architecture_file.write(model_json)

model.save_weights("model_normal.h5")
print("Saved model to disk")

In [24]:
from keras.models import model_from_json

# Read the serialized architecture; the context manager closes the file even
# if reading fails.
with open('model_normal.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Rebuild the model from JSON, then restore the trained weights into it.
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("model_normal.h5")
print("Loaded model from disk")

In [25]:
# Compile the reloaded model with the same loss, optimizer and metric that the
# dense model was compiled with before saving.
compile_options = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
loaded_model.compile(**compile_options)

In [26]:
# Evaluate the reloaded model on the held-out split; the first two entries of
# `score` are reported as loss and accuracy.
score = loaded_model.evaluate(X_test, y_test, verbose=1, batch_size=32)
test_loss, test_accuracy = score[0], score[1]
print()
print(u'Оценка теста: {}'.format(test_loss))
print(u'Оценка точности модели: {}'.format(test_accuracy))

In [27]:
# import matplotlib.pyplot as plt

# # График точности модели
# plt.plot(history.history['accuracy'])
# # plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()
# # График оценки loss
# plt.plot(history.history['loss'])
# # plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [28]:
# score = model.evaluate(X_test, y_test,
#                        batch_size=32, verbose=1)
# print()
# print(u'Оценка теста: {}'.format(score[0]))
# print(u'Оценка точности модели: {}'.format(score[1]))

In [29]:
# Select the test titles again (same expression as the earlier assignment of
# `descriptions_test`).
descriptions_test = clean_test_df['title_clean']

In [30]:
# Build the vocabulary (word -> integer id) from the TRAINING titles, not the
# test titles: the model's input columns correspond to the word ids seen at
# training time, so fitting on the test set would assign different ids and
# make the predictions meaningless.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions.tolist())

# Encode the test titles with that training vocabulary.
textSequences = tokenizer.texts_to_sequences(descriptions_test.tolist())

In [31]:
# X_train_, y_train_, X_test_, y_test_ = load_data_from_arrays(textSequences, categories, train_test_split=.8)

In [32]:
# Unfitted tokenizer carrying only the `num_words` cap; it converts the
# `textSequences` id sequences into binary rows for prediction.
tokenizer = Tokenizer(num_words=num_words)
X_test_ = tokenizer.sequences_to_matrix(textSequences, mode='binary')
# X_train_ = tokenizer.sequences_to_matrix(X_train_, mode='binary')

In [33]:
# Run the reloaded model on the submission matrix; each row of the result is
# arg-maxed in the following cell to pick a class.
y_NN_predict_1 = loaded_model.predict(X_test_)

In [34]:
# Pick, for every prediction row, the index of its highest score.
predicted_lablels = [np.argmax(row_scores) for row_scores in y_NN_predict_1]

In [35]:
# Class index 0 becomes False, every other index becomes True.
predicted_lablels = np.array(predicted_lablels) != 0

In [36]:
import collections

# Tally how many predictions fall into each boolean label.
c = collections.Counter(predicted_lablels)

In [37]:
clean_test_df["target"] = predicted_lablels


# Create file and read in stdout
clean_test_df[["id", "target"]].to_csv("ml_network.csv", index=False)
!cat ml_baseline.csv | head

In [38]:
from keras.preprocessing.text import Tokenizer

# Build one shared vocabulary (word -> integer id) from the training titles.
train_texts = descriptions.tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Encode every title as a sequence of integers by replacing words with their vocabulary ids.
textSequences = tokenizer.texts_to_sequences(train_texts)

In [39]:
X_train, y_train, X_test, y_test = load_data_from_arrays(textSequences, categories, train_test_split=0.8)
# Largest whitespace-separated word count over all training titles
# (0 when there are no titles).
max_words = max(
    (len(title.split()) for title in descriptions.tolist()),
    default=0,
)
print('Максимальное количество слов в самом длинном описании заявки: {} слов'.format(max_words))

# Number of distinct words the tokenizer has counted.
total_unique_words = len(tokenizer.word_counts)
print('Всего уникальных слов в словаре: {}'.format(total_unique_words))

# Sequences are later padded to the length of the longest title.
maxSequenceLength = max_words

In [40]:
# Cap the vocabulary at about one tenth of the distinct words counted by the
# tokenizer, then display the value.
vocab_size = round(total_unique_words/10)
vocab_size

In [41]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets only, then
# apply the same mapping to both splits.
encoder = LabelEncoder().fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

# Class count is derived from the largest encoded training label.
num_classes = np.max(y_train) + 1
print('Количество категорий для классификации: {}'.format(num_classes))

In [42]:
print(u'Преобразуем описания заявок в векторы чисел...')
# Re-create the tokenizer with a `num_words=vocab_size` cap and fit it on the
# training titles again.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(descriptions.tolist())

In [43]:
# The Embedding + LSTM model below consumes padded sequences of integer word
# ids. The original code turned the sequences into binary bag-of-words rows
# here, which the following pad_sequences call then cut down to `maxlen`
# columns of 0/1 values, so the embedding only ever saw ids 0 and 1.
# Keep the id sequences instead and drop ids that fall outside the capped
# vocabulary, so every remaining id is a valid index for
# Embedding(input_dim=vocab_size).
# NOTE(review): assumes X_train / X_test still hold the id sequences produced
# by the split above (lists of lists of ints) — confirm run order.
X_train = [[word_id for word_id in sequence if word_id < vocab_size] for sequence in X_train]
X_test = [[word_id for word_id in sequence if word_id < vocab_size] for sequence in X_test]

In [44]:
from keras.preprocessing.sequence import pad_sequences

# Bring both splits to a common length of `maxSequenceLength`.
X_train, X_test = (
    pad_sequences(split, maxlen=maxSequenceLength) for split in (X_train, X_test)
)

In [45]:
# Report the feature-matrix shapes before touching the labels.
print('Размерность X_train:', X_train.shape)
print('Размерность X_test:', X_test.shape)

print(u'Преобразуем категории в матрицу двоичных чисел '
      u'(для использования categorical_crossentropy)')
# Replace the integer class ids with one-hot rows of width `num_classes`.
# Note: this overwrites y_train / y_test, so the cell is not meant to be re-run.
y_train, y_test = (
    to_categorical(encoded, num_classes) for encoded in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

In [46]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

# Maximum number of distinct word ids the embedding can look up.
max_features = vocab_size

print(u'Собираем модель...')
model = Sequential()
# Note: the embedding vector size is set to `maxSequenceLength` here.
model.add(Embedding(max_features, maxSequenceLength))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
# The labels are one-hot rows built with to_categorical for use with
# categorical_crossentropy, so the output is a softmax over the classes
# (the original paired them with sigmoid + binary_crossentropy).
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [47]:
# Training hyper-parameters for the LSTM run.
batch_size, epochs = 32, 3

print(u'Тренируем модель...')
# The held-out split is passed as validation data for each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
)
