<a href="https://colab.research.google.com/github/PopovMihail1/neuralNetworks/blob/master/cnnTextProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Fetch the training set (read further down as 'train.csv').
!wget https://github.com/ikopeykin/googletest/raw/master/train.csv
# Fetch the fastText "wiki.simple" word-vector file.
# NOTE(review): nothing visible in this notebook loads wiki.simple.vec - confirm
# whether the download is still needed.
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec
# Fetch the test set (read further down as 'test.csv').
!wget https://github.com/ikopeykin/googletest/raw/master/test.csv
  
import pandas
import numpy as np
import re

from gensim.models import KeyedVectors
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score

# English Snowball stemmer shared by the text-cleaning code.
stemmer = SnowballStemmer("english")

# Target columns of train.csv; every per-label loop in this notebook follows
# this order.
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# Together with the old default `warn_bad_lines=True` it meant "warn about a
# malformed row and skip it", which is exactly `on_bad_lines='warn'`.
data = pandas.read_csv('train.csv', on_bad_lines='warn')

test_data = pandas.read_csv('test.csv', on_bad_lines='warn')

# Memo of word -> stem. Comments repeat the same words heavily, so stemming
# each distinct word only once keeps per-word stemming cheap.
_STEM_CACHE = {}


def _stem_word(word):
    """Return the Snowball stem of a single *word*, caching the result."""
    stem = _STEM_CACHE.get(word)
    if stem is None:
        stem = _STEM_CACHE[word] = stemmer.stem(word)
    return stem


def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated, stemmed words.

    The text is lower-cased, common English contractions are expanded,
    every non-word character becomes a space, the chat abbreviations
    "u" and "ur" are expanded, and finally each word is stemmed with the
    module-level ``stemmer``.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned words joined by single spaces (an empty string when
        nothing is left).
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # Must run before the generic "'s" rule below, which would otherwise
    # consume the leading "'s" and leave "cuse" behind.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # \W also covers newlines and tabs, so no separate passes are needed.
    text = re.sub(r"\W", " ", text)
    # Expand the abbreviations only when they stand alone as words; an
    # unanchored 'u' pattern rewrites every letter "u" in the text
    # ("stupid" -> "styoupid").
    text = re.sub(r"\bur\b", "your", text)
    text = re.sub(r"\bu\b", "you", text)
    # The stemmer works on one word at a time; handing it the whole comment
    # treats the comment as a single long word. split() also collapses
    # whitespace runs and drops leading/trailing blanks.
    return " ".join(_stem_word(word) for word in text.split())

def text_to_sent(data):
    """Tokenise the ``comment_text`` column of *data* into lists of words.

    For every comment, parenthesised spans are removed, the remainder is
    passed through ``clean_text``, every run of characters other than a-z
    is treated as a separator, and the result is split into tokens.

    Args:
        data: Object whose ``data['comment_text']`` iterates over the
            comment strings (a pandas DataFrame in this notebook).

    Returns:
        A list with one list of tokens per comment, in row order.
    """
    sentences = []
    # Iterate the column's values directly: this is positional, so it keeps
    # working when the frame's index is not 0..n-1 (a label lookup such as
    # data['comment_text'][i] fails or picks the wrong row after filtering).
    for comment in data['comment_text']:
        without_parens = re.sub(r'\([^)]*\)', '', comment)
        # clean_text lower-cases its input itself.
        cleaned = clean_text(without_parens)
        sentences.append(re.sub(r"[^a-z]+", " ", cleaned).split())

    return sentences

# Label matrix: one row per training comment and one float column per entry
# of LABELS, in LABELS order.
n_comments, n_labels = len(data['comment_text']), len(LABELS)
emot = np.zeros((n_comments, n_labels))
for column, label_name in enumerate(LABELS):
    emot[:, column] = np.asarray(data[label_name])

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

nltk.download('stopwords')

# English stop words plus a few punctuation tokens.
# NOTE(review): nothing visible in this notebook reads stop_words - confirm
# whether stop-word filtering was meant to be applied.
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

# Vocabulary cap, shared by the tokenizer and by nb_words below.
MAX_NB_WORDS = 100000

# (A RegexpTokenizer used to be bound to `tokenizer` here; it was overwritten
# by the Keras tokenizer before anything used it, so it has been dropped.)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)

# Clean and tokenise the training comments once and reuse the result for both
# fitting the vocabulary and building the sequences; running the cleaning
# pipeline over the whole training set twice doubles its cost for no benefit.
train_sentences = text_to_sent(data)
tokenizer.fit_on_texts(train_sentences + text_to_sent(test_data))

word_index = tokenizer.word_index
nb_words = min(MAX_NB_WORDS, len(word_index))

word_seq_train = tokenizer.texts_to_sequences(train_sentences)
# The token lists are no longer needed once they are encoded as integers.
del train_sentences

word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=2500)


import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

# Hyperparameters
batch_size = 128    # samples per gradient update passed to model.fit
num_epochs = 8      # upper bound on the number of training epochs
num_filters = 32    # filters in each Conv1D layer
optimizer = 'adam'

embed_dim = 300     # width of each embedding vector

# Keras logs the validation loss under the name 'val_loss'. 'value_loss' is
# not a metric Keras ever produces, so a callback monitoring it can never
# trigger. Note that 'val_loss' is only reported when fit() is given
# validation data.
early_stopping = EarlyStopping(monitor='val_loss')

def build_cnn_model(embedding_matrix=None, max_len=2500):
    """Build and compile the 1-D convolutional multi-label classifier.

    Layers, in order: Embedding -> Conv1D -> MaxPooling1D -> Conv1D ->
    GlobalMaxPooling1D -> Dropout -> Dense(32, relu) ->
    Dense(len(LABELS), sigmoid). The model is compiled with binary
    cross-entropy and prints its summary before being returned.

    Reads the module-level ``nb_words``, ``embed_dim``, ``num_filters``,
    ``optimizer`` and ``LABELS``.

    Args:
        embedding_matrix: Optional array of shape
            ``(nb_words + 1, embed_dim)`` used as the initial (and, since
            the layer is frozen, final) embedding weights. With the default
            ``None`` the layer keeps Keras' random initialisation.
        max_len: Length of the padded input sequences.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have the expected shape.
    """
    embedding_options = {}
    if embedding_matrix is not None:
        expected_shape = (nb_words + 1, embed_dim)
        if np.shape(embedding_matrix) != expected_shape:
            raise ValueError(
                'embedding_matrix must have shape %r, got %r'
                % (expected_shape, np.shape(embedding_matrix)))
        embedding_options['embeddings_initializer'] = (
            keras.initializers.Constant(embedding_matrix))

    model = Sequential()
    # NOTE(review): the embedding is frozen (trainable=False). Without an
    # embedding_matrix that means random vectors that are never updated -
    # confirm whether pretrained vectors were meant to be supplied here.
    model.add(Embedding(nb_words + 1, embed_dim, input_length=max_len,
                        trainable=False, **embedding_options))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    # One independent sigmoid output per label.
    model.add(Dense(len(LABELS), activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()

    return model

# Hold out a shuffled 10% of the padded training sequences (and their labels).
train_text_set, valid_text_set, train_emot_set, valid_emot_set = train_test_split(
    word_seq_train, emot, test_size=0.1, shuffle=True)

model = build_cnn_model()
# Passing the hold-out split makes Keras compute val_* metrics after every
# epoch. Without validation data those metrics do not exist, so a callback
# that monitors validation loss can never act.
# Note: the same split is scored with ROC AUC afterwards, so that score is
# measured on data that also influenced when training stopped.
hist = model.fit(train_text_set, train_emot_set,
                 batch_size=batch_size, epochs=num_epochs,
                 shuffle=True, verbose=2,
                 validation_data=(valid_text_set, valid_emot_set),
                 callbacks=[early_stopping])

# Score the held-out split: one ROC AUC per label column.
predicted_values = model.predict(valid_text_set, verbose=True)

for column, label_name in enumerate(LABELS):
    expected = valid_emot_set[:, column]
    predicted = predicted_values[:, column]
    print('Result for ', label_name, ' is ', roc_auc_score(expected, predicted))


# Encode the test comments with the same tokenizer and padding length that
# were used for the training data.
sent = text_to_sent(test_data)
# NOTE: despite its name, word_seq_train holds the *test* sequences from here
# on. The name is kept on purpose: rebinding it drops this name's reference
# to the large padded training matrix instead of keeping both alive.
word_seq_train = tokenizer.texts_to_sequences(sent)
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=2500)

# One row of per-label scores for each test comment. (A zero-filled
# placeholder used to be assigned to `results` first; it was overwritten
# immediately, so it has been removed.)
results = model.predict(word_seq_train, verbose=False)

import csv

# Write one row per test comment: its id followed by the predicted score for
# each label.
with open('results.csv', 'w', newline='') as csv_file:
    # Build the header from LABELS so the CSV columns cannot drift away from
    # the label order used everywhere else in the notebook.
    fieldnames = ['id'] + LABELS
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    # zip pairs ids and prediction rows by position, so the pairing does not
    # depend on the DataFrame's index labels.
    for comment_id, scores in zip(test_data['id'], results):
        row = {'id': comment_id}
        row.update(zip(LABELS, scores))
        csv_writer.writerow(row)

# Hand the predictions file to the user via google.colab's download helper.
from google.colab import files
files.download('results.csv')

# Render a diagram of the model (including layer shapes) to model.png.
from keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)

# List the working directory (long format, reverse order).
!ls -lr

# Download the rendered diagram as well.
files.download('model.png')