## Using 2-Layer Neural Network to do Toxic Comment Classification

In [3]:
import pandas as pd, numpy as np

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
subm = pd.read_csv('./sample_submission.csv')

trainingdata = train.comment_text
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

test_labels = pd.read_csv('./test_labels.csv')
test_labels_filter = test_labels[test_labels['toxic']>-1]
test_filter = test[test.id.isin(test_labels_filter.id)]

In [4]:
import random
indexes = random.sample(range(1, trainingdata.shape[0]), 10000)

In [5]:
trainingdata = trainingdata[indexes]

### Step1. Generate Features by TFIDFVectorizer

Vectorize each comment, so it can be the input of Neural Networks.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words='english', max_df = 0.7)
X_train_tfidf = vect.fit_transform(trainingdata)
X_train_tfidf.shape

(10000, 35707)

In [7]:
train_x = X_train_tfidf.toarray()

In [8]:
train_y = train["toxic"][indexes].values

In [9]:
split_size = int(train_x.shape[0]*0.7)

train_x, val_x = train_x[:split_size], train_x[split_size:]
train_y, val_y = train_y[:split_size], train_y[split_size:]

### Step2. Build a computational graph by tensorflow

Define Variables

In [10]:
# To stop potential randomness
# import tensorflow
import tensorflow as tf
import numpy as np
seed = 128
rng = np.random.RandomState(seed)

In [11]:
### set all variables

# number of neurons in each layer
input_num_units = X_train_tfidf.shape[1]
hidden_num_units = 100
output_num_units = 2

In [12]:
# define placeholders
x = tf.placeholder(tf.float32, [None, input_num_units])
y = tf.placeholder(tf.float32, [None, output_num_units])

# set remaining variables
epochs = 3
batch_size = 64
learning_rate = 0.005

### define weights and biases of the neural network (refer this article if you don't understand the terminologies)

weights = {
    'hidden': tf.Variable(tf.random_normal([input_num_units, hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([hidden_num_units, output_num_units], seed=seed))
}

biases = {
    'hidden': tf.Variable(tf.random_normal([hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([output_num_units], seed=seed))
}

Define hidden_layer and output_layer

In [13]:
hidden_layer = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
hidden_layer = tf.nn.relu(hidden_layer)

output_layer = tf.matmul(hidden_layer, weights['output']) + biases['output']

Define Cost

In [14]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output_layer, labels=y))

Define optimizer

In [15]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

### Step3. Initialize variables

In [16]:
init = tf.initialize_all_variables()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


### Step4. create session and run the graph

Train the model on the training data, and find predictions on test data

In [19]:
def dense_to_one_hot(labels_dense, num_classes=2):
    """Convert class labels from scalars to one-hot vectors"""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    
    return labels_one_hot
def preproc(unclean_batch_x):
    """Convert values to range 0-1"""
    temp_batch = unclean_batch_x / unclean_batch_x.max()
    
    return temp_batch

def batch_creator(batch_size, dataset_length, dataset_name):
    """Create batch with random samples and return appropriate format"""
    batch_mask = rng.choice(dataset_length, batch_size)
    
    batch_x = eval(dataset_name + '_x')[[batch_mask]]#.reshape(-1, input_num_units)
    batch_x = preproc(batch_x)
    
    if dataset_name == 'train':
        batch_y = eval(dataset_name+"_y")[[batch_mask]]
        batch_y = dense_to_one_hot(batch_y)
        
    return batch_x, batch_y

In [20]:
with tf.Session() as sess:
    # create initialized variables
    sess.run(init)
    
    ### for each epoch, do:
    ###   for each batch, do:
    ###     create pre-processed batch
    ###     run optimizer by feeding batch
    ###     find cost and reiterate to minimize
    
    for epoch in range(epochs):
        avg_cost = 0
        total_batch = int(train_x.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x, batch_y = batch_creator(batch_size, train_x.shape[0], 'train')
            _, c = sess.run([optimizer, cost], feed_dict = {x: batch_x, y: batch_y})
            
            avg_cost += c / total_batch
            
        print("Epoch:", (epoch+1), "cost =", "{:.5f}".format(avg_cost))
    
    print("\nTraining complete!")
    
    # find predictions on val set
    pred_temp = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(pred_temp, "float"))
    print("Validation Accuracy:", accuracy.eval({x: val_x.reshape(-1, input_num_units), y: dense_to_one_hot(val_y)}))
    #auc = tf.contrib.metrics.streaming_auc(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    #print(auc)
    #print(auc.eval())

    predict = tf.argmax(output_layer, 1)
    pred = predict.eval({x: val_x.reshape(-1, input_num_units)})

Epoch: 1 cost = 5.12686
Epoch: 2 cost = 1.71572
Epoch: 3 cost = 1.56413

Training complete!
Validation Accuracy: 0.841333
