# Neural Networks for Text Classification

In [None]:
import datetime
import glob
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import LSTM, Conv1D, Dense, Embedding, Flatten, MaxPooling1D
from keras.models import Sequential
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
# import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


## Loading and Preparing Text Data for Analysis

In [None]:
# specify working directory
os.chdir("../bulgarian-constitutional-court-decisions/")

# load all json files in data directory
# glob returns paths in arbitrary, OS-dependent order. Sorting makes the row
# order of `df` reproducible, which matters because the train/test split
# further down is taken positionally (shuffle=False).
files = sorted(glob.glob("data/json/*.json"))
if not files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No JSON files found under {os.path.join(os.getcwd(), 'data', 'json')}"
    )

data = []

# read every file into its own frame and tag its rows with the document id
for json_path in files:
    frame = pd.read_json(json_path)
    # doc_id is the file name without directory and without the .json extension
    frame["doc_id"] = os.path.splitext(os.path.basename(json_path))[0]
    data.append(frame)

# concatenate all data frames into one, with a fresh 0..n-1 index
df = pd.concat(data, ignore_index=True)

In [None]:
# Collapse label_id into a binary target: the POLITICAL class (original id 4)
# becomes 1 and every other value becomes 0.
# The mask is computed once, before any value is overwritten.
is_political = df["label_id"] == 4
df.loc[~is_political, "label_id"] = 0
df.loc[is_political, "label_id"] = 1


In [None]:
# function for preprocessing text data

# Patterns are compiled once here instead of being looked up on every call.
_TAG_PATTERN = re.compile(r"<[^>]*>")
_NON_WORD_PATTERN = re.compile(r"[^\w\s]")
# Filled on first use so the NLTK corpus is read once, not once per document.
_STOP_WORDS_CACHE = {}


def _default_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocessing(text, *, stop_words=None):
    """Clean one document for tokenisation.

    Steps, in order:
      1. remove anything that looks like an HTML/XML tag (``<...>``);
      2. remove every character that is neither a word character nor
         whitespace (i.e. punctuation; underscores are kept);
      3. lower-case, split on whitespace and drop stop words;
      4. re-join the remaining words with single spaces.

    Args:
        text: The raw document text (a ``str``).
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The cleaned text as a single space-separated string.

    NOTE(review): the default stop-word list is English while the data
    directory is named "bulgarian-constitutional-court-decisions" -- confirm
    that the texts are in English.
    """
    if stop_words is None:
        stop_words = _default_stop_words()

    text = _TAG_PATTERN.sub("", text)
    text = _NON_WORD_PATTERN.sub("", text)

    return " ".join(
        word for word in text.lower().split() if word not in stop_words
    )

In [None]:
# Clean every document in place: each entry of the "text" column is replaced
# by the result of preprocessing() on that entry.
df["text"] = df["text"].map(preprocessing)


## Building Word Vector Embeddings

In [None]:
# vector parameters
# number of most frequent words kept by the tokenizer (also the Embedding input size)
vocab_size = 3500
# width of each learned word vector; the Embedding layers of both models read
# this name, so it has to be defined here.
# NOTE(review): no value was present in the notebook -- 16 is a small starting
# point; confirm or tune.
embedding_dim = 16
# every document is padded / truncated to exactly this many tokens
max_length = 1000
# cut over-long documents at the end and pad short ones at the end
trunc_type = 'post'
padding_type = 'post'
# placeholder token for words outside the tokenizer vocabulary
oov_tok = "<OOV>"

In [None]:
# Features are the cleaned document texts; the target is the binary label.
X = df["text"]
y = df["label_id"].values

# Hold out 30% of the documents to measure predictive accuracy.
# shuffle=False means the rows are not shuffled before splitting, so the split
# follows the row order of df.
# NOTE(review): random_state presumably has no effect without shuffling -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=0,
)

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# fitting tokenizer only to training set, so the vocabulary is learned from
# the training documents alone
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index


def _encode(texts):
    """Return (token-id sequences, padded matrix) for `texts` using the fitted tokenizer."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


# creating training sequences and padding them
train_seq, train_pad = _encode(X_train)

# creating testing sequences and padding them using same tokenizer
test_seq, test_pad = _encode(X_test)

# converting all variables to numpy arrays
# The raw sequences have one list per document and the lists differ in length.
# NumPy >= 1.24 raises ValueError when asked to build an array from such ragged
# input unless dtype=object is given explicitly.
train_seq = np.array(train_seq, dtype=object)
test_seq = np.array(test_seq, dtype=object)
# np.asarray leaves inputs that are already ndarrays untouched.
train_pad = np.asarray(train_pad)
test_pad = np.asarray(test_pad)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

## Fitting Convolutional Neural Network

Convolutional neural networks (CNNs) are designed for processing large arrays of structured data. They are most popular for computer vision tasks, but they can be effectively applied to natural language processing in certain cases, particularly text classification.

A CNN is a multi-layered "feed-forward" neural network. In a feed-forward network, the connections between nodes do not form a cycle; a recurrent neural network, by contrast, is an example of a network that does contain cycles.

The following diagram of a convolutional neural network illustrates how the process works:

![CNN](cnn.png)

In [None]:
# load the tensorboard notebook extension (the %tensorboard magic is used
# later in the notebook)
%load_ext tensorboard
# clear any logs from previous tensorboard runs
# (shell command: assumes an environment where `rm` is available; the path is
# relative to the current working directory)
!rm -rf ./logs/

In [None]:
model = Sequential([
    # embedding layer: the network learns one embedding_dim-wide vector per
    # vocabulary entry; inputs are max_length token ids per document
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # convolutional layer: 64 filters, each spanning 5 consecutive tokens
    Conv1D(64, 5, activation='relu'),
    # pooling layer: keeps the maximum over each window of 5 positions
    MaxPooling1D(5),
    # flattens the pooled feature maps into a single vector per document
    Flatten(),
    # dense layer: each neuron receives input from all neurons of the previous layer
    Dense(units=64, activation='relu'),
    # single sigmoid unit: output in (0, 1) for the binary label
    Dense(units=1, activation='sigmoid'),
])

# One log sub-directory per run. The name is a single path component (no "/",
# ":" or spaces), includes year and seconds so runs cannot collide, and is
# prefixed with the model type so runs are easy to tell apart in TensorBoard.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("cnn-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

In [None]:
# Train for 10 epochs on the padded training documents. The held-out split is
# passed as validation data and the TensorBoard callback records the run.
model.fit(
    x=train_pad,
    y=y_train,
    validation_data=(test_pad, y_test),
    epochs=10,
    callbacks=[tensorboard_callback],
)


In [None]:
# Evaluate on the held-out documents. The model was compiled with
# metrics=['accuracy'], so index 1 of the result is read as the accuracy.
scores = model.evaluate(x=test_pad, y=y_test, verbose=1)
test_accuracy = scores[1]
print("Accuracy:", test_accuracy)

## Fitting Long Short-Term Memory Network

Long Short-Term Memory (LSTM) networks are a type of recurrent neural network that has become popular in natural language processing. Their popularity is owed to the specifics of their architecture: LSTMs are designed to work on sequence data, so they process text in the order in which it appears (unlike order-agnostic methods such as Bag of Words).

In [None]:
model = Sequential([
    # embedding layer: the network learns one embedding_dim-wide vector per
    # vocabulary entry; inputs are max_length token ids per document
    # NOTE(review): sequences are padded at the end (padding_type = 'post') and
    # this layer does not set mask_zero, so the LSTM presumably also steps over
    # the padding positions -- consider mask_zero=True and confirm the effect.
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # long short-term memory layer with 64 units
    LSTM(64),
    # single sigmoid unit: output in (0, 1) for the binary label
    Dense(units=1, activation='sigmoid'),
])

# One log sub-directory per run. The name is a single path component (no "/",
# ":" or spaces), includes year and seconds so runs cannot collide, and is
# prefixed with the model type so runs are easy to tell apart in TensorBoard.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("lstm-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

In [None]:
# Train whichever network is currently bound to `model` for 10 epochs on the
# padded training documents. The held-out split is passed as validation data
# and the TensorBoard callback records the run.
model.fit(
    x=train_pad,
    y=y_train,
    validation_data=(test_pad, y_test),
    epochs=10,
    callbacks=[tensorboard_callback],
)

In [None]:
# Evaluate the current `model` on the held-out documents. It was compiled with
# metrics=['accuracy'], so index 1 of the result is read as the accuracy.
scores = model.evaluate(x=test_pad, y=y_test, verbose=1)
test_accuracy = scores[1]
print("Accuracy:", test_accuracy)

In [None]:
# Write a diagram of the current `model` to model.png in the working directory.
plot_model(model=model, to_file="model.png")

## Tensorboard

In [None]:
# Open TensorBoard inside the notebook, reading the runs written under
# logs/fit by the training cells, and serve it on port 6060.
%tensorboard --logdir logs/fit --port 6060