# Text Classification (Newsroom20)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import os
import pandas as pd
import numpy as np
import re
import string
from nltk import ne_chunk, word_tokenize, pos_tag, Tree
from tqdm.notebook import tqdm

In [None]:
import nltk 
nltk.download('popular')

In [4]:
prefix = '/content/drive/My Drive/documents/' 

# Creating a preprocessing function

In [5]:
def preprocessing(path):
    #global variables
    email=[]
    label = []
    doc_num = []
    subject = []
    text = []
    
    #getting class name
    for filename in os.listdir(path):
        i,j = filename.split('_')
        j = int(j.split('.')[0])
        label.append(i)
        doc_num.append(j)
        
    #for each document in the corpus
    for filename in tqdm(os.listdir(path)):
        
        file = open(path+filename,'r+')
        f =file.read()
        
        em = [] # for each doc
        st = ""  # for each doc
        
        #extracting email
        for i in re.findall(r'[\w\-\.]+@[\w\.-]+\b', f):  #for every email in the doc
            temp=[]
            temp = i.split('@')[1]
            temp = temp.split('.')
            if 'com' in temp:
                temp.remove('com')
            for i in temp:
                if len(i)>2:
                    em.append(i)

        for i in em:
            st+=i
            st+=' '
        
        #extracting subject
        temp1 = re.findall(r'^Subject.*$',f, re.MULTILINE)
        sub = temp1[0]
        sub = sub[7:]
        
        for i in string.punctuation:
            sub = sub.replace(i," ")
        sub = re.sub(r"re","",sub, flags=re.IGNORECASE)
        sub = sub.lower()
        
        #appending the extracted data to a list
        email.append(st)
        subject.append(sub)
        
        
        f = re.sub(r'[\w\-\.]+@[\w\.-]+\b',' ',f)                      #replace email with space
        f = re.sub(r'Subject.*$'," ",f, flags=re.MULTILINE)            #replace subject with space
        f = re.sub(r"Write to:.*$","",f, flags=re.MULTILINE)           #replace write to with nothing 
        f = re.sub(r"From:.*$","",f, flags=re.MULTILINE)               #replace from with nothing 
        f = re.sub(r"or:","",f,flags=re.MULTILINE)
        f = re.sub(r"<.*>","",f, flags=re.MULTILINE)                   #delete <anyword>    
        f = re.sub(r"\(.*\)","",f,flags=re.MULTILINE)                  #delete (contents)
        f = re.sub(r".*:","",f, flags=re.MULTILINE)                    #delete Anyword:
        f = re.sub(r"[\n\t\-\\\/]"," ",f, flags=re.MULTILINE)          #delete /,-,/n,/t
        
        # decontraction
        
        # specific
        f = re.sub(r"won't", "will not", f)
        f = re.sub(r"can\'t", "can not", f)
        # general
        f = re.sub(r"n\'t", " not", f)
        f = re.sub(r"\'re", " are", f)
        f = re.sub(r"\'s", " is", f)
        f = re.sub(r"\'d", " would", f)
        f = re.sub(r"\'ll", " will", f)
        f = re.sub(r"\'t", " not", f)
        f = re.sub(r"\'ve", " have", f)
        f = re.sub(r"\'m", " am", f)

        
        #chunking
        chunks=[]
        chunks=(list(ne_chunk(pos_tag(word_tokenize(f)))))

        for i in chunks:
            if type(i)==Tree:

                if i.label() == "GPE":
                    j = i.leaves()

                    if len(j)>1:                                 #if new york or bigger name 
                        gpe = "_".join([term for term,pos in j])
                        #print(gpe)
                        f = re.sub(rf'{j[1][0]}',gpe,f, flags=re.MULTILINE)  #replacing york with new_york
                        f = re.sub(rf'\b{j[0][0]}\b',"",f, flags=re.MULTILINE) #deleting new,  \b is important

                if i.label()=="PERSON":                                         #removing person
                    for term,pog in i.leaves():
                        f = re.sub(re.escape(term),"",f, flags=re.MULTILINE)

                        
        f = re.sub(r'\d',"",f, flags=re.MULTILINE)                               #remove digits
        f = re.sub(r"\b_([a-zA-z]+)_\b",r"\1",f)                                 #replace _word_ to word
        f = re.sub(r"\b_([a-zA-z]+)\b",r"\1",f)                                  #replace_word to word
        f = re.sub(r"\b([a-zA-z]+)_\b",r"\1",f)                                  #replace word_ to word
        f = re.sub(r"\b[a-zA-Z]{1}_([a-zA-Z]+)",r"\1",f)                         #d_berlin to berlin
        f = re.sub(r"\b[a-zA-Z]{2}_([a-zA-Z]+)",r"\1",f)                         #mr_cat to cat
        f = f.lower()                                                            #lower case
        f = re.sub(r'\b\w{1,2}\b'," ",f)                                         #remove words <2
        f = re.sub(r"\b\w{15,}\b"," ",f)                                         #remove words >15
        f = re.sub(r"[^a-zA-Z_]"," ",f)                                          #keep only alphabets and _
        
        f = re.sub(r" {2,}", " ", f, flags=re.MULTILINE)                         # REMOVE THE EXTRA SPACES

        text.append(f)
        
        file.close()
        
    

    return doc_num, label, email, subject, text
        
        

#### Note: to verify preprocessing function, put break

# Data 

In [None]:
doc_num, label, email, subject, text = preprocessing(prefix)

In [9]:
DATA.head()

Unnamed: 0.1,Unnamed: 0,doc_num,class,Pre_Email,Pre_Subject,Pre_text,preprocessed_text
0,0,49960,12,mantis netcom mantis,alt atheism faq atheist sources,atheism resources resources december atheist ...,mantis netcom mantis alt atheism faq atheis...
1,1,51060,12,mantis mantis mantis,alt atheism faq introduction to atheism,atheism introduction introduction pril egin p...,mantis mantis mantis alt atheism faq introd...
2,2,51119,12,dbstu1 tu-bs mimsy umd edu umd edu,gospel dating,article well has quite different not necessar...,dbstu1 tu-bs mimsy umd edu umd edu gospel ...
3,3,51120,12,mantis kepler unh edu,university violating separation of church ...,recently ras have been ordered and none have ...,mantis kepler unh edu university violating...
4,4,51121,12,Watson Ibm Com harder ccr-p ida org harder ccr...,soc motss et al princeton axes matchi...,however hate economic terrorism and political...,Watson Ibm Com harder ccr-p ida org harder ccr...


In [10]:
X = DATA['preprocessed_text']
Y = DATA['class']

### Splitting data into train, test

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, stratify=Y)

# Vectorizing data

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
t = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')  #removing underscore from filters

In [14]:
t.fit_on_texts(x_train)

In [15]:
vocab_size = len(t.word_index) + 1   # to give 0 index for unknown words.

#### Train Data 

In [16]:
encoded_text_train = t.texts_to_sequences(x_train)

#### Test Data 

In [17]:
encoded_text_test = t.texts_to_sequences(x_test)

## Getting the maximum length for padding the documents 

**texts_to_sequences() returns nothing for new words came across in test data, padding solves the problem by adding zero to them.**

In [18]:
maxx=0
for i in encoded_text_train:
    if len(i)>maxx:
        maxx = len(i)
maxx                              #this is now the length of each document in our corpus

8673

#### Train Data 

In [19]:
padded_text_train = pad_sequences(encoded_text_train, maxlen=maxx, padding='post')
print(padded_text_train)

[[  950    10  8427 ...     0     0     0]
 [  278     1   724 ...     0     0     0]
 [19486 11082 44004 ...     0     0     0]
 ...
 [ 4427  9236 72622 ...     0     0     0]
 [ 1778  1776    10 ...     0     0     0]
 [72623  1794    10 ...     0     0     0]]


#### Test Data 

In [20]:
padded_text_test = pad_sequences(encoded_text_test, maxlen=maxx, padding='post')
padded_text_test

array([[32204,  7847,   194, ...,     0,     0,     0],
       [ 2766,  1947,    10, ...,     0,     0,     0],
       [ 1690,   484,     5, ...,     0,     0,     0],
       ...,
       [51796, 28352,    10, ...,     0,     0,     0],
       [ 6325,  5319,    10, ...,     0,     0,     0],
       [ 3002,  2804,  2250, ...,     0,     0,     0]], dtype=int32)

## Converting class to one hot

In [21]:
from tensorflow.keras.utils import to_categorical

#### Train Data

In [22]:
train_class = to_categorical(y_train)
train_class

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### Test Data

In [23]:
test_class = to_categorical(y_test)
test_class

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Loading The Glove Weights 

In [24]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/?unapproved=520126&moderation-hash=5c11e5048633b93910ec2fcc019fce3b#comment-520126

# load the whole embedding into memory
embeddings_index = dict()
glov = open('/content/drive/glove.6B/glove.6B.300d.txt', encoding='utf8')
for line in glov:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
glov.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [25]:
# create a weight matrix for words in training docs

embedding_matrix_train = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector_train = embeddings_index.get(word)
    if embedding_vector_train is not None:
        embedding_matrix_train[i] = embedding_vector_train

In [26]:
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, concatenate, Dense, Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
import tensorflow as tf

In [28]:
#input is a text document of size max (padded document)
i_layer= Input(shape=(maxx,)) 

#embedding layer
embedd = Embedding(vocab_size, 300, weights=[embedding_matrix_train], input_length=maxx, trainable=False)(i_layer)

#convolution 1D layer
conv1 = Conv1D(16, kernel_size=9, activation='relu')(embedd)
conv2 = Conv1D(16, kernel_size=6, activation='relu')(embedd)
conv3 = Conv1D(16, kernel_size=3, activation='relu')(embedd)

#concatenate the convolution layers
concate1 = concatenate([conv1, conv2, conv3], axis=1)   #since axis 1 will have different values

#maxpool 1d layer
maxpool1 = MaxPooling1D(pool_size=4)(concate1)

#convlution layer 1D
conv4 = Conv1D(8, kernel_size=8, activation='relu')(maxpool1)
conv5 = Conv1D(8, kernel_size=4, activation='relu')(maxpool1)
conv6 = Conv1D(8, kernel_size=2, activation='relu')(maxpool1)

#concatenate the convolution layers
concate2 = concatenate([conv4,conv5, conv6], axis=1)

#maxpool 1d layer
maxpool2 = MaxPooling1D(pool_size=2)(concate2)

#convolution 1D layer
conv7 =  Conv1D(8, kernel_size=5, activation='relu')(maxpool2)

#flatten
flat = Flatten()(conv7)

#dropout
drop = Dropout(0.5)(flat)

#dense
dense = Dense(10, activation='relu')(drop)

#output layer
o_layer = Dense(20, activation='softmax')(dense)

#define the mode
model = Model(inputs=i_layer, outputs=o_layer)

#compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#summarize the model
print(model.summary())


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 8673)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 8673, 300)    21787200    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 8665, 16)     43216       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 8668, 16)     28816       embedding_1[0][0]                
____________________________________________________________________________________________

# Callbacks

In [33]:
import datetime 
#tensorboard callback
log_dir="/content/drive/My Drive/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") #dirctory according to time
tfboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1,write_graph=True)

#early stopping callback
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=5)

#modelcheckpoint callback to save the best model 
filepath = '/content/drive/My Drive/weights.{epoch:02d}-{val_loss:.2f}.hdf5'    #saves in the name of epoch and val_loss
chk_callback = tf.keras.callbacks.ModelCheckpoint(filepath,monitor='val_loss', mode='min',
                                                 save_best_only=True, save_weights_only=True)

In [None]:
model.fit(padded_text_train,train_class, epochs=30, validation_data=(padded_text_test,test_class), 
         callbacks=[tfboard_callback, es_callback, chk_callback])

## Tensorboard M1 

### Observation:
1. **Minimum val-loss at epoch 4, so the EarlyStopping stopped the training after no improvements in 5 epochs after it.**
2. **The weights are stored as "weight.04-0.95.hdf5" (epoch-4 has minimum val_loss-0.95 and accuracy at 70%)**
3. **Red line- Validation**


<img src="https://imgur.com/QhXj351.png">

<img src="https://imgur.com/889JQLm.png">