In [None]:
import string
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

import keras
import keras_metrics
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras.optimizers import *


from keras import Sequential
from keras import regularizers
from keras.utils import np_utils
from keras.utils import Sequence
from keras.initializers import Constant
from keras.utils.vis_utils import plot_model

#### Loading of trained Doc2vec model

In [None]:
# Restore the previously trained Doc2Vec document-embedding model from disk.
model = Doc2Vec.load(
    '/content/drive/MyDrive/Thesis - Dataset and Transformations/doc2vec/Doc2Vec_500d_model/Doc2vec_500D.model'
)

#### Reading the dataset - Augmented from the actual dataset (Without summarization)

In [None]:
# Load the document table that feeds the classifier, then preview its last five rows.
lsa_svd = pd.read_csv(
    '/content/drive/MyDrive/Thesis - Dataset and Transformations/doc2vec/document_doc2vec_cnn_classification.csv'
)
lsa_svd.tail()

#### Inferring the vectors for the documents from the trained document embedding model

In [None]:
def infer_vector(df, doc2vec_model=None, epochs=40, alpha=0.025, verbose=True):
    '''
    Infer one embedding vector per document in ``df`` and collect the matching labels.

    Every value of the 'Summarized_content' column is stripped of ASCII punctuation,
    split on whitespace, and passed to ``doc2vec_model.infer_vector``. The values of
    the 'Labels' column are returned alongside, in the same row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Summarized_content' and a column 'Labels'.
    doc2vec_model : object, optional
        Any object exposing ``infer_vector(tokens, epochs=..., alpha=...)``.
        When omitted, the notebook-level ``model`` is used, so existing calls of the
        form ``infer_vector(df)`` behave as before.
    epochs : int, optional
        Forwarded to ``doc2vec_model.infer_vector`` (default 40).
    alpha : float, optional
        Forwarded to ``doc2vec_model.infer_vector`` (default 0.025).
    verbose : bool, optional
        When True (default) print one progress line per row.

    Returns
    -------
    (list, list)
        The inferred vectors and the labels, both ordered like the rows of ``df``.
    '''
    if doc2vec_model is None:
        # Backward-compatible fallback to the model loaded at notebook level.
        doc2vec_model = model

    # Translation table that deletes every ASCII punctuation character.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # infer_vector() requires its doc_words argument to be a list of tokens - matching the same kind
    # of tokenization that was used in training the model.
    inferred_vector = []
    for idx, text in df['Summarized_content'].items():
        if verbose:
            print('Inferring vectors for', idx)
        tokens = text.translate(punctuation_table).split()
        inferred_vector.append(doc2vec_model.infer_vector(tokens, epochs=epochs, alpha=alpha))

    # Reading the column directly avoids building a full row object for every document.
    vector_label = df['Labels'].tolist()
    return inferred_vector, vector_label

In [None]:
def plot_graph(hist_of_A, hist_of_B, title, xlabel, ylabel):
    '''
    Plot two recorded training curves on a single small figure.

    The curves are looked up by key in the notebook-level ``history.history``
    mapping: ``hist_of_A`` is drawn first and ``hist_of_B`` second, and the legend
    names them 'train' and 'valid' respectively.

    Parameters
    ----------
    hist_of_A, hist_of_B : str
        Keys into ``history.history`` for the two curves.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    '''
    plt.figure(figsize=(5, 4), dpi=350)

    for curve_key in (hist_of_A, hist_of_B):
        plt.plot(history.history[curve_key])

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # The x-limits are set before the ticks, in the same order as before,
    # because placing ticks can influence the visible range.
    plt.xlim([0, 15])
    plt.xticks(np.arange(0, 25, 5))

    plt.legend(['train', 'valid'], loc='upper right')
    plt.show()

In [None]:
def heatconmat(y_true,y_pred):
    '''
    Draw the confusion matrix of ``y_true`` against ``y_pred`` as an annotated heatmap.

    Rows correspond to true classes and columns to predicted classes. The class
    list is the sorted union of the labels found in either input and is passed
    explicitly to ``confusion_matrix``, so both tick-label axes always line up
    with the matrix even when a class occurs only in the predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels (pandas Series, NumPy array or list).
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    '''
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()

    # Sorted union of all classes seen in either the truth or the predictions.
    class_labels = np.unique(np.concatenate([true_arr, pred_arr]))

    sns.set_context('talk')
    plt.figure(figsize=(15,12))
    sns.heatmap(confusion_matrix(true_arr, pred_arr, labels=class_labels),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

In [None]:
# Embed every document and collect its label.
inferred_vector, vector_label = infer_vector(lsa_svd)

# Labels as an array; embeddings as (n_documents, vector_size, 1) so that each
# vector becomes a one-channel sequence for the Conv1D layers.
targets = np.array(vector_label)
inputs = np.array(inferred_vector)
inputs = inputs.reshape(inputs.shape[0], inputs.shape[1], 1)

# Distinct labels in order of first appearance.
True_labels = list(dict.fromkeys(vector_label))

#### Train-test split

In [None]:
# Hold out 20% of the samples as the final test set; the fixed seed keeps the split repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
)

#### Defining all the types of callbacks to be included

In [None]:
# File that receives the best weights seen during training.
# NOTE(review): this is a machine-specific absolute path, and the raw string keeps
# the doubled backslashes literally - consider a configurable, relative location.
filepath=r"C:\\Users\\Shrikanth Singh\\Desktop\\Thesis-Note-to-Py\\bestofbest.hdf5"

# Multiply the learning rate by 0.95 when validation accuracy stops improving.
call_reduce = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.95,
    patience=1,
    verbose=2,
    mode='auto',
    min_delta=0.05,
    cooldown=0,
    min_lr=0,
)

# Save the weights only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# Stop training after 5 epochs without improvement in validation accuracy.
EarlyStopping_call = EarlyStopping(
    monitor='val_acc',
    patience=5,
    mode='auto',
)

callbacks_list = [call_reduce, checkpoint, EarlyStopping_call]

#### Model definition, K-fold cross validation, Model fitting and evaluation

In [None]:
# Per-fold evaluation results, filled in by the loop below.
acc_per_fold = []
loss_per_fold = []

num_folds = 5
# Define the K-fold Cross Validator. The fixed seed makes the fold membership
# reproducible across runs.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(X_train_val, y_train_val):

    # Define the model architecture: two Conv1D/MaxPooling/Dropout stages followed
    # by a dense head. A new model is built for every fold so no weights carry over.
    model_cnn = Sequential()

    # The input shape is taken from the data instead of being hard-coded.
    model_cnn.add(Conv1D(256, 3, activation='relu', padding='same', strides=1,
                         kernel_regularizer=regularizers.l2(5e-4), input_shape=X_train_val.shape[1:],
                         use_bias=True, bias_initializer='TruncatedNormal', bias_regularizer=regularizers.l2(5e-4)))
    model_cnn.add(MaxPooling1D(2))
    model_cnn.add(Dropout(0.5))

    model_cnn.add(Conv1D(256, 3, activation='relu', padding='same',strides=1,
                         kernel_regularizer=regularizers.l2(5e-4),
                         use_bias=True, bias_initializer='TruncatedNormal', bias_regularizer=regularizers.l2(5e-4)))
    model_cnn.add(MaxPooling1D(2))
    model_cnn.add(Dropout(0.5))

    model_cnn.add(Flatten())

    # NOTE(review): this Dense layer has no activation (i.e. it is linear) - confirm this is intended.
    model_cnn.add(Dense(200))
    model_cnn.add(Dropout(0.5))

    # One softmax output unit per distinct label.
    model_cnn.add(Dense(len(True_labels), activation='softmax'))

    # Compile the model
    model_cnn.compile(loss='sparse_categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # ModelCheckpoint keeps its best score between fit() calls. With one shared
    # instance, a later fold that never beats an earlier fold's validation accuracy
    # would not overwrite the file, and load_weights() below would then restore
    # weights trained on another fold's data. A fresh checkpoint per fold avoids that.
    fold_checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                                      save_best_only=True, mode='max')
    fold_callbacks = [call_reduce, fold_checkpoint, EarlyStopping_call]

    # Fit data to model; the validation_split portion of this fold's training data
    # is held out by Keras and drives the callbacks.
    history = model_cnn.fit(X_train_val[train], y_train_val[train],
              batch_size=16,
              epochs=2,
              verbose=1,
              validation_split=0.2,
              callbacks=fold_callbacks)

    plot_graph(hist_of_A='acc', hist_of_B='val_acc', title='model accuracy', xlabel='epoch', ylabel='accuracy')
    plot_graph(hist_of_A='loss', hist_of_B='val_loss', title='model loss', xlabel='epoch', ylabel='loss')

    # Generate generalization metrics from the best weights saved during this fold,
    # evaluated on the fold's held-out part.
    model_cnn.load_weights(filepath)
    scores = model_cnn.evaluate(X_train_val[test], y_train_val[test], verbose=0)
    print(f'Score for fold {fold_no}: {model_cnn.metrics_names[0]} of {scores[0]}; {model_cnn.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

In [None]:
# Report the loss and accuracy of each fold, followed by the averages over all folds.
rule = '------------------------------------------------------------------------'

print(rule)
print('Score per fold')
for i, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold)):
    print(rule)
    print(f'> Fold {i+1} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print(rule)

print('Average scores for all folds:')
mean_acc = np.mean(acc_per_fold)
std_acc = np.std(acc_per_fold)
mean_loss = np.mean(loss_per_fold)
print(f'> Accuracy: {mean_acc} (+- {std_acc})')
print(f'> Loss: {mean_loss}')
print(rule)

#### Classification performance on test data

In [None]:
# Predict a class for every test sample. `Sequential.predict_classes` is not
# available in recent Keras/TensorFlow releases; for a softmax output the arg-max
# over the predicted class probabilities gives the same class indices.
predicted_labels = np.argmax(model_cnn.predict(X_test, verbose=0), axis=-1)
print(classification_report(y_test,predicted_labels))

In [None]:
# Confusion-matrix heatmap of the test-set predictions (true labels passed as a pandas Series).
heatconmat(y_true=pd.Series(y_test), y_pred=predicted_labels)