In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.utils import to_categorical
from keras import optimizers
import re
from stop_words import get_stop_words
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report,precision_score,recall_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
import calendar
import datetime

In [None]:
# Read the three pre-split CSV files (train / test / validation, per the file names).
data1 = pd.read_csv("GL Datasets/upsample_train_data/Train_Up_data.csv")
data2 = pd.read_csv("GL Datasets/Simple data/Test_data_GL.csv")
data3 = pd.read_csv("GL Datasets/Simple data/Val_data_GL.csv")

# Every frame supplies a `Text` column (model input) and a `Label` column (target).
X_train, y_train = data1["Text"], data1["Label"]
X_test, y_test = data2["Text"], data2["Label"]
X_val, y_val = data3["Text"], data3["Label"]

# Quick sanity check on the number of rows in each split.
print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
def Preprocessing(x):
    """Normalise raw text documents ahead of tokenisation.

    Each document goes through the following steps, in order:

    1. conversion to ``str`` and lower-casing;
    2. removal of common punctuation, and replacement of ``<br /><br />``
       pairs, hyphens and slashes with a single space;
    3. verb lemmatisation of every whitespace-separated token
       (NLTK ``WordNetLemmatizer`` with part of speech ``'v'``);
    4. replacement of every character outside ``A-Za-z`` with a space, then
       expansion of the abbreviations ``prop`` -> ``property`` and
       ``cvg`` -> ``coverage``;
    5. removal of English stop words (``stop_words`` package) together with
       lower-cased month names and month abbreviations.

    Parameters
    ----------
    x : iterable
        Documents to clean. Items that are not strings (for example the NaN
        pandas produces for an empty CSV cell) are converted with ``str()``
        instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order, with the surviving
        tokens joined by single spaces.
    """
    # Imported here because the notebook's import cell does not provide it.
    from nltk.stem import WordNetLemmatizer

    # Month names and their abbreviations are dropped alongside the regular
    # stop words.
    month_tokens = set()
    for month_val in range(1, 13):
        month_tokens.add(calendar.month_name[month_val].lower())
        month_tokens.add(calendar.month_abbr[month_val].lower())
    # A set keeps the membership test in the filtering step O(1) per token.
    stop_words = set(get_stop_words('en')) | month_tokens

    # Raw strings avoid "invalid escape sequence" warnings; the patterns are
    # unchanged.
    REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
    REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for item in x:
        # str() has to come before lower(): a non-string item has no .lower().
        text = str(item).lower()
        text = REPLACE_NO_SPACE.sub("", text)
        text = REPLACE_WITH_SPACE.sub(" ", text)
        # Lemmatise before stripping non-letters so tokens are seen exactly as
        # they appear after punctuation removal.
        text = ' '.join(lemmatizer.lemmatize(word, 'v') for word in text.split())
        # Digits are covered by this class as well, so no separate pass is needed.
        text = re.sub("[^A-Za-z]", " ", text)
        text = re.sub(r"\bprop\b", "property", text)
        text = re.sub(r"\bcvg\b", "coverage", text)
        # split()/join() also collapses any run of whitespace left behind above.
        cleaned.append(' '.join(word for word in text.split() if word not in stop_words))
    return cleaned

In [None]:
def _clean_split(raw_texts):
    """Run Preprocessing over one split and return the result as a Series named 'Text'."""
    cleaned_frame = pd.DataFrame(Preprocessing(raw_texts), columns=['Text'])
    return cleaned_frame['Text']


# Replace each split's raw text with its cleaned counterpart.
X_train = _clean_split(X_train)
X_val = _clean_split(X_val)
X_test = _clean_split(X_test)

# Label counts per split, for comparison with the text shapes printed earlier.
print(len(y_train), len(y_val), len(y_test))

In [None]:
# Vocabulary cap handed to the tokenizer via num_words.
max_fatures = 10000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# The vocabulary is fitted on the training texts only; validation and test
# texts are merely transformed with it below.
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer sequences -> arrays padded/truncated to 150.
X_train1 = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train1, maxlen=150)
print('x_train shape:', X_train.shape)
y_train = np.array(y_train)

# Validation split, same transformation.
X_val1 = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val1, maxlen=150)
print('X_val shape:', X_val.shape)
y_val = np.array(y_val)

# Test split, same transformation.
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test.values), maxlen=150)
y_test = np.array(y_test)
print("x_test shape", X_test.shape)

In [None]:
# Seed the random number generators, then build and compile the BiLSTM classifier.
import tensorflow as tf
import random as rn
import os
# NOTE: this re-binds `tf` to the TF1 compatibility namespace, shadowing the
# plain `import tensorflow as tf` above; `tf.set_random_seed` below is
# therefore looked up on `tensorflow.compat.v1`.
import tensorflow.compat.v1 as tf

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here probably does not change hashing in this kernel - confirm
# whether it should be exported before the kernel is launched instead.
os.environ['PYTHONHASHSEED'] = '0'

# Setting the seed for numpy-generated random numbers
np.random.seed(37)

# Setting the seed for python random numbers
rn.seed(1254)

# Setting the graph-level random seed.
tf.set_random_seed(89)

# Hyper-parameters of the network defined below.
embed_dim = 128   # size of each embedding vector
lstm_out = 192    # units in the first (sequence-returning) LSTM
d_r=0.4           # dropout rate used after the embedding and after the first BiLSTM
model = Sequential()
# Embedding table sized by max_fatures (the tokenizer's num_words); input
# length is taken from the padded training matrix.
model.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
model.add(Dropout(d_r))
# First BiLSTM returns the full sequence so it can feed the second BiLSTM.
model.add(Bidirectional(LSTM(lstm_out,return_sequences=True)))
model.add(Dropout(d_r))
model.add(Bidirectional(LSTM(124)))
# 11 softmax outputs - presumably one per label class; confirm against the data.
model.add(Dense(11,activation='softmax'))
# NOTE(review): the sparse categorical loss presumably expects y as integer
# class ids rather than one-hot vectors - confirm the Label encoding.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer=optimizers.Adam(lr=0.001),metrics = ['acc'])

In [None]:
batch_size = 64

# Stop training once validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# compute_class_weight returns one weight per entry of `classes` as an array.
# Keras expects `class_weight` as a {class_id: weight} mapping, so the array is
# zipped with the class ids instead of being passed as-is. Keyword arguments
# are used because newer scikit-learn releases reject the positional form.
train_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=train_classes, y=y_train
)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(train_classes, balanced_weights)
}

# validation_data is passed as an (x, y) tuple, the form Keras documents.
model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_val, y_val),
    callbacks=[es],
    class_weight=class_weights,
    verbose=1,
)

In [None]:
# Predicted class = index of the largest softmax output. This is what
# `predict_classes` computed for a softmax model, without relying on a method
# that newer Keras releases no longer provide.
Y_pred = np.argmax(model.predict(X_test), axis=-1)

# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, precision and recall are reported under each other's name.
print(accuracy_score(y_test, Y_pred))
print(f1_score(y_test, Y_pred, average='macro'))
print(precision_score(y_test, Y_pred, average='macro'))
print(recall_score(y_test, Y_pred, average='macro'))

In [None]:
# Per-class recall as a one-row table. Arguments are (y_true, y_pred); in the
# swapped order this call returns per-class precision instead.
pd.DataFrame(recall_score(y_test, Y_pred, average=None)).T


In [None]:
# Per-class F1, precision and recall. Arguments are (y_true, y_pred); with the
# order swapped, the precision and recall rows would be exchanged.
print(f1_score(y_test, Y_pred, average=None))
print(precision_score(y_test, Y_pred, average=None))
print(recall_score(y_test, Y_pred, average=None))

###### Hyperparameter tuning (optional)  ##########

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
def create_model(embed_dim=128,lstm_out=192,lstm_last=124,d_r=0.4):
    """Build and compile the two-layer bidirectional LSTM classifier.

    The vocabulary size (``max_fatures``) and the input length
    (``X_train.shape[1]``) are read from module-level variables at call time.

    Parameters
    ----------
    embed_dim : int
        Size of each embedding vector.
    lstm_out : int
        Units in the first, sequence-returning LSTM.
    lstm_last : int
        Units in the second LSTM.
    d_r : float
        Dropout rate applied after the embedding and after the first BiLSTM.

    Returns
    -------
    A compiled ``Sequential`` model with an 11-way softmax output.
    """
    stack = [
        Embedding(max_fatures, embed_dim, input_length=X_train.shape[1]),
        Dropout(d_r),
        Bidirectional(LSTM(lstm_out, return_sequences=True)),
        Dropout(d_r),
        Bidirectional(LSTM(lstm_last)),
        Dense(11, activation='softmax'),
    ]
    network = Sequential(stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizers.Adam(lr=0.001),
        metrics=['acc'],
    )
    return network


# scikit-learn compatible wrapper so the builder can be used with GridSearchCV.
model1 = KerasClassifier(
    build_fn=create_model,
    epochs=30,
    batch_size=64,
    verbose=1,
)

In [None]:
# Candidate values for each create_model argument. The grid is written as a
# dict literal so the module-level scalars `embed_dim`, `lstm_out` and `d_r`
# used by the model-building cell are not rebound to lists.
param_grid = {
    'embed_dim': [32, 64, 128],
    'lstm_out': [64, 124, 196],
    'lstm_last': [64, 124, 196],
    'd_r': [0.2, 0.3, 0.4],
}

# 3 * 3 * 3 * 3 = 81 parameter combinations, each fitted on 3 folds.
grid = GridSearchCV(estimator=model1, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Headline result: best mean cross-validated score and the parameters behind it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then one line per parameter combination: mean score, its std across folds, and the parameters.
cv_table = grid_result.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    summary_line = "%f (%f) with: %r" % (mean, stdev, param)
    print(summary_line)

###### K-fold cross-validation tuning (optional) #########

In [None]:
from sklearn.model_selection import StratifiedKFold

# Pool data1 and data2 into a single frame with a fresh 0..n-1 index.
# NOTE(review): data2 is the frame read from Test_data_GL.csv - confirm that
# pooling the test file into the cross-validation data is intended.
train_data = pd.concat([data1, data2], axis=0).reset_index(drop=True)
X, Y = train_data.Text, train_data.Label

# 5 stratified folds, shuffled with a fixed seed so the split is repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Per-fold macro metrics, one entry appended per fold.
acc=[]
f1=[]
precision=[]
recall=[]
for train, test in kfold.split(X,Y):
    # kfold.split yields positional row indices, hence .iloc below.
    max_fatures = 10000
    tokenizer = Tokenizer(num_words=max_fatures, split=' ')
    # Vocabulary is fitted on this fold's training rows only.
    tokenizer.fit_on_texts(X.iloc[train])

    # NOTE(review): sequences are padded to 50 here but to 150 in the main
    # pipeline, and X is not passed through Preprocessing - confirm both are
    # intentional.
    X_train1 = tokenizer.texts_to_sequences(X.iloc[train])
    X_train = pad_sequences(X_train1,maxlen=50)
    print('x_train shape:',X_train.shape)
    y_train=np.array(Y.iloc[train])

    X_val1 = tokenizer.texts_to_sequences(X.iloc[test])
    X_val = pad_sequences(X_val1,maxlen=50)
    print('X_val shape:',X_val.shape)
    y_val=np.array(Y.iloc[test])

    # A fresh model per fold so no weights carry over between folds.
    embed_dim = 128
    lstm_out = 192
    d_r=0.4
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
    model.add(Dropout(d_r))
    model.add(Bidirectional(LSTM(lstm_out,return_sequences=True)))
    model.add(Dropout(d_r))
    model.add(Bidirectional(LSTM(124)))
    model.add(Dense(11,activation='softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer=optimizers.Adam(lr=0.001),metrics = ['acc'])
    batch_size = 64
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

    # Keras expects class_weight as a {class_id: weight} mapping; keyword
    # arguments keep the call valid on newer scikit-learn releases.
    fold_classes = np.unique(y_train)
    fold_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=fold_classes, y=y_train
    )
    class_weights = {
        int(label): float(weight)
        for label, weight in zip(fold_classes, fold_weights)
    }
    model.fit(X_train, y_train, epochs = 15, batch_size=batch_size, shuffle=True,
              validation_data=(X_val, y_val), callbacks=[es],
              class_weight=class_weights, verbose = 1)

    # Score on this fold's held-out rows. The global X_test was encoded with a
    # different tokenizer and padded to a different length, so it cannot be
    # fed to the fold model.
    Y_pred = np.argmax(model.predict(X_val), axis=-1)
    # Metric arguments are (y_true, y_pred).
    acc.append(accuracy_score(y_val, Y_pred))
    f1.append(f1_score(y_val, Y_pred, average='macro'))
    precision.append(precision_score(y_val, Y_pred, average='macro'))
    recall.append(recall_score(y_val, Y_pred, average='macro'))