In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import keras_metrics
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.utils import to_categorical
pd.set_option('chained_assignment',None)


Using TensorFlow backend.


In [2]:
#Reading the data
train_df = pd.read_csv('train.txt',sep = " ",skip_blank_lines=False,skiprows=[1])
val_df = pd.read_csv('valid.txt',sep = " ",skip_blank_lines=False,skiprows=[1])
test_df = pd.read_csv('test.txt',sep = " ",skip_blank_lines=False,skiprows=[1])

In [3]:
#Function to clean and index the dataset
class PrepareData(object):
    def __init__(self, dataframe):
        self.df = dataframe

    def preprocess_dataset(self):
        self.df = self.df.rename({"-DOCSTART-": "Word", "-X-": "POS", "-X-.1": "Chunk", "O": "NER_tag"}, axis=1)
        indexes_to_drop = self.df.loc[self.df["Word"] == "-DOCSTART-"].index.tolist()
        # df = df.drop(indexes_to_drop,axis = 0 )
        empty_word_cells = self.df.loc[self.df["Word"] == " "].index.tolist()
        #none_word_cells = self.df.loc[self.df["Word"].isnull()].index.tolist()
        indexes_to_drop.extend([i + 1 for i in indexes_to_drop])
        indexes_to_drop.extend(empty_word_cells)
        #indexes_to_drop.extend(none_word_cells)
        indexes_to_drop.sort()
        self.df = self.df.drop(indexes_to_drop, axis=0)
        #print("Null words if any still existing:" + " " + str(len(self.df.loc[self.df["Word"] == " "].index)))
        self.df = self.df.reset_index()
        self.df.drop(["index"], axis=1, inplace=True)
        sent_range = np.where(self.df.isnull().sum(axis=1).to_frame()[0] == 4)[0].tolist()
        self.df["sent_id"] = ""
        for index, value in enumerate(sent_range):
            if index == 0:
                self.df["sent_id"][index:value] = str(index)
            else:
                self.df["sent_id"][sent_range[index - 1] + 1:sent_range[index]] = str(index)
        self.df.drop(np.where(self.df.isnull().sum(axis=1).to_frame()[0] == 4)[0].tolist(), inplace=True)
        self.df.reset_index(inplace=True)
        self.df.drop(["index"], inplace=True, axis=1)
        return self.df
    



#### Running the function on all three datasets

In [4]:
prepare_data = PrepareData(train_df)

In [5]:
train_df = prepare_data.preprocess_dataset()

In [6]:
prepare_val_data = PrepareData(val_df)
val_df = prepare_val_data.preprocess_dataset()

In [7]:
prepare_test_data = PrepareData(test_df)
test_df = prepare_test_data.preprocess_dataset()

In [8]:
#Additional cleaning for NaN words
none_word_cells = train_df.loc[train_df["Word"].isnull()].index.tolist()
train_df.fillna("None",inplace = True)
val_df.fillna("None",inplace = True)
test_df.fillna("None",inplace = True)



In [9]:
weighting_df = train_df["NER_tag"].value_counts().to_frame().reset_index()

In [10]:
sum_of_all_classes = weighting_df["NER_tag"].sum()
weighting_df["percentage"] = 1 - (weighting_df["NER_tag"]/sum_of_all_classes)

In [11]:
weighting_df

Unnamed: 0,index,NER_tag,percentage
0,O,167400,0.168996
1,B-LOC,7140,0.964556
2,B-PER,6600,0.967236
3,B-ORG,6321,0.968621
4,I-PER,4528,0.977522
5,I-ORG,3704,0.981613
6,B-MISC,3438,0.982933
7,I-LOC,1157,0.994256
8,I-MISC,1155,0.994266


In [12]:
#Class to get sentences as sentences will be fed to the lstm
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["NER_tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sent_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None


In [13]:
#Extracting sentences for all 3 datasets
getter = SentenceGetter(train_df)
sentences = getter.sentences
getter_val = SentenceGetter(val_df)
val_sentences = getter_val.sentences
getter_test = SentenceGetter(test_df)
test_sentences = getter_test.sentences

In [14]:
# checking max length 
maxlen = max([len(s) for s in sentences])
maxlen_val = max([len(s) for s in val_sentences])
maxlen_test = max([len(s) for s in test_sentences])
print ('Maximum sequence length for train,val,test:', maxlen , " " ,maxlen_val ," ", maxlen_test)


Maximum sequence length for train,val,test: 113   109   124


In [15]:
words = list(set(train_df["Word"].values))
words_val = list(set(val_df["Word"].values))
words_test = list(set(test_df["Word"].values))
words.append("ENDPAD")
words_val.append('ENDPAD')
words_test.append('ENDPAD')

In [16]:
#Length of all words
n_words = len(words)
n_words_val = len(words_val)
n_words_test = len(words_test)

In [17]:
# Get all distinct tags
tags = list(set(train_df["NER_tag"].values))
tags_val = list(set(val_df["NER_tag"].values))
tags_test = list(set(test_df["NER_tag"].values))

In [18]:
n_tags = len(tags)
n_tags_val = len(tags_val)
n_tags_test = len(tags_test)

In [19]:
# Changing to numerical representation of words and tags
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
word2idx_val = {w: i for i, w in enumerate(words_val)}
tag2idx_val = {t: i for i, t in enumerate(tags_val)}
word2idx_test = {w: i for i, w in enumerate(words_test)}
tag2idx_test = {t: i for i, t in enumerate(tags_test)}


In [20]:
# Changing the words in the dataset to numbers for each sentence
X = [[word2idx[w[0]] for w in s] for s in sentences]
X_val = [[word2idx_val[w[0]] for w in s] for s in val_sentences]
X_test = [[word2idx_test[w[0]] for w in s] for s in test_sentences]

In [21]:
# Changing the tags in the dataset to numbers for each sentence
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y_val = [[tag2idx_val[w[1]] for w in s] for s in val_sentences]
y_test = [[tag2idx_test[w[1]] for w in s] for s in test_sentences]

In [22]:
# Padding with a fixed length for words . The pad value is "ENDPAD"
X = pad_sequences(maxlen=130, sequences=X, padding="post",value=n_words - 1)
X_val = pad_sequences(maxlen=130, sequences=X_val, padding="post",value=n_words_val - 1)
X_test = pad_sequences(maxlen=130, sequences=X_test, padding="post",value=n_words_test - 1)

In [50]:
#Building a class weight dictionary for imbalanced dataset
#weight_dict = dict()
#for key,value in tag2idx.items():
#    per = weighting_df["percentage"].loc[weighting_df["index"] == key].item()
#    weight_dict[value] = per
    

In [51]:
#weight_dict

In [26]:
# Padding with a fixed length for words . The pad value is "O"
y = pad_sequences(maxlen=130, sequences=y, padding="post", value=tag2idx["O"])
y_val = pad_sequences(maxlen=130, sequences=y_val, padding="post", value=tag2idx_val["O"])
y_test = pad_sequences(maxlen=130, sequences=y_test, padding="post", value=tag2idx_test["O"])

In [27]:
# Changing y to one hot encoding type
y = [to_categorical(i, num_classes=n_tags) for i in y]
y_val = [to_categorical(i, num_classes=n_tags_val) for i in y_val]
y_test = [to_categorical(i, num_classes=n_tags_test) for i in y_test]

In [33]:
# Creating LSTM model
input = Input(shape=(130,))
model = Embedding(input_dim=n_words, output_dim=130, input_length=130)(input)
model = Dropout(0.1)(model)
model = (LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy",keras_metrics.f1_score()])

In [34]:
history = model.fit(X, np.array(y), batch_size=24, epochs=3,verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
prediction = model.predict(np.array(X_test))
p = np.argmax(prediction, axis=-1)
