In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.sequence import pad_sequences
pd.set_option('chained_assignment',None)


Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('train.txt',sep = " ",skip_blank_lines=False,skiprows=[1])
val_df = pd.read_csv('valid.txt',sep = " ",skip_blank_lines=False,skiprows=[1])
test_df = pd.read_csv('test.txt',sep = " ",skip_blank_lines=False,skiprows=[1])

In [3]:
class PrepareData(object):
    def __init__(self, dataframe):
        self.df = dataframe

    def preprocess_dataset(self):
        self.df = self.df.rename({"-DOCSTART-": "Word", "-X-": "POS", "-X-.1": "Chunk", "O": "NER_tag"}, axis=1)
        indexes_to_drop = self.df.loc[self.df["Word"] == "-DOCSTART-"].index.tolist()
        # df = df.drop(indexes_to_drop,axis = 0 )
        empty_word_cells = self.df.loc[self.df["Word"] == " "].index.tolist()
        #none_word_cells = self.df.loc[self.df["Word"].isnull()].index.tolist()
        indexes_to_drop.extend([i + 1 for i in indexes_to_drop])
        indexes_to_drop.extend(empty_word_cells)
        #indexes_to_drop.extend(none_word_cells)
        indexes_to_drop.sort()
        self.df = self.df.drop(indexes_to_drop, axis=0)
        #print("Null words if any still existing:" + " " + str(len(self.df.loc[self.df["Word"] == " "].index)))
        self.df = self.df.reset_index()
        self.df.drop(["index"], axis=1, inplace=True)
        sent_range = np.where(self.df.isnull().sum(axis=1).to_frame()[0] == 4)[0].tolist()
        self.df["sent_id"] = ""
        for index, value in enumerate(sent_range):
            if index == 0:
                self.df["sent_id"][index:value] = str(index)
            else:
                self.df["sent_id"][sent_range[index - 1] + 1:sent_range[index]] = str(index)
        self.df.drop(np.where(self.df.isnull().sum(axis=1).to_frame()[0] == 4)[0].tolist(), inplace=True)
        self.df.reset_index(inplace=True)
        self.df.drop(["index"], inplace=True, axis=1)
        return self.df
    



In [4]:
prepare_data = PrepareData(train_df)

In [5]:
train_df = prepare_data.preprocess_dataset()

In [37]:
prepare_val_data = PrepareData(val_df)
val_df = prepare_val_data.preprocess_dataset()

In [None]:
prepare_test_data = PrepareData(test_df)
test_df = prepare_test_data.preprocess_dataset()

In [10]:
#Additional cleaning for NaN words
none_word_cells = train_df.loc[train_df["Word"].isnull()].index.tolist()
train_df.fillna("None",inplace = True)
val_df.fillna("None",inplace = True)
test_df.fillna("None",inplace = True)



In [22]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["NER_tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sent_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None


In [23]:
getter = SentenceGetter(train_df)
sentences = getter.sentences

In [24]:
maxlen = max([len(s) for s in sentences])
print ('Maximum sequence length:', maxlen)

Maximum sequence length: 113


In [26]:
words = list(set(train_df["Word"].values))

In [27]:
n_words = len(words); n_words

23621

In [29]:
tags = list(set(train_df["NER_tag"].values))

In [30]:
n_tags = len(tags); n_tags

9

In [31]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [32]:
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [33]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]

In [34]:
y = pad_sequences(maxlen=113, sequences=y, padding="post", value=tag2idx["O"])

In [39]:
y

array([[7, 6, 1, ..., 6, 6, 6],
       [0, 8, 6, ..., 6, 6, 6],
       [1, 6, 6, ..., 6, 6, 6],
       ...,
       [6, 0, 8, ..., 6, 6, 6],
       [6, 0, 8, ..., 6, 6, 6],
       [6, 6, 6, ..., 6, 6, 6]])