In [0]:
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros



def load_embedding(filename):
	# load embedding into memory, skip first line
	file = open(filename,'r')
	lines = file.readlines()[1:]
	file.close()
	# create a map of words to vectors
	embedding = dict()
	for line in lines:
		parts = line.split()
		# key is string word, value is numpy array for vector
		embedding[parts[0]] = asarray(parts[1:], dtype='float32')
	return embedding
 
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, size):
	# total vocabulary size plus 0 for unknown words
	vocab_size = len(vocab) + 1
	# define weight matrix dimensions with all 0
	weight_matrix = zeros((vocab_size, size))
	# step vocab, store vectors using the Tokenizer's integer mapping
	for word, i in vocab.items():
		weight_matrix[i] = embedding.get(word)
	return weight_matrix


In [75]:
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 18 18:53:56 2019

@author: Souparna
"""
import gensim
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split


from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, TimeDistributed, Dropout
from keras.layers import Flatten
from keras.layers import Embedding, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM
# from keras_contrib.layers import CRF


# from utilities import load_embedding, get_weight_matrix

import pandas as pd
# data_file_path = "CADEC.csv"
data_file_path = '/content/gdrive/My Drive/updated_annotated_data.csv'

w2v_filename = "w2v_embeddings.txt"

data = pd.read_csv(data_file_path)
print(data["label"].value_counts())

tags = list(set(data["label"].values))
n_tags = len(tags);



tag2idx = {t: i for i, t in enumerate(tags)}
# tag2idx["PAD"] = 0
idx2tag = {v: k for k, v in tag2idx.items()}
print("tag2idx",tag2idx)
print("idx2tag", idx2tag)

data['word'] = data['word'].astype(str)

#------------------ word 2 vec ----------------------
sentences = data.groupby(["sec_no"])["word"].apply(list).to_list()

# texts = [gensim.utils.simple_preprocess(i) for word in sentences for i in word ]

# w2v_model = Word2Vec(texts, min_count = 1,  size = 300, window = 5, iter = 50)
# w2v_model.wv.save_word2vec_format(w2v_filename, binary=False)


tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# max_length = max([len(i) for i in sentences])
max_length = 35

encoded_docs = tokenizer.texts_to_sequences(sentences)


vocab_size = len(tokenizer.word_index) + 1
X_data = pad_sequences(encoded_docs, maxlen=max_length, padding='post', value = vocab_size+1)
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# # load embedding from file
# raw_embedding = load_embedding(w2v_filename)
# # get vectors in the right order
# embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index, 300)
# # create the embedding layer
# embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors],
#                             input_length=max_length, trainable=False)


y_labels = data.groupby(["sec_no"])["label"].apply(list).to_list()

y = [[tag2idx[l_i] for l_i in l] for l in y_labels]
y = pad_sequences(maxlen=max_length, sequences=y, padding="post", value=tag2idx["O"])


# X_data = X_data[:50000]
# y = y[:50000]


x_train, x_test, y_train, y_test = train_test_split(X_data,y , test_size = 0.1)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


model = Sequential()
# model.add(embedding_layer)
model.add(Embedding(vocab_size, output_dim=40, 
                  input_length=max_length, mask_zero=True))
model.add(Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.1)))


model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
# model.add(TimeDistributed(Dense(50, activation="relu")))
# crf = CRF(n_tags+1, sparse_target=True)
# model.add(crf)

model.compile(optimizer='Adam', loss="sparse_categorical_crossentropy", metrics = ["accuracy"])
# model.compile(optimizer='rmsprop', loss=crf.loss_function, metrics = [crf.accuracy])
print(model.summary())

history = model.fit(x_train, y_train.reshape(*y_train.shape, 1), batch_size=128, epochs=3,validation_split=0.1, verbose=1)




O            5034983
B-Symptom     352543
I-Symptom       5155
Name: label, dtype: int64
tag2idx {'B-Symptom': 0, 'O': 1, 'I-Symptom': 2}
idx2tag {0: 'B-Symptom', 1: 'O', 2: 'I-Symptom'}
(307983, 35)
(307983, 35)
(34221, 35)
(34221, 35)
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 35, 40)            8622840   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 35, 128)           53760     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 35, 3)             387       
Total params: 8,676,987
Trainable params: 8,676,987
Non-trainable params: 0
_________________________________________________________________
None
Train on 277184 samples, validate on 30799 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [76]:
y_hat = model.predict(x_test, verbose=1)
y_hat.shape




(34221, 35, 3)

In [77]:
import numpy
# import sklearn_crfsuite
def pred2label(pred):
  out = []
  for pred_i in pred:
    out_i = []
    for p in pred_i:
      p_i = numpy.argmax(p)
      out_i.append(idx2tag[p_i])
    out.append(out_i)
  return out

def test2label(pred):
  out = []
  for pred_i in pred:
    out_i = []
    for p in pred_i:
      out_i.append(idx2tag[p])
    out.append(out_i)
  return out
pred_labels = pred2label(y_hat)
test_labels = test2label(y_test)
from sklearn_crfsuite.metrics import flat_classification_report
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
print(classification_report(test_labels, pred_labels))
print(flat_classification_report(test_labels, pred_labels))

           precision    recall  f1-score   support

  Symptom       0.99      0.99      0.99     35308

micro avg       0.99      0.99      0.99     35308
macro avg       0.99      0.99      0.99     35308

              precision    recall  f1-score   support

   B-Symptom       0.99      0.99      0.99     35308
   I-Symptom       0.95      0.81      0.87       502
           O       1.00      1.00      1.00   1161925

    accuracy                           1.00   1197735
   macro avg       0.98      0.93      0.95   1197735
weighted avg       1.00      1.00      1.00   1197735



In [78]:
import numpy as np
idx2tag = {i: w for w, i in tag2idx.items()}


def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i])
        out.append(out_i)
    return out
    
def test2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out
    
pred_labels = pred2label(y_hat)

test_labels = test2label(y_test)
print(classification_report(test_labels, pred_labels))
print(flat_classification_report(test_labels, pred_labels))
print(np.array(pred_labels).shape)
print(np.array(test_labels).shape)

count = 0
total = 0
for i in range(len(test_labels)):
  if test_labels[i] == pred_labels[i]:
    count += 1
  total += 1
print(" accuracy : ",count/total)


           precision    recall  f1-score   support

  Symptom       0.99      0.99      0.99     35308

micro avg       0.99      0.99      0.99     35308
macro avg       0.99      0.99      0.99     35308

              precision    recall  f1-score   support

   B-Symptom       0.99      0.99      0.99     35308
   I-Symptom       0.95      0.81      0.87       502
           O       1.00      1.00      1.00   1161925

    accuracy                           1.00   1197735
   macro avg       0.98      0.93      0.95   1197735
weighted avg       1.00      1.00      1.00   1197735

(34221, 35)
(34221, 35)
 accuracy :  0.9845416557084831


In [84]:
for i in range(34):
  for j in range(35):
    if test_labels[i][j] == 'B-Symptom' or test_labels[i][j] ==  'I-Symptom':
      print(test_labels[i][j], "               ", pred_labels[i][j])

B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
I-Symptom                 I-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-Symptom
B-Symptom                 B-

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [8]:
!ls '/content/gdrive/My Drive/updated_annotated_data.csv'

 1-s2.0-S1877042811014005-main.pdf
 20180818_174343.jpg
 47299154-Solution-Manual-Introduction-to-the-Theory-of-Computation-Sipser.pdf
 CADEC.csv
 cadec_twitter_bilastmcrf.ipynb
 eyedetails.pdf
 FeeRecieptSpring.pdf
'Getting started.pdf'
 github-git-cheat-sheet.pdf
'Inter IIT.pdf'
'linux system programming.pdf'
 OS_Assignment_1.pdf
'OTM_2019_paper_140 .pdf'
 Project_Scope.docx
 resume_12_oct_2019.pdf
'resume_12_oct_2019 - Souparna Das.pdf'
 resume.pdf
 resumes
 sc11-cuda-c-basics.pdf
'Scan Nov 17, 2018 (1).pdf'
'Scan Nov 17, 2018.pdf'
 SMAI_ASSIGNMENTS
'System Design Book.pdf'
 ticket6.pdf
 Ticketdurgapuja.pdf
 twoWheeler.pdf
'Untitled Diagram.drawio'
 updated_annotated_data.csv


In [16]:
data.shape

(5392681, 4)

In [28]:
data[0:10].groupby(['sec_no'])['word'].apply(list).to_list()

[['Happy',
  'Together',
  '4',
  '-',
  'SM',
  'Entertainment',
  'special',
  'guest',
  'lineup',
  'BoA']]

In [29]:
str('happy')

'happy'

In [37]:
pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 3.9MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6


In [39]:
pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=21577c4214d8db9a5daa4e6c42472c15a05c458de339ab7ec76c48decdde5aa7
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [56]:
data['label'].value_counts()

O            5034983
B-Symptom     352543
I-Symptom       5155
Name: label, dtype: int64

In [60]:
y_test.shape

(34221, 35)

In [79]:
print(y_test[0:10])
print(y_hat[0:10])


[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
[[[5.9208405e-06 9.9991512e-01 7.8878613e-05]
  [8.4726998e-06 9.9720579e-01 2.7857295e-03]
  [1.2171252e-07 9.9998975e-01 1.0149465e-05]
  ...
  [3.9498153e-07 9.9999797e-01 1.6323543e-06]
  [5.1757013e-07 9.9999738e-01 2.1123112e-06]
  [6.5525052e-07 9.9999678e-01 2.