## testing spacy

In [5]:
import spacy

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
sample="When Alex started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
sample2="Kunarasa was born in 1941 in Jaffna and died on 28 February 2016"

In [8]:
doc = nlp(sample)

In [9]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Alex 5 9 PERSON
Google 50 56 ORG
2007 60 64 DATE


In [1]:
! pip install ktrain

^C


In [2]:
import ktrain
from ktrain import text

Using TensorFlow backend.


using Keras version: 2.2.4


## Read the input json file

In [14]:
import json
with open("documents_train.json","r") as read_file:
    data = json.load(read_file)

In [15]:
myData=[]
for document in data:
    text=document['text']
    H0=document['H0']
    H1=document['H1']
    i=0
    sentence=""
    categ="None"
    entities=[]
    while(i<len(text)):
        if(len(sentence)==0):
            start_index=0
        else:
            start_index=len(sentence)
        if(H0[i]==1):
            entities.append([start_index,start_index+len(text[i]),"H0"])
            if(categ!="H0"):
                categ='H0'
        if(H1[i]==1 and H0[i]==0):
            entities.append([start_index,start_index+len(text[i]),"H1"])
            if(categ!="H1" and categ!="H0"):
                categ='H1'
        sentence=sentence+" "+text[i]
        if('.' in text[i]):
            myData.append([sentence,categ,entities])
            sentence=""
            categ="None"
            entities=[]
        i+=1
            
    

In [16]:
myData

[[' Isaac David Abella (June 20, 1934 – October 23, 2016) was a Professor of Physics at The University of Chicago.',
  'H0',
  [[0, 5, 'H0'],
   [6, 11, 'H0'],
   [12, 18, 'H0'],
   [25, 28, 'H1'],
   [29, 33, 'H1'],
   [34, 35, 'H1'],
   [36, 43, 'H1'],
   [44, 47, 'H1'],
   [48, 53, 'H1'],
   [54, 57, 'H1'],
   [60, 69, 'H1'],
   [70, 72, 'H1'],
   [73, 80, 'H1'],
   [81, 83, 'H1'],
   [84, 87, 'H1'],
   [88, 98, 'H1'],
   [99, 101, 'H1'],
   [102, 110, 'H1']]],
 [' He specialized in laser physics, quantum optics, and spectroscopy.',
  'None',
  []],
 [' Isaac was the cousin of Irving Abella.',
  'H0',
  [[0, 5, 'H0'], [24, 30, 'H0'], [31, 38, 'H0']]],
 [' Isaac Abella was born on June 20, 1934 in Toronto, Ontario.',
  'H0',
  [[0, 5, 'H0'],
   [6, 12, 'H0'],
   [25, 29, 'H1'],
   [30, 33, 'H1'],
   [34, 38, 'H1'],
   [42, 50, 'H1'],
   [51, 59, 'H1']]],
 [' Abella received his Bachelor of Arts degree (1957) from the University of Toronto, Master of Arts (1959) degree, and Ph.',
  'H

In [17]:
import pandas as pd
my_df = pd.DataFrame(myData)
my_df.columns=['Sentence', 'Category', 'Spacy']

In [18]:
my_df.to_csv('sent_classifier_train.csv', index=False)

## Building the Sentence Classification model

We will be using Kashgiri library to build our model and will be evaluating its performance against multiple algorithms

In [1]:
import kashgari

In [2]:
from kashgari.corpus import SMP2018ECDTCorpus

In [3]:
#Importing libraries to create data to be used for training
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
set(stopwords.words('english'))
import string
table = str.maketrans(dict.fromkeys(string.punctuation)) #Used to remove the punctuations

In [26]:
data = pd.read_csv("sent_classifier_train.csv", index_col=False)
X= data.Sentence
Y=data.Category
print(X.shape)
print(Y.shape)

(1427,)
(1427,)


In [31]:
stop_words = set(stopwords.words('english')) 
X_final=[]
y_final=[]
for item,label in zip(X,Y):
    sentence=item.lower()
    #Remove punctuations
    new_sentence=sentence.translate(table)
    #Tokenize the transformed sentence
    word_tokens=word_tokenize(new_sentence)
    sample=[w for w in word_tokens if not w in stop_words]
    if(len(sample)>0):
        X_final.append(sample)
        y_final.append(label)
        

In [35]:
y_final[:5]

['H0', 'None', 'H0', 'H0', 'H0']

In [37]:
df_X = pd.DataFrame({'Tokenized_Examples': X_final})
df_Y=pd.DataFrame({'Labels': y_final})
print(df_X.shape)
print(df_Y.shape)

(1381, 1)
(1381, 1)


In [44]:
#Split data into training, testing, and  in 60:20:20 ratio
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [45]:
import kashgari
from kashgari.tasks.classification import BiLSTM_Model
import logging
logging.basicConfig(level='DEBUG')

In [46]:
#Convert to format required by BERT classifier
X_train=X_train.Tokenized_Examples.tolist()
X_val=X_val.Tokenized_Examples.tolist()
y_train=y_train.Labels.tolist()
y_val=y_val.Labels.tolist()
X_test=X_test.Tokenized_Examples.tolist()
y_test=y_test.Labels.tolist()

In [None]:
import kashgari
from kashgari.tasks.classification import BiGRU_Model
from kashgari.embeddings import BERTEmbedding

import logging
logging.basicConfig(level='DEBUG')
# "D:\Berkeley\1stSem\290\Project\BERT" has the vocab.txt file with pre trained embeddings
bert_embed = BERTEmbedding('BERT',
                           task=kashgari.CLASSIFICATION,
                           sequence_length=100)
model = BiGRU_Model(bert_embed)
model.fit(X_train, y_train, X_val, y_val)

W1126 15:02:48.405586 16692 bert_embedding.py:126] seq_len: 100


Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

Epoch 1/5
 3/14 [=====>........................] - ETA: 4:24 - loss: 2.4992 - acc: 0.4167

In [216]:
# Evaluate the model
X_test=X_test.Tokenized_Examples.tolist()
y_test=y_test.Labels.tolist()
model.evaluate(X_test_list, y_test)

              precision    recall  f1-score   support

          H0     0.8741    0.8613    0.8676       137
          H1     0.7143    0.6429    0.6767        70
        None     0.7159    0.7975    0.7545        79

    accuracy                         0.7902       286
   macro avg     0.7681    0.7672    0.7663       286
weighted avg     0.7913    0.7902    0.7897       286

