In [1]:
# ! python -m spacy download en_core_web_sm
import os
import csv
import time
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from IPython.display import display, clear_output
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): os.getcwd() returns the current working directory, so changing
# into it leaves the working directory unchanged; this line has no effect on
# how the relative paths used later are resolved.
os.chdir(os.getcwd())

In [2]:
# Read the tab-separated token file. Supplying `names` makes pandas treat the
# first line as data rather than as a header, and QUOTE_NONE keeps quote
# characters inside a field from being interpreted as quoting.
# NOTE(review): pandas' default NA parsing is still active here, so a token
# spelled e.g. "null" or "NA" would be read as NaN -- confirm none occur.
data = pd.read_csv(
    'Data/SEM-2012-SharedTask-CD-SCO-dev-simple.v2.txt',
    sep='\t',
    names=['document', 'sentence', 'token', 'word', 'negation'],
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
)

data

Unnamed: 0,document,sentence,token,word,negation
0,wisteria01,0,0,1.,O
1,wisteria01,0,1,The,O
2,wisteria01,0,2,Singular,O
3,wisteria01,0,3,Experience,O
4,wisteria01,0,4,of,O
...,...,...,...,...,...
13562,wisteria02,439,9,orthodox,O
13563,wisteria02,439,10,in,O
13564,wisteria02,439,11,his,O
13565,wisteria02,439,12,ritual,O


In [3]:
def construct_sentence(df):
    """Join the tokens in ``df.word`` into a single sentence string.

    Tokens are separated by one space, except that '.', ',', '?' and '!' are
    attached directly to the token before them.

    Parameters
    ----------
    df : object with a ``word`` attribute that iterates over string tokens
        (in this notebook, the rows of one sentence from the token frame).

    Returns
    -------
    str
        The reconstructed sentence, or '' when there are no tokens.
    """
    attach_to_previous = ('.', ',', '?', '!')
    pieces = []

    for token in df.word:
        # Separators go only *between* tokens. The earlier implementation
        # always sliced off the first character of the result, which removed
        # a real character whenever the first token was a punctuation mark
        # (no leading space had been added in that case).
        if pieces and token not in attach_to_previous:
            pieces.append(' ')
        pieces.append(token)

    return ''.join(pieces)

# Feature engineering

In [4]:
# Rebuild one plain-text sentence per (document, sentence) pair.
# groupby(sort=False) yields groups in order of first appearance and keeps the
# rows of each group in their original order, so the resulting list has the
# same order as looping over `unique()` values -- without re-filtering the
# whole frame with boolean masks for every single sentence.
sentences = [
    construct_sentence(sentence_rows)
    for _, document_rows in data.groupby('document', sort=False)
    for _, sentence_rows in document_rows.groupby('sentence', sort=False)
]

In [5]:
num_sentences = len(sentences)
nlp = spacy.load('en_core_web_sm')
feature_columns = ['word', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop']

# Collect one record per spaCy token and build the frame once at the end.
# DataFrame.append, used here previously, was removed in pandas 2.0 and copied
# the entire frame on every call.
records = []

start_time = time.time()
for i, sentence in enumerate(sentences):
    # '\r' rewrites the same output line so the progress counter updates in place.
    print(f'\rSentence: {i+1}/{num_sentences} \t Elapsed seconds: {int(time.time()-start_time)}', end='')

    doc = nlp(sentence)

    for token in doc:
        records.append({'word': token.text,
                        'lemma': token.lemma_,
                        'pos': token.pos_,
                        'tag': token.tag_,
                        'dep': token.dep_,
                        'shape': token.shape_,
                        'is_alpha': int(token.is_alpha),
                        'is_stop': int(token.is_stop)})

# Passing `columns` fixes the column order and still yields the expected
# (empty) columns when there are no sentences at all.
features = pd.DataFrame(records, columns=feature_columns)

features

Sentence: 787/787 	 Elapsed seconds: 32

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,1,1,X,LS,ROOT,d,0,0
1,.,.,PUNCT,.,punct,.,0,0
2,The,the,DET,DT,det,Xxx,1,1
3,Singular,Singular,PROPN,NNP,compound,Xxxxx,1,0
4,Experience,Experience,PROPN,NNP,ROOT,Xxxxx,1,0
...,...,...,...,...,...,...,...,...
13942,orthodox,orthodox,ADJ,JJ,acomp,xxxx,1,0
13943,in,in,ADP,IN,prep,xx,1,1
13944,his,his,PRON,PRP$,poss,xxx,1,1
13945,ritual,ritual,NOUN,NN,pobj,xxxx,1,0


## Adjust the feature set to conform with the training format

In [7]:
# Keep an untouched copy of the raw spaCy output. The alignment cell starts
# from `saved_features`, so it can be re-run without repeating the parsing loop.
saved_features = features.copy()
# features.to_csv('features_per_sentence_val_data.csv', index=False)

In [8]:
features = saved_features.copy()
# Row 1 is the '.' that the parser split off the leading "1" (see the feature
# table printed above); the reference data keeps "1." as a single token, so
# that row is removed.
features.drop(1, inplace=True)
features.reset_index(drop=True, inplace=True)

def dot_line(i):
    """Return a one-row feature frame describing a '.' token, indexed by ``i``."""
    return pd.DataFrame({'word': '.', 'lemma': '.', 'pos': 'PUNCT', 'tag': '.',
                         'dep': 'punct', 'shape': '.', 'is_alpha': 0, 'is_stop': 0}, index=[i])

# The strings '0'..'99'; built once instead of on every loop iteration.
small_numbers = {str(integer) for integer in range(100)}

# Walk the rows from the end to the start so that dropping or inserting rows
# at or after position i does not shift the positions still to be visited.
# Cell access goes through .loc: assigning via `features.word[i] = ...` is
# chained assignment and is not guaranteed to write into `features`.
for i in range(len(features)-1, 0, -1):
    # Restore a 0..n-1 integer index after any drop/insert of the previous step.
    features.reset_index(drop=True, inplace=True)

    if i not in features.index:
        continue

    # Two consecutive backticks -> one "``" token.
    elif features.loc[i, 'word'] == '`' and features.loc[i+1, 'word'] == '`':
        features.loc[i, 'word'] = '``'
        features.loc[i, 'lemma'] = '``'
        features.drop(i+1, inplace=True)

    # A '.' directly after a number below 100 is dropped.
    elif features.loc[i, 'word'] == '.' and features.loc[i-1, 'word'] in small_numbers:
        features.drop(i, inplace=True)

    # "a", "-", "b" -> "a-b". Only `word` is merged; the remaining columns keep
    # the values of the first part.
    elif features.loc[i, 'word'] == '-':
        features.loc[i-1, 'word'] = features.loc[i-1, 'word'] + features.loc[i, 'word'] + features.loc[i+1, 'word']
        features.drop([i, i+1], inplace=True)

    # "'" followed by "ve"/"m" -> "'ve"/"'m".
    elif features.loc[i, 'word'] == "'" and features.loc[i+1, 'word'] in ['ve', 'm']:
        # The earlier code concatenated the clitic with itself ("veve") and then
        # appended that to the lemma; the apostrophe row is what has to be
        # merged in before it is dropped.
        # NOTE(review): prefixing the lemma with the apostrophe's lemma is the
        # presumed intent -- confirm against the training-side notebook.
        features.loc[i+1, 'word'] = features.loc[i, 'word'] + features.loc[i+1, 'word']
        features.loc[i+1, 'lemma'] = features.loc[i, 'lemma'] + features.loc[i+1, 'lemma']
        features.drop(i, inplace=True)

    # "No", ".", <positive integer> -> "No.", <integer>.
    elif features.loc[i, 'word'] == 'No' and features.loc[i+1, 'word'] == '.':
        try:
            if int(features.loc[i+2, 'word']) > 0:
                features.loc[i, 'word'] = 'No.'
                features.drop(i+1, inplace=True)
        except (KeyError, ValueError, TypeError):
            # No row i+2, or its word is not an integer: leave the rows alone.
            pass

    # A lone "'" directly before a positive integer is dropped.
    elif features.loc[i, 'word'] == "'":
        try:
            if int(features.loc[i+1, 'word']) > 0:
                features.drop(i, inplace=True)
        except (KeyError, ValueError, TypeError):
            # No following row, or its word is not an integer: keep the apostrophe.
            pass

    # "I." / "D." followed by punctuation -> "I" / "D" plus a separate '.' row.
    elif features.loc[i, 'word'] in ['I.', 'D.'] and features.loc[i+1, 'pos'] == 'PUNCT':
        features.loc[i, 'word'] = features.loc[i, 'word'][0]
        features.loc[i, 'shape'] = 'X'
        # Index i+0.5 sorts between rows i and i+1. pd.concat replaces
        # DataFrame.append, which was removed in pandas 2.0.
        features = pd.concat([features, dot_line(i+0.5)])
        features = features.sort_index()

# NOTE(review): 1672.5 and 11435.5 are hard-coded positions for two extra '.'
# rows; they are only valid for this exact input file and parser output --
# presumably found by comparing against `data` by hand; confirm before reuse.
features = pd.concat([features, dot_line(1672.5), dot_line(11435.5)])
features = features.sort_index()

features.reset_index(drop=True, inplace=True)

In [10]:
# Spot-check window: `x` is the first row shown and `y` the number of rows.
x = 10000
y = 10
features.word[x:x+y]

10000     would
10001    entail
10002         .
10003         I
10004       may
10005       add
10006      that
10007      Miss
10008    Burnet
10009        's
Name: word, dtype: object

In [11]:
# Same row window taken from the reference tokens, for comparison with the
# parser-derived tokens.
data.word[x:x+y]

10000     would
10001    entail
10002         .
10003         I
10004       may
10005       add
10006      that
10007      Miss
10008    Burnet
10009        's
Name: word, dtype: object

In [12]:
# Row counts of the parser-derived frame and the reference frame; the columns
# are later combined row by row, so these need to be equal.
len(features), len(data)

(13567, 13567)

In [13]:
# Number of rows whose parser-derived token text differs from the reference token.
np.sum(features.word != data.word)

18

## Trigrams

In [14]:
# Context window per row: previous word, the word itself, next word.
# shift() leaves NaN at the two ends, so the first and last rows come out as
# NaN here and are filled in below.
features['trigram'] = (
    (features.word.shift(1) + ' ' + features.word)
    + (' ' + features.word.shift(-1))
)
# The edge rows have no complete window; they reuse their neighbour's trigram.
features.at[0, 'trigram'] = features.at[1, 'trigram']
features.at[len(features) - 1, 'trigram'] = features.at[len(features) - 2, 'trigram']

In [15]:
# Display the feature frame, now including the `trigram` column.
features

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop,trigram
0,1,1,X,LS,ROOT,d,0,0,1 The Singular
1,The,the,DET,DT,det,Xxx,1,1,1 The Singular
2,Singular,Singular,PROPN,NNP,compound,Xxxxx,1,0,The Singular Experience
3,Experience,Experience,PROPN,NNP,ROOT,Xxxxx,1,0,Singular Experience of
4,of,of,ADP,IN,prep,xx,1,1,Experience of Mr.
...,...,...,...,...,...,...,...,...,...
13562,orthodox,orthodox,ADJ,JJ,acomp,xxxx,1,0,very orthodox in
13563,in,in,ADP,IN,prep,xx,1,1,orthodox in his
13564,his,his,PRON,PRP$,poss,xxx,1,1,in his ritual
13565,ritual,ritual,NOUN,NN,pobj,xxxx,1,0,his ritual .


In [16]:
# Attach the aligned parser features to the reference rows (matched on the
# shared index), storing `dep` as `dependency` and `shape` as `shape_`, then
# order the columns so that `negation` comes last.
data = data.assign(
    lemma=features.lemma,
    pos=features.pos,
    tag=features.tag,
    dependency=features.dep,
    shape_=features['shape'],  # bracket access: `.shape` is the frame's dimensions
    is_alpha=features.is_alpha,
    is_stop=features.is_stop,
    trigram=features.trigram,
)[[
    'document', 'sentence', 'token', 'word', 'lemma', 'pos', 'tag',
    'dependency', 'shape_', 'is_alpha', 'is_stop', 'trigram', 'negation',
]]

data

Unnamed: 0,document,sentence,token,word,lemma,pos,tag,dependency,shape_,is_alpha,is_stop,trigram,negation
0,wisteria01,0,0,1.,1,X,LS,ROOT,d,0,0,1 The Singular,O
1,wisteria01,0,1,The,the,DET,DT,det,Xxx,1,1,1 The Singular,O
2,wisteria01,0,2,Singular,Singular,PROPN,NNP,compound,Xxxxx,1,0,The Singular Experience,O
3,wisteria01,0,3,Experience,Experience,PROPN,NNP,ROOT,Xxxxx,1,0,Singular Experience of,O
4,wisteria01,0,4,of,of,ADP,IN,prep,xx,1,1,Experience of Mr.,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13562,wisteria02,439,9,orthodox,orthodox,ADJ,JJ,acomp,xxxx,1,0,very orthodox in,O
13563,wisteria02,439,10,in,in,ADP,IN,prep,xx,1,1,orthodox in his,O
13564,wisteria02,439,11,his,his,PRON,PRP$,poss,xxx,1,1,in his ritual,O
13565,wisteria02,439,12,ritual,ritual,NOUN,NN,pobj,xxxx,1,0,his ritual .,O


In [17]:
# Write the merged frame to CSV without the index column.
# NOTE(review): "alligned" is misspelled in the file name; it is left as-is
# because other code may read this exact path -- confirm before renaming.
data.to_csv('val_data_features_alligned.csv', index=False)