In [1]:
# ! python -m spacy download en_core_web_sm
import os
import csv
import time
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from IPython.display import display, clear_output
from sklearn.feature_extraction.text import CountVectorizer
os.chdir(os.getcwd())

In [2]:
data = {}
data['cardboard'] = pd.read_csv('Data/cardboard_data.txt',
                                names=['document', 'sentence', 'token', 'word', 'negation'],
                                sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')
data['circle'] = pd.read_csv('Data/circle_data.txt',
                                names=['document', 'sentence', 'token', 'word', 'negation'],
                                sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')

data = data['cardboard'].append(data['circle'])
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,document,sentence,token,word,negation
0,cardboard,0,0,In,O
1,cardboard,0,1,choosing,O
2,cardboard,0,2,a,O
3,cardboard,0,3,few,O
4,cardboard,0,4,typical,O
...,...,...,...,...,...
19211,circle02,221,10,the,O
19212,circle02,221,11,second,O
19213,circle02,221,12,act,O
19214,circle02,221,13,.,O


In [3]:
def construct_sentence(df):
    sentence = ''
    
    for token in df.word:
        if token in ['.', ',', '?', '!']:
            sentence += token
        else:
            sentence += ' ' + token
    
    return sentence[1:]

# Feature engineering

In [4]:
sentences = []
for document_id in data.document.unique():
    for sentence_id in data[data.document == document_id].sentence.unique():
        sentence = construct_sentence(data[(data.document == document_id) & (data.sentence == sentence_id)])
        sentences.append(sentence)

In [5]:
num_sentences = len(sentences)
nlp = spacy.load('en_core_web_sm')
features = pd.DataFrame(columns=['word', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop'])

start_time = time.time()
for i, sentence in enumerate(sentences):
    print(f'\rSentence: {i+1}/{num_sentences} \t Elapsed seconds: {int(time.time()-start_time)}', end='')
    
    doc = nlp(sentence)
    
    for token in doc:
        features = features.append({'word': token.text,
                                    'lemma': token.lemma_,
                                    'pos': token.pos_,
                                    'tag': token.tag_,
                                    'dep': token.dep_,
                                    'shape': token.shape_,
                                    'is_alpha': int(token.is_alpha),
                                    'is_stop': int(token.is_stop)}, ignore_index=True)

features

Sentence: 1089/1089 	 Elapsed seconds: 52

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,In,in,ADP,IN,prep,Xx,1,1
1,choosing,choose,VERB,VBG,pcomp,xxxx,1,0
2,a,a,DET,DT,quantmod,x,1,1
3,few,few,ADJ,JJ,nummod,xxx,1,1
4,typical,typical,ADJ,JJ,amod,xxxx,1,0
...,...,...,...,...,...,...,...,...
19724,the,the,DET,DT,det,xxx,1,1
19725,second,second,ADJ,JJ,amod,xxxx,1,0
19726,act,act,NOUN,NN,pobj,xxx,1,0
19727,.,.,PUNCT,.,punct,.,0,0


## Adjust the feature set to conform with the training format

In [6]:
saved_features = features.copy()
# features.to_csv('features_per_sentence_test_data.csv', index=False)

In [7]:
features = saved_features.copy()

def dot_line(i):
    return pd.DataFrame({'word': '.', 'lemma': '.', 'pos': 'PUNCT', 'tag': '.',
                         'dep': 'punct', 'shape': '.', 'is_alpha': 0, 'is_stop': 0}, index=[i])

for i in range(len(features)-1, 0, -1):
    features.reset_index(drop=True, inplace=True)
    
    if i not in features.index:
        continue
    
    elif features.word[i] == '`' and features.word[i+1] == '`':
        features.word[i] = '``'
        features.lemma[i] = '``'
        features.drop(i+1, inplace=True)
    
    elif features.word[i] == '.' and features.word[i-1] in [str(integer) for integer in range(100)]:
        features.drop(i, inplace=True)
    
    elif features.word[i] == '-':
        features.loc[i-1, 'word'] = features.loc[i-1, 'word'] + features.loc[i, 'word'] + features.loc[i+1, 'word']
        features.drop([i, i+1], inplace=True)
    
    elif features.word[i] == "'" and features.word[i+1] in ['ve', 'm']:
        features.word[i+1] += features.word[i+1]
        features.lemma[i+1] += features.word[i+1]
        features.drop(i, inplace=True)

    elif features.word[i] == 'No' and features.word[i+1] == '.':
        try:
            if int(features.word[i+2]) > 0:
                features.word[i] = 'No.'
                features.drop(i+1, inplace=True)
        except:
            pass
    
    elif features.word[i] in ['I.', 'D.', 'T.', 'G.'] and features.pos[i+1] == 'PUNCT':
        features.word[i] = features.word[i][0]
        features.loc[i, 'shape'] = 'X'
        features = features.append(dot_line(i+0.5), ignore_index=False)
        features = features.sort_index()
    
    elif features.word[i] == "'" and features.word[i+1] in ['It', 'Well']:
        features.word[i+1] = "'" + features.word[i+1]
        features.drop(i, inplace=True)

features = features.append(dot_line(14583.5), ignore_index=False)
features = features.append(dot_line(14587.5), ignore_index=False)
features = features.append(dot_line(14598.5), ignore_index=False)
features = features.sort_index()

features.reset_index(drop=True, inplace=True)

In [11]:
x = 15000
y = 10
features.word[x:x+y]

15000        the
15001    instant
15002         to
15003         us
15004       both
15005          .
15006     Holmes
15007     sprang
15008         up
15009       from
Name: word, dtype: object

In [12]:
data.word[x:x+y]

15000        the
15001    instant
15002         to
15003         us
15004       both
15005          .
15006     Holmes
15007     sprang
15008         up
15009       from
Name: word, dtype: object

In [13]:
len(features), len(data)

(19216, 19216)

In [14]:
np.sum(features.word != data.word)

52

## Trigrams

In [15]:
features['trigram'] = features.word.shift() + ' ' +  features.word + ' ' + features.word.shift(-1)
features.loc[0, 'trigram'] = features.trigram[1]
features.loc[len(features)-1, 'trigram'] = features.trigram[len(features)-2]

In [16]:
features

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop,trigram
0,In,in,ADP,IN,prep,Xx,1,1,In choosing a
1,choosing,choose,VERB,VBG,pcomp,xxxx,1,0,In choosing a
2,a,a,DET,DT,quantmod,x,1,1,choosing a few
3,few,few,ADJ,JJ,nummod,xxx,1,1,a few typical
4,typical,typical,ADJ,JJ,amod,xxxx,1,0,few typical cases
...,...,...,...,...,...,...,...,...,...
19211,the,the,DET,DT,det,xxx,1,1,for the second
19212,second,second,ADJ,JJ,amod,xxxx,1,0,the second act
19213,act,act,NOUN,NN,pobj,xxx,1,0,second act .
19214,.,.,PUNCT,.,punct,.,0,0,act . ''


In [18]:
data['lemma'] = features.lemma
data['pos'] = features.pos
data['tag'] = features.tag
data['dependency'] = features.dep
data['shape_'] = features['shape']
data['is_alpha'] = features.is_alpha
data['is_stop'] = features.is_stop
data['trigram'] = features.trigram

data = data[['document', 'sentence', 'token', 'word', 'lemma', 'pos', 'tag',
             'dependency', 'shape_', 'is_alpha', 'is_stop', 'trigram', 'negation']]

data

Unnamed: 0,document,sentence,token,word,lemma,pos,tag,dependency,shape_,is_alpha,is_stop,trigram,negation
0,cardboard,0,0,In,in,ADP,IN,prep,Xx,1,1,In choosing a,O
1,cardboard,0,1,choosing,choose,VERB,VBG,pcomp,xxxx,1,0,In choosing a,O
2,cardboard,0,2,a,a,DET,DT,quantmod,x,1,1,choosing a few,O
3,cardboard,0,3,few,few,ADJ,JJ,nummod,xxx,1,1,a few typical,O
4,cardboard,0,4,typical,typical,ADJ,JJ,amod,xxxx,1,0,few typical cases,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19211,circle02,221,10,the,the,DET,DT,det,xxx,1,1,for the second,O
19212,circle02,221,11,second,second,ADJ,JJ,amod,xxxx,1,0,the second act,O
19213,circle02,221,12,act,act,NOUN,NN,pobj,xxx,1,0,second act .,O
19214,circle02,221,13,.,.,PUNCT,.,punct,.,0,0,act . '',O


In [19]:
# data.to_csv('test_data_features_alligned.csv', index=False)