In [1]:
# ! python -m spacy download en_core_web_sm
import os
import csv
import time
import spacy
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from IPython.display import display, clear_output
from sklearn.feature_extraction.text import CountVectorizer
os.chdir(os.getcwd())

In [2]:
# Load the tab-separated training file (no header row, so column names are
# supplied here). QUOTE_NONE keeps quote characters as literal tokens.
# keep_default_na=False stops pandas from turning literal tokens such as
# "None", "null" or "NA" into NaN, which would break string handling on the
# `word` column further down.
data = pd.read_csv('Data/SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt',
                   names=['document', 'sentence', 'token', 'word', 'negation'],
                   sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8',
                   keep_default_na=False)

data

Unnamed: 0,document,sentence,token,word,negation
0,baskervilles01,0,0,Chapter,O
1,baskervilles01,0,1,1.,O
2,baskervilles01,0,2,Mr.,O
3,baskervilles01,0,3,Sherlock,O
4,baskervilles01,0,4,Holmes,O
...,...,...,...,...,...
65446,baskervilles14,270,58,slopes,O
65447,baskervilles14,270,59,of,O
65448,baskervilles14,270,60,the,O
65449,baskervilles14,270,61,moor,O


In [3]:
def construct_sentence(df):
    """Join the tokens in ``df.word`` into a single plain-text sentence.

    Tokens are separated by one space, except that the punctuation marks
    ``.``, ``,``, ``?`` and ``!`` are attached directly to the preceding text.

    Parameters
    ----------
    df : object with a ``word`` attribute
        Anything whose ``word`` attribute iterates over string tokens in
        sentence order (here: a slice of the training DataFrame).

    Returns
    -------
    str
        The reconstructed sentence; an empty string when there are no tokens.
    """
    attached_punctuation = {'.', ',', '?', '!'}
    pieces = []

    for token in df.word:
        # Only insert a separator once something has been emitted. The previous
        # implementation always sliced off the first character, which deleted
        # the first token whenever a sentence started with punctuation.
        if pieces and token not in attached_punctuation:
            pieces.append(' ')
        pieces.append(token)

    return ''.join(pieces)

# Feature engineering

In [4]:
# Rebuild one plain-text sentence per (document, sentence) pair.
# Grouping (with sort=False, i.e. in order of first appearance) visits the
# same documents and sentences in the same order as looping over .unique(),
# but avoids re-scanning the whole frame with a boolean mask for every sentence.
sentences = [
    construct_sentence(sentence_rows)
    for _, document_rows in data.groupby('document', sort=False)
    for _, sentence_rows in document_rows.groupby('sentence', sort=False)
]

In [5]:
num_sentences = len(sentences)
nlp = spacy.load('en_core_web_sm')
feature_columns = ['word', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop']

# Collect one record per spaCy token and build the DataFrame once at the end.
# DataFrame.append copied the whole frame on every call (quadratic in the
# number of tokens) and no longer exists in pandas >= 2.0.
token_records = []

start_time = time.time()
for i, sentence in enumerate(sentences):
    print(f'\rSentence: {i+1}/{num_sentences} \t Elapsed seconds: {int(time.time()-start_time)}', end='')

    doc = nlp(sentence)

    for token in doc:
        token_records.append({'word': token.text,
                              'lemma': token.lemma_,
                              'pos': token.pos_,
                              'tag': token.tag_,
                              'dep': token.dep_,
                              'shape': token.shape_,
                              'is_alpha': int(token.is_alpha),
                              'is_stop': int(token.is_stop)})

# Passing `columns` keeps the expected schema even when no tokens were produced.
features = pd.DataFrame(token_records, columns=feature_columns)

features

Sentence: 3644/3644 	 Elapsed seconds: 405

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Chapter,chapter,NOUN,NN,ROOT,Xxxxx,1,0
1,1,1,NUM,CD,nummod,d,0,0
2,.,.,PUNCT,.,punct,.,0,0
3,Mr.,Mr.,PROPN,NNP,compound,Xx.,0,0
4,Sherlock,Sherlock,PROPN,NNP,compound,Xxxxx,1,0
...,...,...,...,...,...,...,...,...
67338,slopes,slope,NOUN,NNS,pobj,xxxx,1,0
67339,of,of,ADP,IN,prep,xx,1,1
67340,the,the,DET,DT,det,xxx,1,1
67341,moor,moor,NOUN,NN,pobj,xxxx,1,0


## Adjust the feature set to conform with the training format

In [12]:
# Keep an untouched copy of the spaCy output so the alignment cell that follows
# (which starts from `saved_features.copy()`) can be re-run from a clean state
# without repeating the slow feature extraction.
saved_features = features.copy()
# Optional: persist the raw per-sentence features to disk.
# features.to_csv('features_per_sentence_train_data.csv', index=False)

In [181]:
# Start from the untouched spaCy output so this cell can be re-run safely.
# The index is renumbered so that labels and positions coincide in the loop.
features = saved_features.copy().reset_index(drop=True)

# Numerals "0".."99": a '.' directly after one of these is dropped below.
SMALL_NUMERALS = frozenset(str(integer) for integer in range(100))


def dot_line(i):
    """Return a one-row feature frame describing a '.' token, indexed at ``i``.

    Not used by the alignment loop in this cell; kept for any other cell that
    may rely on it.
    """
    return pd.DataFrame({'word': '.', 'lemma': '.', 'pos': 'PUNCT', 'tag': '.',
                         'dep': 'punct', 'shape': '.', 'is_alpha': 0, 'is_stop': 0}, index=[i])


def _is_positive_int(text):
    """Return True when ``text`` parses as an integer greater than zero."""
    try:
        return int(text) > 0
    except (TypeError, ValueError):
        # Not a number (or no token at all): simply not a match.
        return False


def _drop_rows(frame, labels):
    """Drop ``labels`` from ``frame`` and renumber the index from zero."""
    return frame.drop(labels).reset_index(drop=True)


# Walk backwards so that dropping rows at or after position i never shifts the
# rows that are still to be visited. Rows 0 and 1 are not visited.
for i in range(len(features) - 1, 1, -1):
    if i >= len(features):
        continue

    word = features.loc[i, 'word']
    # Look-ahead tokens; None when the frame ends before them.
    next_word = features.loc[i + 1, 'word'] if i + 1 < len(features) else None
    after_next = features.loc[i + 2, 'word'] if i + 2 < len(features) else None

    if word == '`' and next_word == '`':
        # Two single backticks -> one '``' token.
        features.loc[i, 'word'] = '``'
        features.loc[i, 'lemma'] = '``'
        features = _drop_rows(features, i + 1)

    elif word == '.' and features.loc[i - 1, 'word'] in SMALL_NUMERALS:
        # '.' following a small numeral: remove the separate '.' row.
        features = _drop_rows(features, i)

    elif word == '-' and next_word is not None:
        # Hyphenated compound: fold "<prev> - <next>" into the previous row.
        features.loc[i - 1, 'word'] = features.loc[i - 1, 'word'] + word + next_word
        features = _drop_rows(features, [i, i + 1])

    elif word == "'" and next_word in ('ve', 'm'):
        # Contraction split into "'" + "ve"/"m": re-attach the apostrophe to
        # the following row and drop the apostrophe row. (The earlier code
        # appended the following word to itself, producing e.g. "veve".)
        features.loc[i + 1, 'word'] = word + next_word
        features.loc[i + 1, 'lemma'] = word + features.loc[i + 1, 'lemma']
        features = _drop_rows(features, i)

    elif word == 'No' and next_word == '.':
        # "No" + "." followed by a positive number -> single "No." token.
        if _is_positive_int(after_next):
            features.loc[i, 'word'] = 'No.'
            features = _drop_rows(features, i + 1)

    elif word == "'":
        # Apostrophe directly before a positive number: drop the apostrophe row.
        if _is_positive_int(next_word):
            features = _drop_rows(features, i)

features = features.reset_index(drop=True)

In [186]:
# Spot-check window: start position `x` and window length `y`.
# Both names are reused by the matching check on `data`.
x, y = 65445, 10
features.word.iloc[x:x + y]

65445    russet
65446    slopes
65447        of
65448       the
65449      moor
65450         .
Name: word, dtype: object

In [187]:
# Same positional window on the gold tokens, for side-by-side comparison.
data.word.iloc[x:x + y]

65445    russet
65446    slopes
65447        of
65448       the
65449      moor
65450         .
Name: word, dtype: object

In [188]:
# Row counts of the aligned feature table and the gold data (should match).
features.shape[0], data.shape[0]

(65451, 65451)

In [189]:
# Number of rows whose spaCy token text still differs from the gold token.
word_differs = features.word != data.word
word_differs.sum()

48

## Trigrams

In [11]:
# def get_Ngrams(sentences, n):

#     stop_words = stopwords.words('english')
#     c_vec = CountVectorizer(stop_words=stop_words, ngram_range=(n,n))

#     ngrams = c_vec.fit_transform(sentences)
#     count_values = ngrams.toarray().sum(axis=0)
#     vocabulary = c_vec.vocabulary_

#     df_ngram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocabulary.items()],
#                                    reverse=True)).rename(columns={0: 'frequency', 1: 'n-grams'})
#     return df_ngram

# bigrams = get_Ngrams(sentences, 2)
# trigrams = get_Ngrams(sentences, 3)

In [190]:
# Context window per token: previous word, the word itself, and the next word.
# The shift runs over the whole column, so windows are not reset at sentence
# boundaries.
previous_word = features.word.shift(1)
following_word = features.word.shift(-1)
features['trigram'] = previous_word + ' ' + features.word + ' ' + following_word

# The first and last rows lack a neighbour on one side (NaN after the shift),
# so each borrows the trigram of the adjacent row.
last_row = len(features) - 1
features.loc[0, 'trigram'] = features.trigram[1]
features.loc[last_row, 'trigram'] = features.trigram[last_row - 1]

In [191]:
# Display the feature table, which now includes the `trigram` column.
features

Unnamed: 0,word,lemma,pos,tag,dep,shape,is_alpha,is_stop,trigram
0,Chapter,chapter,NOUN,NN,ROOT,Xxxxx,1,0,Chapter 1 Mr.
1,1,1,NUM,CD,nummod,d,0,0,Chapter 1 Mr.
2,Mr.,Mr.,PROPN,NNP,compound,Xx.,0,0,1 Mr. Sherlock
3,Sherlock,Sherlock,PROPN,NNP,compound,Xxxxx,1,0,Mr. Sherlock Holmes
4,Holmes,Holmes,PROPN,NNP,ROOT,Xxxxx,1,0,Sherlock Holmes Mr.
...,...,...,...,...,...,...,...,...,...
65446,slopes,slope,NOUN,NNS,pobj,xxxx,1,0,russet slopes of
65447,of,of,ADP,IN,prep,xx,1,1,slopes of the
65448,the,the,DET,DT,det,xxx,1,1,of the moor
65449,moor,moor,NOUN,NN,pobj,xxxx,1,0,the moor .


In [198]:
# Copy the engineered features onto the gold data, row by row (pandas aligns
# on the index). Mapping: target column in `data` -> source column in `features`.
feature_sources = {
    'lemma': 'lemma',
    'pos': 'pos',
    'tag': 'tag',
    'dependency': 'dep',
    'shape_': 'shape',  # bracket lookup needed: `features.shape` is the frame's dimensions
    'is_alpha': 'is_alpha',
    'is_stop': 'is_stop',
    'trigram': 'trigram',
}
for target_column, source_column in feature_sources.items():
    data[target_column] = features[source_column]

# Identifiers first, then the features in mapping order, with the label last.
column_order = ['document', 'sentence', 'token', 'word', *feature_sources, 'negation']
data = data[column_order]

data

Unnamed: 0,document,sentence,token,word,lemma,pos,tag,dependency,shape_,is_alpha,is_stop,trigram,negation
0,baskervilles01,0,0,Chapter,chapter,NOUN,NN,ROOT,Xxxxx,1,0,Chapter 1 Mr.,O
1,baskervilles01,0,1,1.,1,NUM,CD,nummod,d,0,0,Chapter 1 Mr.,O
2,baskervilles01,0,2,Mr.,Mr.,PROPN,NNP,compound,Xx.,0,0,1 Mr. Sherlock,O
3,baskervilles01,0,3,Sherlock,Sherlock,PROPN,NNP,compound,Xxxxx,1,0,Mr. Sherlock Holmes,O
4,baskervilles01,0,4,Holmes,Holmes,PROPN,NNP,ROOT,Xxxxx,1,0,Sherlock Holmes Mr.,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65446,baskervilles14,270,58,slopes,slope,NOUN,NNS,pobj,xxxx,1,0,russet slopes of,O
65447,baskervilles14,270,59,of,of,ADP,IN,prep,xx,1,1,slopes of the,O
65448,baskervilles14,270,60,the,the,DET,DT,det,xxx,1,1,of the moor,O
65449,baskervilles14,270,61,moor,moor,NOUN,NN,pobj,xxxx,1,0,the moor .,O


In [199]:
# data.to_csv('train_data_features_alligned.csv', index=False)