In [1]:
import os
import sys

import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from nltk.tag.stanford import StanfordNERTagger
from transformers import BertTokenizer

# Load the raw annotation table from the working directory; the feature
# extraction cell below reads its `Word` column as the token list.
split_df=pd.read_csv('annotation_raw.csv')

In [2]:
# pos tag
def get_pos(tokens):
    """Return the part-of-speech tag of every token, in input order.

    The tokens are tagged in a single ``nltk.pos_tag`` call; only the tag
    (second element of each pair it returns) is kept.
    """
    tagged_pairs = nltk.pos_tag(tokens)
    return [pair[1] for pair in tagged_pairs]

# stop word
def get_stop_word(tokens):
    """Flag which tokens are English stop words.

    Args:
        tokens: iterable of tokens (strings).

    Returns:
        A list of booleans, one per token, ``True`` when the token appears in
        NLTK's English stop-word list.

    Note:
        Membership is tested on the token exactly as given; no case folding
        is applied here, so pass lower-cased tokens if case should not matter.
    """
    # A set gives O(1) membership per token instead of scanning the whole
    # stop-word list for every token.
    stop_word_set = set(stopwords.words('english'))
    return [token in stop_word_set for token in tokens]

# NER
# https://nlp.stanford.edu/software/CRF-NER.shtml
def get_ner(tokens, model_path=None, jar_path=None):
    """Tag every token with a named-entity label using Stanford NER.

    Args:
        tokens: sequence of tokens to tag.
        model_path: path to the serialized CRF classifier. Defaults to the
            7-class MUC model under ``stanford_ner``; the 3-class
            ``english.all.3class.distsim.crf.ser.gz`` model is a drop-in
            alternative.
        jar_path: path to ``stanford-ner.jar``. Defaults to the jar under
            ``stanford_ner``.

    Returns:
        A list with the label (second element of each pair returned by the
        tagger) for every tagged token.
    """
    # Build the default paths with os.path.join so they resolve on every
    # platform, not only where "\\" is the path separator.
    if model_path is None:
        model_path = os.path.join('stanford_ner', 'english.muc.7class.distsim.crf.ser.gz')
    if jar_path is None:
        jar_path = os.path.join('stanford_ner', 'stanford-ner.jar')

    tagger = StanfordNERTagger(model_path, jar_path, encoding='utf-8')
    return [pair[1] for pair in tagger.tag(tokens)]

# upper/lower case information, acronyms, punctuation marks, etc.
# % of upper case (or number)
def get_syntactic(tokens):
    """Return, per token, the fraction of characters unchanged by upper-casing.

    Upper-case letters, digits and punctuation all count as "unchanged", so
    the value is 1.0 for ``"ABC"`` or ``"12"`` and 0.0 for ``"abc"``.

    Args:
        tokens: iterable of strings.

    Returns:
        A list of floats in [0, 1], one per token. An empty token yields 0.0
        instead of dividing by zero.
    """
    def unchanged_fraction(token):
        if not token:
            return 0.0
        # Compare character by character: upper-casing the whole string can
        # change its length (e.g. "ß" -> "SS"), which would misalign a
        # position-wise comparison of the two strings.
        unchanged = sum(1 for ch in token if ch == ch.upper())
        return unchanged / len(token)

    return [unchanged_fraction(token) for token in tokens]

# word2vec or bert representation
def get_word2vec(tokens):
    """Look up the word2vec embedding of every token.

    Uses the module-level ``w2v_model``, which must be defined before this
    function is called.

    Args:
        tokens: iterable of tokens to look up.

    Returns:
        A 2-D ``numpy`` array with one row per token. Tokens missing from the
        model vocabulary get an all-zero row. Empty input yields an array of
        shape ``(0, dim)``.
    """
    keyed_vectors = w2v_model.wv
    # Size the out-of-vocabulary placeholder from the model itself; 100 is
    # the width that used to be hard-coded and is kept as a fallback.
    dim = getattr(keyed_vectors, 'vector_size', 100)

    vectors = []
    for token in tokens:
        try:
            vector = keyed_vectors.get_vector(token)
        except KeyError:
            # Only "token not in vocabulary" is expected here; any other
            # error is a real problem and should surface.
            vector = np.zeros(dim)
        vectors.append(vector)

    if not vectors:
        return np.zeros((0, dim))
    return np.array(vectors)

# bert representation
def load_bert(max_length=3, model_name="cambridgeltl/BioRedditBERT-uncased"):
    """Build a Keras model that maps token ids to BERT hidden states.

    Args:
        max_length: number of token ids per input row; must match the length
            the inputs are padded/truncated to before calling ``predict``.
        model_name: pretrained checkpoint passed to
            ``TFBertModel.from_pretrained``.

    Returns:
        A compiled ``tf.keras`` model whose single input is ``input_ids`` and
        whose output is the first element returned by the BERT model.
    """
    # `shape` must be a tuple: `(3)` is just the int 3, `(3,)` is a 1-tuple.
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    embedding = transformers.TFBertModel.from_pretrained(model_name)
    bert_outputs = embedding(input_ids)

    model = tf.keras.models.Model(inputs=[input_ids], outputs=bert_outputs[0])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )
    return model

def get_bert(tokens, max_length=3, model_name="cambridgeltl/BioRedditBERT-uncased"):
    """Embed every token with BERT and average over the sequence axis.

    Uses the module-level ``bert_model``, which must be defined before this
    function is called and must accept rows of ``max_length`` token ids.

    Args:
        tokens: iterable of tokens (strings); each is encoded on its own.
        max_length: length every encoded token is padded/truncated to,
            special tokens included.
        model_name: pretrained checkpoint used to load the tokenizer.

    Returns:
        A 2-D array with one row per token: the mean over axis 1 of what
        ``bert_model.predict`` returns. The mean covers every position,
        including special tokens and padding.
    """
    def tokenize_data(data):
        tokenizer = BertTokenizer.from_pretrained(model_name)
        encoded = tokenizer.batch_encode_plus(
            list(data),  # tokenizers expect a list/tuple, not e.g. a numpy array
            add_special_tokens=True,
            max_length=max_length,
            # Explicit replacements for the deprecated `pad_to_max_length=True`
            # and for the truncation that used to be enabled only implicitly.
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )
        # Only the token ids are fed to the model.
        return np.array(encoded["input_ids"], dtype="int32")

    token_ids = tokenize_data(tokens)
    hidden_states = bert_model.predict(token_ids)
    return hidden_states.mean(axis=1)

In [2]:
%%time
# Annotate every row of split_df with token-level features (POS tag,
# stop-word flag, case ratio, word2vec and BERT embeddings) and write the
# result to annotations.csv.
tokens = split_df.Word.values

split_df['pos'] = get_pos(tokens)
split_df['stop_word'] = get_stop_word(tokens)
# NER feature currently disabled:
# split_df['ner']=get_ner(tokens)
split_df['synt'] = get_syntactic(tokens)
tokens_lower = [token.lower() for token in tokens]


def _embedding_frame(matrix, prefix, index):
    """Wrap a 2-D embedding matrix as a frame with columns `<prefix>_1..<prefix>_n`."""
    column_names = ['%s_%s' % (prefix, i + 1) for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=column_names, index=index)


# embeddings
corpus = api.load('text8')
w2v_model = Word2Vec(corpus)  # time consuming
w2v_df = _embedding_frame(get_word2vec(tokens_lower), 'word_2vec', split_df.index)

bert_model = load_bert()
bert_df = _embedding_frame(get_bert(tokens), 'bert', split_df.index)

# Attach all embedding columns with a single concat instead of inserting them
# one at a time, which fragments the frame and is slow for hundreds of
# columns. Dropping same-named columns first keeps the cell re-runnable
# without producing duplicate columns.
embedding_df = pd.concat([w2v_df, bert_df], axis=1)
split_df = pd.concat(
    [split_df.drop(columns=embedding_df.columns, errors='ignore'), embedding_df],
    axis=1,
)

split_df.to_csv('annotations.csv', index=False)
