In [77]:
import pandas as pd
import numpy as np
import re
import nltk
import re, nltk
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('punkt')
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
import tensorflow_hub as hub
import dill

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\meet1\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meet1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
# Sentence-BERT encoder; its `encode` method is handed to create_embeddings
# in the cell that writes the *_instance_bert.csv files.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# Prefix prepended to every 'Lex/...' lexicon path opened by the feature functions.
base = '../'

In [7]:
# Shaney Features

def mark_negation(sentence):
    """Count the tokens of `sentence` that lie inside a negation scope.

    The sentence is tokenized with `word_tokenize`. A scope opens at the first
    token matching the negation pattern and closes at the next token that is
    exactly one of ``. : ; ! ?``. Every token seen while a scope is open is
    counted, including further negation words; the negation word that opens
    the scope and the punctuation mark that closes it are not counted.

    Returns the number of in-scope tokens as an int.
    """
    negation_pattern = re.compile(
        r"""(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt
        |dont|doesnt|didnt|isnt|arent|aint)$)|n't """,
        re.VERBOSE,
    )
    clause_end = re.compile(r"^[.:;!?]$")

    in_scope = False
    tagged = 0
    for token in word_tokenize(sentence):
        is_negator = negation_pattern.search(token) is not None
        if not in_scope:
            # Only a negation word can open a scope; the opener is not counted.
            in_scope = is_negator
            continue
        if not is_negator and clause_end.search(token):
            # Clause-ending punctuation closes the scope and is not counted.
            in_scope = False
        else:
            tagged += 1
    return tagged

# Bing Liu lexicon: number of positive and negative words in a tweet.
# Parsed word lists keyed by file path, so each lexicon file is read only once
# per kernel session (restart the kernel to pick up edits to the files).
_BING_LIU_WORDS = {}


def _bing_liu_word_set(path):
    """Return the word list stored at `path` as a set of stripped lines."""
    if path not in _BING_LIU_WORDS:
        with open(path) as file:
            entries = (line.strip() for line in file)
            # Blank lines and ';' lines are skipped.
            # NOTE(review): assumes header/comment lines in the lexicon files
            # start with ';' -- confirm against the local copies.
            _BING_LIU_WORDS[path] = {
                entry for entry in entries if entry and not entry.startswith(';')
            }
    return _BING_LIU_WORDS[path]


def bing_lui(tweet):
    """Count positive and negative lexicon words in `tweet`.

    The tweet is split on whitespace and every token is tested for exact,
    case-sensitive membership in the positive and negative word lists.
    (The previous implementation tested `word in <whole file text>`, a
    substring check, so short tokens such as "a" or "the" and any fragment of
    a lexicon or header word were counted as sentiment words.)

    Parameters
    ----------
    tweet : str
        Text to score.

    Returns
    -------
    list
        ``[positive_count, negative_count, positive_count - negative_count]``.
    """
    tokens = tweet.split()
    positive_lexicon = _bing_liu_word_set(base + 'Lex/bing lui lexicon/positive-words.txt')
    negative_lexicon = _bing_liu_word_set(base + 'Lex/bing lui lexicon/negative-words.txt')

    positive_words = sum(1 for word in tokens if word in positive_lexicon)
    negative_words = sum(1 for word in tokens if word in negative_lexicon)
    return [positive_words, negative_words, positive_words - negative_words]

# Shared analyzer instance, created on first use. Building a new
# SentimentIntensityAnalyzer for every sentence repeated the same setup work
# once per stance row.
_VADER_ANALYZER = None


def polarity(sentence) :
    """Return the VADER polarity scores of `sentence`.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    list
        ``[pos, neg, neu]`` taken from ``polarity_scores(sentence)``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    pol = _VADER_ANALYZER.polarity_scores(sentence)
    return [pol['pos'], pol['neg'], pol['neu']]

def get_shaney_features(sentence):
    """Assemble the lexicon/sentiment features for one sentence as a single row.

    Column order: the mark_negation count, then the values returned by
    bing_lui, then the values returned by polarity.

    Returns a 2-D NumPy array with exactly one row.
    """
    features = [mark_negation(sentence)]
    features.extend(bing_lui(sentence))
    features.extend(polarity(sentence))
    return np.asarray(features).reshape(1, -1)

In [23]:
# Sriza Features

"""##No of URLS"""

def no_of_URLs(listoftweets):
    """Count URL-like substrings in each tweet.

    A match is any run of non-whitespace characters starting with ``www.`` or
    with ``http://`` / ``https://``.

    Returns a 1-D NumPy array holding one count per tweet, in input order.
    """
    url_pattern = re.compile(r'((www\.[\S]+)|(https?:\/\/[\S]+))')
    return np.array([len(url_pattern.findall(text)) for text in listoftweets])


"""##No of Hashtags"""

def no_of_hashtags(listoftweets):
    """Count hashtags (``#`` followed by at least one word character) per tweet.

    Returns a 1-D NumPy array holding one count per tweet, in input order.
    """
    hashtag_pattern = re.compile(r'#(\w+)')
    per_tweet = []
    for text in listoftweets:
        per_tweet.append(sum(1 for _ in hashtag_pattern.finditer(text)))
    return np.array(per_tweet)


"""##Lexicon sentiment of hastags"""

# Parsed lexicons keyed by file path, so the file is read only once per kernel
# session instead of once per call.
_HASHTAG_LEXICON_CACHE = {}


def _load_hashtag_lexicon(path):
    """Parse the lexicon at `path` into a ``{term: score}`` dict."""
    if path not in _HASHTAG_LEXICON_CACHE:
        scores = {}
        with open(path, 'r') as document:
            for line in document:
                fields = line.split()
                # Skip blank/short lines (these used to raise IndexError) and
                # entries whose term starts with '@'.
                if len(fields) < 2 or fields[0][0] == '@':
                    continue
                scores[fields[0]] = float(fields[1])
        _HASHTAG_LEXICON_CACHE[path] = scores
    return _HASHTAG_LEXICON_CACHE[path]


def aggregatepolarityscores_hashtags(listoftweets):
    """Sum the lexicon scores of the hashtags in each tweet.

    Each tweet is split on single spaces; every token that starts with ``#``
    contributes its lexicon score, or 0 when it is not in the lexicon.

    Fixes over the previous version:
    * the per-token score is no longer carried over from an earlier match, so
      an unknown hashtag adds 0 instead of repeating the last known score;
    * tokens of length 0 or 1 that are not hashtags are no longer looked up
      and summed.

    Parameters
    ----------
    listoftweets : iterable of str

    Returns
    -------
    numpy.ndarray
        1-D array with one summed score per tweet, in input order.
    """
    hashtagscore = _load_hashtag_lexicon(base + 'Lex/unigrams-pmilexiconNRC_HashtagsSentiment.txt')

    vector = []
    for tweet in listoftweets:
        total = 0.0
        for word in tweet.split(' '):
            if word.startswith('#'):
                total += hashtagscore.get(word, 0.0)
        vector.append(total)
    return np.array(vector)


"""##No of Emojis"""

def no_of_emojis(listoftweets):
    """Count, per tweet, the tokens listed in the emoticon lexicon.

    The lexicon file is read on every call; each of its lines is split on
    whitespace into a token followed by an integer score. Tweets are split on
    single spaces and a token counts when it is a key of the lexicon.

    Returns a 1-D NumPy array holding one count per tweet, in input order.
    """
    lexicon = {}
    with open(base + 'Lex/AffinnEmoticons.txt', 'r',encoding='UTF-8') as document:
        for entry in document:
            fields = entry.split()
            lexicon[fields[0]] = int(fields[1])

    return np.array([
        sum(1 for token in tweet.split(' ') if token in lexicon)
        for tweet in listoftweets
    ])

"""##Emoji Sentiment Average"""

def emoticons_score(listoftweets):
    """Sum, per tweet, the lexicon scores of the emoticon tokens it contains.

    The lexicon file is read on every call; each of its lines is split on
    whitespace into a token followed by an integer score. Tweets are split on
    single spaces; tokens absent from the lexicon contribute nothing.

    Returns a 1-D NumPy array holding one summed score per tweet, in input order.
    """
    with open(base + 'Lex/AffinnEmoticons.txt', 'r',encoding='UTF-8') as document:
        lexicon = {parts[0]: int(parts[1]) for parts in map(str.split, document)}

    totals = []
    for tweet in listoftweets:
        totals.append(sum(lexicon[token] for token in tweet.split(' ') if token in lexicon))
    return np.array(totals)

def getfeaturearray(listoftweets):
    """Compute the five tweet-level features and flatten them into one row.

    Per tweet the features are, in order: URL count, hashtag count, summed
    hashtag lexicon score, emoticon count and summed emoticon score. The
    per-tweet rows are laid end to end, so the result always has a single row
    (five values for a one-tweet list).

    Returns a 2-D NumPy array of shape ``(1, 5 * number_of_tweets)``.
    """
    extractors = (
        no_of_URLs,
        no_of_hashtags,
        aggregatepolarityscores_hashtags,
        no_of_emojis,
        emoticons_score,
    )
    # Each extractor yields a 1-D array; column_stack turns them into columns.
    per_tweet = np.column_stack([extract(listoftweets) for extract in extractors])
    return per_tweet.reshape((1, -1))

In [75]:
# Integer code stored in the label vector for each stance string.
y_dict = {'agree' : 0, 'disagree' : 1, 'discuss' : 2, 'unrelated' :3}

def create_embeddings(emdeb, train_body_filename, train_stance_filename) :
    """Build the feature matrix and label vector for a bodies/stances file pair.

    For every row of the stances file whose 'Body ID' also occurs in the
    bodies file, the value in the row's first column is concatenated with the
    article body, embedded with `emdeb`, and joined column-wise with the
    outputs of getfeaturearray and get_shaney_features (in that order).

    Fixes over the previous version:
    * bodies are matched on their actual 'Body ID' values; iterating the
      positional DataFrame index silently dropped every body whose ID was
      greater than or equal to the number of rows in the bodies file;
    * the matrix width is taken from the first feature row instead of being
      fixed at 512+12, so encoders with another output width work;
    * the encoder may return either an object with a ``.numpy()`` method or a
      plain array.

    Parameters
    ----------
    emdeb : callable
        Called with a one-element list of strings; must return the embedding
        as an array-like or as an object exposing ``.numpy()``.
    train_body_filename : str
        CSV file with 'Body ID' and 'articleBody' columns.
    train_stance_filename : str
        CSV file with a 'Body ID' column; the first column is used as the
        text and the last column as the stance label (a key of `y_dict`).

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        `X` has one row per matched stance row, `y` the matching `y_dict`
        codes as floats. Rows are grouped by body, in bodies-file order.
    """
    bodies = pd.read_csv(train_body_filename)
    stances = pd.read_csv(train_stance_filename)
    # As before, only the first row is used when a Body ID is repeated.
    bodies = bodies.drop_duplicates(subset='Body ID')

    X = None  # allocated once the width of the first feature row is known
    y = np.zeros(stances.shape[0])
    i = 0
    for body_id, body_text in zip(bodies['Body ID'], bodies['articleBody']):
        print(body_id, end=' ')  # progress indicator
        for row in stances.loc[stances['Body ID'] == body_id].values:
            # NOTE(review): text and body are joined without a separator, so
            # the last word of one fuses with the first word of the other --
            # confirm this is intended.
            full = row[0] + body_text
            encoded = emdeb([full])
            if hasattr(encoded, 'numpy'):
                encoded = encoded.numpy()
            e = np.asarray(encoded).reshape((1, -1))
            sh = get_shaney_features(full)
            sr = getfeaturearray([full])
            features = np.concatenate((e, sr, sh), axis=1)
            if X is None:
                X = np.zeros((stances.shape[0], features.shape[1]))
            X[i] = features
            y[i] = y_dict[row[-1]]
            i += 1
    if X is None:
        # Nothing matched: keep the historical 512+12 column count.
        X = np.zeros((stances.shape[0], 1*(512+12)))
    return X[:i], y[:i]

In [53]:
# Load the Universal Sentence Encoder (large, version 5) from TF Hub; `embed`
# is the encoder handed to create_embeddings for the *_instance_u.csv files.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

INFO:absl:Using C:\Users\meet1\AppData\Local\Temp\tfhub_modules to cache modules.


In [39]:
# Using Sentence-BERT (`sbert_model`, loaded above as 'bert-base-nli-mean-tokens').
# Builds the train and competition-test feature matrices and writes each to a
# CSV file with the label appended as the last column.
# NOTE(review): this cell was last executed before the cell that defines
# create_embeddings was last run -- re-run top to bottom to confirm it still
# works with the current definition.

train_X, train_y = create_embeddings(sbert_model.encode, '../train_bodies.txt', '../train_stances.txt')
np.savetxt('train_instance_bert.csv', np.concatenate((train_X, train_y.reshape((-1,1))), axis=1), delimiter=',')
test_X, test_y = create_embeddings(sbert_model.encode, '../competition_test_bodies.txt', '../competition_test_stances.txt')
np.savetxt('test_instance_bert.csv', np.concatenate((test_X, test_y.reshape((-1,1))), axis=1), delimiter=',')

In [78]:
# Using the Universal Sentence Encoder (`embed`).
# Builds the train and competition-test feature matrices and writes each to a
# CSV file with the label appended as the last column. Reuses the names
# train_X/train_y/test_X/test_y, overwriting the Sentence-BERT results held
# in memory.

train_X, train_y = create_embeddings(embed, '../train_bodies.txt', '../train_stances.txt')
np.savetxt('train_instance_u.csv', np.concatenate((train_X, train_y.reshape((-1,1))), axis=1), delimiter=',')
test_X, test_y = create_embeddings(embed, '../competition_test_bodies.txt', '../competition_test_stances.txt')
np.savetxt('test_instance_u.csv', np.concatenate((test_X, test_y.reshape((-1,1))), axis=1), delimiter=',')