In [105]:
# !pip install transformers

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
seed=42
# Assigning `seed` on its own seeds nothing: hand it to the RNGs of the
# numeric libraries imported above so repeated runs draw the same numbers.
np.random.seed(seed)
tf.random.set_seed(seed)

#set style for plots
sns.set_style("whitegrid")
# NOTE(review): no axes exist yet when this runs, so this call appears to only
# create the empty figure shown in the cell output - confirm it is still wanted.
sns.despine()
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names; use whichever this install provides.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 432x288 with 0 Axes>

In [2]:
# Mount Google Drive at /content/drive through the Colab helper module so the
# dataset files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Load the training split of the stance dataset.
# The path is relative, so it resolves against the current working directory
# (presumably /content in Colab, where the Drive mount lives - TODO confirm).
# NOTE(review): latin-1 decoding never raises, but it turns UTF-8 multi-byte
# characters into mojibake if the file is really UTF-8 - confirm the encoding.
df_train = pd.read_csv('drive/MyDrive/Dataset/StanceDataset/train.csv', encoding='latin-1', engine='python')
df_train

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos
...,...,...,...,...,...
2909,"There's a law protecting unborn eagles, but no...",Legalization of Abortion,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
2910,I am 1 in 3... I have had an abortion #Abortio...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,other
2911,How dare you say my sexual preference is a cho...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg
2912,"Equal rights for those 'born that way', no rig...",Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg


In [83]:
# Load the test split of the stance dataset with the same reader settings as
# the training split, then preview the first rows.
# NOTE(review): latin-1 decoding never raises, but it turns UTF-8 multi-byte
# characters into mojibake if the file is really UTF-8 - confirm the encoding.
df_test = pd.read_csv('drive/MyDrive/Dataset/StanceDataset/test.csv', encoding='latin-1', engine='python')
df_test.head()

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,He who exalts himself shall be humbled; a...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,other
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
3,#God is utterly powerless without Human interv...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,AGAINST,2. The tweet does NOT expresses opinion about ...,neg


In [84]:
# Count how many test tweets exist for each value of the Target column.
df_test['Target'].value_counts()

Donald Trump                        707
Hillary Clinton                     295
Feminist Movement                   285
Legalization of Abortion            280
Atheism                             220
Climate Change is a Real Concern    169
Name: Target, dtype: int64

In [85]:
# Keep only the rows whose Target is 'Donald Trump'.
# .copy() makes the result an independent frame: new columns are assigned to
# df_test further down, and assigning into a bare .loc slice is what triggers
# pandas' SettingWithCopyWarning.
df_test = df_test.loc[df_test['Target'].isin(['Donald Trump'])].copy()
# , 'Hillary Clinton'

In [86]:
# df_test = df_test.loc[df_test['Target'] == 'Hillary Clinton']

In [87]:
# Renumber the rows 0..n-1 and discard the old index labels. Reassigning the
# result (instead of inplace=True) leaves the previous frame object untouched
# and keeps the step chainable.
df_test = df_test.reset_index(drop=True)

In [88]:
# Display the filtered, re-indexed test frame.
df_test

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,@2014voteblue @ChrisJZullo blindly supporting ...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,neg
1,@ThePimpernelX @Cameron_Gray @CalebHowe Total...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,pos
2,@JeffYoung @ThePatriot143 I fully support full...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,pos
3,@ABC Stupid is as stupid does! Showedhis true ...,Donald Trump,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@HouseGOP we now have one political party. The...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,neg
...,...,...,...,...,...
702,@realDonaldTrump we all want you as the next p...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
703,@RSherman_25 Join Twitter Trump brigade #oneth...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
704,@JoeyBats19 Join Twitter Trump brigade #onetho...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
705,Trump's outlandish statements is political str...,Donald Trump,AGAINST,1. The tweet explicitly expresses opinion abo...,neg


In [89]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS

#Clean emojis from text
# Compiled once when the cell runs instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "]+",
    flags=re.UNICODE,
)

def strip_emoji(text):
    """Return ``text`` with every run of emoji code points removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are treated as
    emoji; surrounding characters (including whitespace) are left untouched,
    so removing an emoji between two spaces leaves both spaces in place.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub('', text)

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    """Lowercase ``text`` and strip line breaks, mentions, links, non-ASCII and punctuation.

    Steps, in order:
      1. carriage returns are deleted, newlines become spaces, text is lowercased;
      2. every token starting with ``@`` or ``http://`` / ``https://`` is removed
         up to the next whitespace character;
      3. every character outside the 7-bit ASCII range is removed;
      4. every character of ``string.punctuation`` is removed. That set includes
         ``#``, ``_``, ``$`` and ``&``, so none of them survive this function.

    Whitespace left behind by removed tokens is not collapsed here.
    """
    # Line-break handling and case folding.
    lowered = text.replace('\r', '').replace('\n', ' ').lower()
    # Mentions and http(s) links, each consumed up to the next whitespace.
    without_refs = re.sub(r"(?:\@|https?\://)\S+", "", lowered)
    # Keep only characters in the \x00-\x7f range.
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_refs)
    # Deletion table: ASCII punctuation plus a handful of accented/symbol characters.
    banned_list = string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    return ascii_only.translate(str.maketrans('', '', banned_list))

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
# Raw strings are required here: in a normal string literal "\b" is the
# backspace character (not a regex word boundary), and "\w" / "\s" are invalid
# escape sequences that newer Python versions warn about.
_TRAILING_HASHTAG_RE = re.compile(r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)')
_HASH_OR_UNDERSCORE_RE = re.compile(r'#|_')

def clean_hashtags(tweet):
    """Drop the trailing run of hashtags and un-hash the remaining ones.

    A hashtag is removed entirely when only other hashtags and whitespace
    follow it up to the end of the tweet; the literal tag ``#hashtag`` is
    exempt from that removal. Afterwards every remaining ``#`` and ``_`` is
    replaced by a single space boundary (the pieces around it are stripped and
    re-joined with one space).

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet without trailing hashtags and without ``#`` / ``_`` characters.
    """
    # Pass 1: cut out the hashtags that close the tweet.
    without_trailing = " ".join(
        piece.strip() for piece in _TRAILING_HASHTAG_RE.split(tweet)
    )
    # Pass 2: split on the remaining '#' and '_' so only the words are kept.
    return " ".join(
        piece.strip() for piece in _HASH_OR_UNDERSCORE_RE.split(without_trailing)
    )

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated token of ``a`` that contains ``$`` or ``&``.

    Offending tokens are replaced by empty strings rather than dropped, so the
    spaces around them are preserved and the result may contain runs of spaces.
    """
    kept_tokens = [
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    ]
    return ' '.join(kept_tokens)

def remove_mult_spaces(text): # remove multiple spaces
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    unchanged. The pattern is a raw string so that ``\\s`` reaches the regex
    engine without relying on Python's handling of invalid escape sequences.
    """
    return re.sub(r"\s\s+", " ", text)

In [90]:
# Run every test tweet through the cleaning helpers, innermost call first:
# strip_emoji -> strip_all_entities -> clean_hashtags -> filter_chars -> remove_mult_spaces.
# NOTE(review): strip_all_entities deletes all of string.punctuation, which
# contains '#', '_', '$' and '&', so by the time clean_hashtags and filter_chars
# run there is nothing left for them to match - confirm the intended order.
texts_new_test = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(strip_emoji(tweet)))))
    for tweet in df_test['Tweet']
]

In [91]:
# Store the cleaned tweets next to the originals; the list is in row order.
df_test['clean_text'] = texts_new_test

In [8]:
# Installs a pinned version of the `emoji` package into the runtime.
# NOTE(review): no visible cell imports `emoji`, and this install sits in the
# middle of the notebook - confirm it is still needed and consider moving it
# to a setup cell at the top.
!pip install emoji==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [92]:
# Load the pretrained stance-classification checkpoint and its tokenizer.
from transformers import AutoModel, AutoTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Choose GPU if available
# NOTE(review): `device` is not referenced by any visible cell - neither the
# model nor the encoded inputs are moved to it, so inference presumably runs
# on the default device. Confirm whether a `.to(device)` step is missing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Select model path here (Hugging Face Hub identifier of the checkpoint)
# pretrained_LM_path = "kornosk/polibertweet-mlm"
pretrained_LM_path = "kornosk/bert-election2020-twitter-stance-trump-KE-MLM"

# Load model: tokenizer and sequence-classification head from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_LM_path)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_LM_path)

In [94]:
# Class index -> stance label, used to decode the classifier's argmax.
# NOTE(review): assumes this order matches the checkpoint's own label ids -
# confirm against model.config.id2label.
id2label = dict(enumerate(("AGAINST", "FAVOR", "NONE")))

In [95]:
# Tokenize each cleaned tweet separately (one single-row encoding per tweet)
# and keep the resulting PyTorch-tensor encodings in a new column.
df_test['encoded_tweet'] = df_test['clean_text'].apply(
    lambda clean_tweet: tokenizer(clean_tweet, return_tensors='pt')
)

In [96]:
# Create the prediction column with an empty-string placeholder in every row;
# the real labels are written by the prediction cell below.
df_test['predicted'] = ''

In [97]:
from scipy.special import softmax

In [99]:
# Classify every encoded tweet and collect the predicted stance labels.
predicted_labels = []
# Inference only: no_grad() stops autograd from recording a graph for each
# forward pass, which saves memory and time.
with torch.no_grad():
    # Iterate over the encodings directly instead of looking them up by label,
    # so the loop does not depend on the frame having a 0..n-1 index.
    for enc in df_test['encoded_tweet']:
        outputs = model(**enc)
        # outputs[0] is presumably the logits tensor of shape (1, num_labels);
        # softmax over dim 1 turns the single row into class probabilities.
        predicted_probability = torch.softmax(outputs[0], dim=1)[0]
        predicted_labels.append(id2label[int(torch.argmax(predicted_probability))])
# One whole-column assignment replaces the chained
# `df_test['predicted'].iloc[idx] = ...` writes, which raise
# SettingWithCopyWarning and are not guaranteed to reach the original frame.
df_test['predicted'] = predicted_labels

In [100]:
# Display the test frame with the predicted label next to the gold Stance.
df_test

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment,clean_text,encoded_tweet,predicted
0,@2014voteblue @ChrisJZullo blindly supporting ...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,neg,blindly supporting political parties is willfu...,"[input_ids, token_type_ids, attention_mask]",NONE
1,@ThePimpernelX @Cameron_Gray @CalebHowe Total...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,pos,totally agree time to get out of the twitter p...,"[input_ids, token_type_ids, attention_mask]",NONE
2,@JeffYoung @ThePatriot143 I fully support full...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,pos,i fully support full repeal i will work for ca...,"[input_ids, token_type_ids, attention_mask]",NONE
3,@ABC Stupid is as stupid does! Showedhis true ...,Donald Trump,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,stupid is as stupid does showedhis true colors...,"[input_ids, token_type_ids, attention_mask]",NONE
4,@HouseGOP we now have one political party. The...,Donald Trump,NONE,2. The tweet does NOT expresses opinion about ...,neg,we now have one political party the socialist ...,"[input_ids, token_type_ids, attention_mask]",NONE
...,...,...,...,...,...,...,...,...
702,@realDonaldTrump we all want you as the next p...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos,we all want you as the next president you tell...,"[input_ids, token_type_ids, attention_mask]",AGAINST
703,@RSherman_25 Join Twitter Trump brigade #oneth...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos,join twitter trump brigade onethousandtweets t...,"[input_ids, token_type_ids, attention_mask]",NONE
704,@JoeyBats19 Join Twitter Trump brigade #onetho...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos,join twitter trump brigade onethousandtweets t...,"[input_ids, token_type_ids, attention_mask]",NONE
705,Trump's outlandish statements is political str...,Donald Trump,AGAINST,1. The tweet explicitly expresses opinion abo...,neg,trumps outlandish statements is political stra...,"[input_ids, token_type_ids, attention_mask]",AGAINST


In [101]:
from sklearn.metrics import accuracy_score

# Fraction of test tweets whose predicted label equals the gold Stance label.
acc = accuracy_score(y_true=df_test['Stance'], y_pred=df_test['predicted'])
print(acc)

0.43564356435643564
