In [1]:
import os
import time
import datetime
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer, TFBertForSequenceClassification,AutoConfig
import tensorflow as tf

In [2]:
# Reset the global state Keras has accumulated in this kernel before the rest
# of the notebook runs.
tf.keras.backend.clear_session()

In [3]:
# Seed every random number generator this notebook relies on.
seed_value = 56
# 1. Set the `PYTHONHASHSEED` environment variable (to the constant 0, not to `seed_value`)
# NOTE(review): the running interpreter presumably fixed its hash seed at
# start-up, so this likely only affects processes launched later - confirm.
os.environ['PYTHONHASHSEED']=str(0)
# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)
# (alternative spelling through the compat API: tf.compat.v1.set_random_seed(seed_value))
# 5. Configure a new global `tensorflow` session restricted to one intra-op
#    and one inter-op thread
from keras import backend as K
# The TF1 spelling (tf.ConfigProto / tf.Session / K.set_session) is replaced
# below by its tf.compat.v1 equivalent.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# Register the session with the compat Keras backend.
tf.compat.v1.keras.backend.set_session(sess)




In [4]:
# All data files are looked up next to the notebook's working directory.
current_path = os.getcwd()


def _in_cwd(filename):
    """Return ``filename`` joined onto ``current_path``."""
    return os.path.join(current_path, filename)


# Raw inputs.
test_data_path = _in_cwd('test_data.txt')
row_dev_posts_path = _in_cwd('posts.txt')
row_test_posts_path = _in_cwd('posts_groundtruth.txt')

# Per-split CSV files.
train_ekphrasis_path = _in_cwd('train_ekphrasis.csv')
dev_ekphrasis_path = _in_cwd('dev_ekphrasis.csv')
test_ekphrasis_path = _in_cwd('test_ekphrasis.csv')

# Processed per-split CSV files (the ones the loading cells read).
processed_train_ekphrasis_path = _in_cwd('pro_train_ekphrasis.csv')
processed_dev_ekphrasis_path = _in_cwd('pro_dev_ekphrasis.csv')
processed_test_ekphrasis_path = _in_cwd('pro_test_ekphrasis.csv')

Read Data

In [6]:
# Load the processed training CSV and summarise it.
df_train = pd.read_csv(processed_train_ekphrasis_path)
print(df_train.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df_train.info()

display(df_train.head())

(14066, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14066 entries, 0 to 14065
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_id       14066 non-null  int64 
 1   post_text     14066 non-null  object
 2   user_id       14066 non-null  int64 
 3   image_id(s)   14066 non-null  object
 4   username      14066 non-null  object
 5   timestamp     14066 non-null  object
 6   label         14066 non-null  object
 7   cleaned_text  14066 non-null  object
 8   enc_label     14066 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 989.1+ KB
None


Unnamed: 0,post_id,post_text,user_id,image_id(s),username,timestamp,label,cleaned_text,enc_label
0,263528431088721921,Hurricane SANDY-CAT . 🐱 Meow ~ http://t.co/dj7...,204337678,sandy_fake_57,ImObiWanKenobi,Wed Oct 31 06:30:48 +0000 2012,fake,Hurricane SANDY - CAT . 🐱 Meow ~ <url>,1
1,263493828349145088,Dicen que despues de la tormenta llega la calm...,619474112,sandy_real_114,1UniikeDiiva,Wed Oct 31 04:13:18 +0000 2012,real,They say that after the storm comes the calm !...,0
2,263060464571912192,a SHARK swam up to brigantine.. \n#hurricanesa...,19117875,sandy_fake_26,aimee_sh,Mon Oct 29 23:31:16 +0000 2012,fake,a SHARK swam up to brigantine . . #hurricanesa...,1
3,262961670551400448,Tomb of the Unknown Soldier during Sandy. #muc...,67930605,sandy_fake_63,thuntley11,Mon Oct 29 16:58:42 +0000 2012,fake,Tomb of the Unknown Soldier during Sandy . #mu...,1
4,263086042549194752,Manhattan holy shit #hurricanesandy http://t.c...,263461477,sandy_real_105,CatieBoel,Tue Oct 30 01:12:55 +0000 2012,real,Manhattan holy shit #hurricanesandy <url>,0


In [7]:
# Load the processed dev (validation) CSV and summarise it.
df_val = pd.read_csv(processed_dev_ekphrasis_path)
print(df_val.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df_val.info()

display(df_val.head())

(1563, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1563 entries, 0 to 1562
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_id       1563 non-null   int64 
 1   post_text     1563 non-null   object
 2   user_id       1563 non-null   int64 
 3   image_id(s)   1563 non-null   object
 4   username      1563 non-null   object
 5   timestamp     1563 non-null   object
 6   label         1563 non-null   object
 7   cleaned_text  1563 non-null   object
 8   enc_label     1563 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 110.0+ KB
None


Unnamed: 0,post_id,post_text,user_id,image_id(s),username,timestamp,label,cleaned_text,enc_label
0,263021327630340096,#sandy http://t.co/Ub6QRUcl,271599347,sandy_fake_39,OscarKennish,Mon Oct 29 20:55:45 +0000 2012,fake,#sandy <url>,1
1,263062792662286337,Hayyyy nanita!!!! EN VIVO #SANDY / ESTATUA DE ...,174872551,sandy_fake_21,Betodiaz79,Mon Oct 29 23:40:31 +0000 2012,fake,Hayyyy nanita ! ! ! ! LIVE #SANDY / STATUE OF ...,1
2,264289224314994688,So shines a good deed in a naughty world #Sand...,10836092,sandy_real_71,joehas,Fri Nov 02 08:53:56 +0000 2012,real,So shines a good deed in a naughty world #Sand...,0
3,263015954211016705,Tomb of the Unknown Soldier continues to stay ...,562635567,sandy_fake_16,bbarnessoccer95,Mon Oct 29 20:34:24 +0000 2012,fake,Tomb of the Unknown Soldier continues to stay ...,1
4,532632521523998720,Young boy demonstrates heroic virtue.--SYRIAN ...,795540624,syrianboy_1,Deborah41006526,Wed Nov 12 20:34:23 +0000 2014,fake,Young boy demonstrates heroic virtue . - - SYR...,1


In [8]:
# Load the processed test CSV and summarise it.
df_test = pd.read_csv(processed_test_ekphrasis_path)
print(df_test.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df_test.info()

display(df_test.head())

(2177, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2177 entries, 0 to 2176
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   post_id       2177 non-null   int64  
 1   post_text     2177 non-null   object 
 2   user_id       2177 non-null   float64
 3   username      2177 non-null   object 
 4   image_id      2177 non-null   object 
 5   timestamp     2177 non-null   object 
 6   label         2177 non-null   object 
 7   cleaned_text  2177 non-null   object 
 8   enc_label     2177 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 153.2+ KB
None


Unnamed: 0,post_id,post_text,user_id,username,image_id,timestamp,label,cleaned_text,enc_label
0,651118294447951872,"#AntiTerror ""#ArabianBlood"" #RussianFederation...",383409700.0,AlexArtAndros,airstrikes_1,Mon Oct 05 19:34:33 +0000 2015,fake,"#AntiTerror "" #ArabianBlood "" #RussianFederati...",1
1,651115824065830912,"#АнтиТеррор ""#АрабскаяКровь"" #РФ в #Сирии прот...",383409700.0,AlexArtAndros,airstrikes_1,Mon Oct 05 19:24:44 +0000 2015,fake,"#Anti-Terror "" #ArabBlood "" #RF in #Syria agai...",1
2,651095856662360065,"Ну чё, сегодня надавали трендюлей игиловцам по...",2712310000.0,NataYaraya,airstrikes_1,Mon Oct 05 18:05:23 +0000 2015,fake,"Well , today they gave tryndula to the Igilovi...",1
3,651086828234104832,Действия России в Сирии безукоризненны. Видео...,36905720.0,Alltecz,airstrikes_1,Mon Oct 05 17:29:31 +0000 2015,fake,Russia ' s actions in Syria are irreproachable...,1
4,651034616007106560,5-10-2015\nThe airstrike against an ISIS ammun...,1070959000.0,msojormsojor,airstrikes_1,Mon Oct 05 14:02:02 +0000 2015,fake,5-10-2015 The airstrike against an ISIS ammuni...,1


In [9]:
# Pull the cleaned text and the integer-encoded label out of each split as
# NumPy arrays.
train_sent, train_labels = df_train['cleaned_text'].values, df_train['enc_label'].values
val_sent, val_labels = df_val['cleaned_text'].values, df_val['enc_label'].values
test_sent, test_labels = df_test['cleaned_text'].values, df_test['enc_label'].values

In [10]:
# BERTweet tokens
import re


def to_bertweet_tokens(sentences):
    """Return a new object array with the URL/user placeholders rewritten.

    Every ``<url>`` becomes ``HTTPURL`` and every ``<user>`` becomes ``@USER``.
    The input is left untouched: the arrays passed in come from ``.values`` on
    DataFrame columns, so writing into them element by element would also
    rewrite the DataFrames (and fails outright if the array is read-only).

    Parameters
    ----------
    sentences : iterable of str
        Texts to convert.

    Returns
    -------
    numpy.ndarray
        1-D ``object`` array with one converted string per input text.
    """
    converted = [
        re.sub(r'<user>', '@USER', re.sub(r'<url>', 'HTTPURL', sentence))
        for sentence in sentences
    ]
    return np.array(converted, dtype=object)


# Rebind each split to its converted copy.
train_sent = to_bertweet_tokens(train_sent)
val_sent = to_bertweet_tokens(val_sent)
test_sent = to_bertweet_tokens(test_sent)

In [11]:
# Fold the dev split into the training data (sentences and labels alike).
# Note: this rebinds train_sent/train_labels, so re-running the cell appends
# the dev split again.
train_sent = np.concatenate((train_sent, val_sent))
train_labels = np.concatenate((train_labels, val_labels))