In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

## Reading Dataset:

In [2]:
tweets_tr = pd.read_csv('data/train/tweets.queries.tsv', sep='\t', header=0, index_col=0)
tweets_tr.sort_index(inplace=True)

tweets_dev = pd.read_csv('data/dev/tweets.queries.tsv', sep='\t', header=0, index_col=0)
tweets_dev.sort_index(inplace=True)

tweets_te = pd.read_csv('data/test_tweets.queries.tsv', sep='\t', header=0, index_col=0)
tweets_te.sort_index(inplace=True)

In [3]:
tweets_tr

Unnamed: 0,tweet_content
1,Trump needs to immediately divest from his bus...
2,A number of fraudulent text messages informing...
3,Fact check: The U.S. Army is NOT contacting an...
4,The US drone attack on #Soleimani caught on ca...
5,1. To the dim witted reporters like @dmedin11:...
...,...
994,Giants stars dare NFL to fine them for cleats ...
995,Fury over NFL's crackdown on player's 9/11 tri...
996,During a recent interview with Australian jour...
997,#BreakingNews: We’re launching an exciting new...


In [4]:
vclaims = pd.read_csv('data/verified_claims.docs.tsv', sep='\t', header=0, index_col=0)
vclaims.sort_index(inplace=True)

In [5]:
vclaims

Unnamed: 0,vclaim,title
0,122 detainees released from confinement at Gua...,Did 122 Prisoners Released from Guantanamo by ...
1,70 per cent of the persons arrested during pro...,70% of Arrested Charlotte Protesters Are Out-o...
2,"A ""Trump and Obama by the Numbers"" meme recoun...",Does This Meme Accurately Show ‘Trump and Obam...
3,"A ""large-scale killing"" of white farmers is ta...",Is a ‘Large-Scale Killing’ of White Farmers Un...
4,"A ""law to separate families"" was enacted prior...",Was the ‘Law to Separate Families’ Passed in 1...
...,...,...
10370,"“Slime,” a do-it-yourself gooey craft project ...",Does the “Slime” Craze Bring Serious Health Ri...
10371,“Sun tea” (tea brewed by being left to steep i...,Bacteria in Sun Tea Risk
10372,"“The Real Deal,” words of wisdom about gas, ge...",Red Thomas ‘Real Deal’ Letter
10373,“Valentine’s Day” worm.,Valentine’s Day Worm


In [6]:
qrels_tr = pd.read_csv('data/train/tweet-vclaim-pairs.qrels', sep='\t', 
                       header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])

qrels_dev = pd.read_csv('data/dev/tweet-vclaim-pairs.qrels', sep='\t', 
                       header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])

In [7]:
qrels_tr

Unnamed: 0,tweet_id,0,vclaim_id,relevance
0,1,0,394,1
1,2,0,670,1
2,3,0,670,1
3,4,0,141,1
4,5,0,83,1
...,...,...,...,...
796,994,0,652,1
797,995,0,652,1
798,996,0,778,1
799,997,0,579,1


## Pre-processing Raw Text:

In [8]:
stopwords = stopwords.words('english')
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
username_pattern = r"@[^\s]+"
hashtag_pattern = r"\B#\w\w+"
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

In [9]:
# function of preprocessing tweet
def preprocess_tweet(tweet, 
                     url_pattern=url_pattern, username_pattern=username_pattern, 
                     hashtag_pattern=hashtag_pattern, token_pattern=token_pattern, 
                     remove_url=True, remove_username=True, remove_hashtag=True,
                     stopwords=stopwords, with_stopwordsrm=True, with_stemming=True):
    # remove content after '—'
    tweet = tweet.split('—')[0]
    
    # remove url
    if remove_url == True:
        tweet = re.sub(url_pattern, "", tweet)
        
    # remove @username 
    if remove_username == True:
        tweet = re.sub(username_pattern, "", tweet)
        
    # remove #hashtag
    if remove_hashtag == True:
        tweet = re.sub(hashtag_pattern, "", tweet)
    
    # lower case 
    tweet_lower = tweet.lower()
    
    # tokenization 
    words = re.findall(token_pattern, tweet_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    tweet_processed = " ".join(words)
    
    return tweet_processed


In [10]:
# dict tweets_tr: tweet_id -> tweet_content
tweets_tr_prep = {}
for tweet_id in tweets_tr.index:
    tweets_tr_prep[tweet_id] = preprocess_tweet(tweets_tr.loc[tweet_id, 'tweet_content'])

# dict tweets_dev: tweet_id -> tweet_content
tweets_dev_prep = {}
for tweet_id in tweets_dev.index:
    tweets_dev_prep[tweet_id] = preprocess_tweet(tweets_dev.loc[tweet_id, 'tweet_content'])
    
# dict tweets_te: tweet_id -> tweet_content
tweets_te_prep = {}
for tweet_id in tweets_te.index:
    tweets_te_prep[tweet_id] = preprocess_tweet(tweets_te.loc[tweet_id, 'tweet_content'])

In [11]:
# function of preprocessing vclaim
def preprocess_text(text, token_pattern=token_pattern, stopwords=stopwords, 
                      with_stopwordsrm=True, with_stemming=True):
    # lower case 
    text_lower = text.lower()
    
    # tokenization 
    words = re.findall(token_pattern, text_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    text_processed = " ".join(words)
    
    return text_processed

In [12]:
# dict vclaim_prep: vlciam_id -> [vlciam_content, vclaim_title]
vclaims_prep = {}
for vclaim_id in vclaims.index:
    vclaims_prep[vclaim_id] = []
    vclaims_prep[vclaim_id].append(preprocess_text(vclaims.loc[vclaim_id, 'vclaim']))
    vclaims_prep[vclaim_id].append(preprocess_text(vclaims.loc[vclaim_id, 'title']))

## Computing Cosine Similarity:

In [13]:
# function of computing consine similarity 
def compute_cs(tweets, vclaims):
    vectorizer = TfidfVectorizer()
    vclaims_tfidf = vectorizer.fit_transform(vclaims)
    
    cosine_sims = {}
    for (tweet_id, tweet_content) in tweets.items():
        tweet_tfidf = vectorizer.transform([tweet_content])
        cs = cosine_similarity(tweet_tfidf, vclaims_tfidf).flatten()
        cosine_sims[tweet_id] = cs
    
    return cosine_sims


In [14]:
# computing Cosine Similarities for train and dev set
vclaims_contents = [vclaim[0] for vclaim in vclaims_prep.values()]
vclaims_titles = [vclaim[1] for vclaim in vclaims_prep.values()]

cs_tvc_tr = compute_cs(tweets_tr_prep, vclaims_contents)
cs_tvt_tr = compute_cs(tweets_tr_prep, vclaims_titles)

cs_tvc_dev = compute_cs(tweets_dev_prep, vclaims_contents)
cs_tvt_dev = compute_cs(tweets_dev_prep, vclaims_titles)

cs_tvc_te = compute_cs(tweets_te_prep, vclaims_contents)
cs_tvt_te = compute_cs(tweets_te_prep, vclaims_titles)

In [15]:
df_cs_tvc_tr = pd.DataFrame.from_dict(cs_tvc_tr, orient='index')
df_cs_tvt_tr = pd.DataFrame.from_dict(cs_tvt_tr, orient='index')

df_cs_tvc_dev = pd.DataFrame.from_dict(cs_tvc_dev, orient='index')
df_cs_tvt_dev = pd.DataFrame.from_dict(cs_tvt_dev, orient='index')

df_cs_tvc_te = pd.DataFrame.from_dict(cs_tvc_te, orient='index')
df_cs_tvt_te = pd.DataFrame.from_dict(cs_tvt_te, orient='index')

## Computing BM25 Score:

In [16]:
# function of computing BM25 score
def compute_bm25(tokenized_tweets, tokenized_vclaims):
    bm25kapi = BM25Okapi(tokenized_vclaims)
    
    bm25s = {}
    for (tweet_id, tweet_content) in tokenized_tweets.items():
        bm25s[tweet_id] = bm25kapi.get_scores(tweet_content)
        
    return bm25s


In [17]:
# tokenizing pre-processed text
tokenized_vclaims_contents = [vclaim[0].split(" ") for vclaim in vclaims_prep.values()]
tokenized_vclaims_titles = [vclaim[1].split(" ") for vclaim in vclaims_prep.values()]

tokenized_tweets_tr = {}
for (tweet_id, tweet_content) in tweets_tr_prep.items():
    tokenized_tweets_tr[tweet_id] = tweet_content.split(" ")
    
tokenized_tweets_dev = {}
for (tweet_id, tweet_content) in tweets_dev_prep.items():
    tokenized_tweets_dev[tweet_id] = tweet_content.split(" ")
    
tokenized_tweets_te = {}
for (tweet_id, tweet_content) in tweets_te_prep.items():
    tokenized_tweets_te[tweet_id] = tweet_content.split(" ")

In [18]:
# computing BM25 Scores for train and test set
bm25_tvc_tr = compute_bm25(tokenized_tweets_tr, tokenized_vclaims_contents)
bm25_tvt_tr = compute_bm25(tokenized_tweets_tr, tokenized_vclaims_titles)

bm25_tvc_dev = compute_bm25(tokenized_tweets_dev, tokenized_vclaims_contents)
bm25_tvt_dev = compute_bm25(tokenized_tweets_dev, tokenized_vclaims_titles)

bm25_tvc_te = compute_bm25(tokenized_tweets_te, tokenized_vclaims_contents)
bm25_tvt_te = compute_bm25(tokenized_tweets_te, tokenized_vclaims_titles)

In [19]:
df_bm25_tvc_tr = pd.DataFrame.from_dict(bm25_tvc_tr, orient='index')
df_bm25_tvt_tr = pd.DataFrame.from_dict(bm25_tvt_tr, orient='index')

df_bm25_tvc_dev = pd.DataFrame.from_dict(bm25_tvc_dev, orient='index')
df_bm25_tvt_dev = pd.DataFrame.from_dict(bm25_tvt_dev, orient='index')

df_bm25_tvc_te = pd.DataFrame.from_dict(bm25_tvc_te, orient='index')
df_bm25_tvt_te = pd.DataFrame.from_dict(bm25_tvt_te, orient='index')

## Constructing dataset with 4 features and labels:

In [20]:
data_tr_tids = []
for tweet_id in tweets_tr_prep.keys():
    data_tr_tids.extend((np.ones(vclaims.shape[0]) * tweet_id).tolist())
    
data_tr_vids = []
for i in range(tweets_tr.shape[0]):
    data_tr_vids.extend(list(vclaims_prep.keys()))
    
data_tr_labels = []
for tweet_id in tweets_tr_prep.keys():
    labels = np.zeros(vclaims.shape[0])
    if tweet_id in qrels_tr['tweet_id'].values:
        for index in qrels_tr[qrels_tr['tweet_id'] == tweet_id]['vclaim_id'].values:
            labels[index] = 1
    data_tr_labels.extend(labels)
    
data_tr = pd.DataFrame(columns=['tid', 'vid', 'cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt', 'label'])
data_tr['tid'] = data_tr_tids
data_tr['vid'] = data_tr_vids
data_tr['cs_tvc'] = df_cs_tvc_tr.values.flatten()
data_tr['cs_tvt'] = df_cs_tvt_tr.values.flatten()
data_tr['bm25_tvc'] = df_bm25_tvc_tr.values.flatten()
data_tr['bm25_tvt'] = df_bm25_tvt_tr.values.flatten()
data_tr['label'] = data_tr_labels

In [21]:
data_dev_tids = []
for tweet_id in tweets_dev_prep.keys():
    data_dev_tids.extend((np.ones(vclaims.shape[0]) * tweet_id).tolist())
    
data_dev_vids = []
for i in range(tweets_dev.shape[0]):
    data_dev_vids.extend(list(vclaims_prep.keys()))
    
data_dev_labels = []
for tweet_id in tweets_dev_prep.keys():
    labels = np.zeros(vclaims.shape[0])
    if tweet_id in qrels_dev['tweet_id'].values:
        for index in qrels_dev[qrels_dev['tweet_id'] == tweet_id]['vclaim_id'].values:
            labels[index] = 1
    data_dev_labels.extend(labels)
    
data_dev = pd.DataFrame(columns=['tid', 'vid', 'cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt', 'label'])
data_dev['tid'] = data_dev_tids
data_dev['vid'] = data_dev_vids
data_dev['cs_tvc'] = df_cs_tvc_dev.values.flatten()
data_dev['cs_tvt'] = df_cs_tvt_dev.values.flatten()
data_dev['bm25_tvc'] = df_bm25_tvc_dev.values.flatten()
data_dev['bm25_tvt'] = df_bm25_tvt_dev.values.flatten()
data_dev['label'] = data_dev_labels

In [22]:
data_te_tids = []
for tweet_id in tweets_te_prep.keys():
    data_te_tids.extend((np.ones(vclaims.shape[0]) * tweet_id).tolist())
    
data_te_vids = []
for i in range(tweets_te.shape[0]):
    data_te_vids.extend(list(vclaims_prep.keys()))
    
data_te = pd.DataFrame(columns=['tid', 'vid', 'cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt', 'label'])
data_te['tid'] = data_te_tids
data_te['vid'] = data_te_vids
data_te['cs_tvc'] = df_cs_tvc_te.values.flatten()
data_te['cs_tvt'] = df_cs_tvt_te.values.flatten()
data_te['bm25_tvc'] = df_bm25_tvc_te.values.flatten()
data_te['bm25_tvt'] = df_bm25_tvt_te.values.flatten()

## Linear SVC:

In [23]:
X_tr, y_tr = data_tr[['cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt']].values, data_tr['label'].values
X_dev, y_dev = data_dev[['cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt']].values, data_dev['label'].values
X_te = data_te[['cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt']].values

In [24]:
# computing class weights to handle data imbalance
class_weight = {}
for label in set(y_dev):
    class_weight[label] = np.sum(y_dev == label)

In [25]:
from sklearn.svm import LinearSVC

clf = LinearSVC(C=0.1, random_state=0, class_weight=class_weight, max_iter=1000).fit(X_tr, y_tr)



In [26]:
# generating result for dev data set
score_dev = np.sum(X_dev * clf.coef_, axis=1) + clf.intercept_

result_dev_score = pd.DataFrame(columns=['tid','vid', 'score'])
result_dev_score['tid'] = data_dev['tid']
result_dev_score['vid'] = data_dev['vid']
result_dev_score['score'] = score_dev

result_dev = pd.DataFrame(columns=['tweet_id', 'Q0', 'vclaim_id', 'rank', 'score', 'tag'])

tid = list(result_dev_score.groupby(by='tid').groups.keys())
for i in tid:
    idx = result_dev_score[result_dev_score['tid'] == i]['score'].idxmax()
    inf = result_dev_score.iloc[idx]
    result_dev = result_dev.append({'tweet_id':int(inf[0]), 'Q0':'Q0', 'vclaim_id':int(inf[1]), 
                                    'rank':1, 'score':inf[2], 'tag': 'COVID-19'}, ignore_index=True)

In [27]:
result_dev

Unnamed: 0,tweet_id,Q0,vclaim_id,rank,score,tag
0,0,Q0,2194,1,-0.949281,COVID-19
1,11,Q0,639,1,-0.897546,COVID-19
2,21,Q0,4318,1,-0.955595,COVID-19
3,28,Q0,219,1,-0.941585,COVID-19
4,31,Q0,1275,1,-0.965971,COVID-19
...,...,...,...,...,...,...
192,960,Q0,178,1,-0.845738,COVID-19
193,966,Q0,355,1,-0.809903,COVID-19
194,968,Q0,524,1,-0.877747,COVID-19
195,982,Q0,249,1,-0.957366,COVID-19


In [28]:
result_dev.to_csv('dev_set_results/golf_system_result_dev_1.csv', header=False, index=False, sep='\t')

In [29]:
# generating result for test data set
score_te = np.sum(X_te * clf.coef_, axis=1) + clf.intercept_

result_te_score = pd.DataFrame(columns=['tid','vid', 'score'])
result_te_score['tid'] = data_te['tid']
result_te_score['vid'] = data_te['vid']
result_te_score['score'] = score_te

result_te = pd.DataFrame(columns=['tweet_id', 'Q0', 'vclaim_id', 'rank', 'score', 'tag'])

tid = list(result_te_score.groupby(by='tid').groups.keys())
for i in tid:
    idx = result_te_score[result_te_score['tid'] == i]['score'].idxmax()
    inf = result_te_score.iloc[idx]
    result_te = result_te.append({'tweet_id':int(inf[0]), 'Q0':'Q0', 'vclaim_id':int(inf[1]), 
                                    'rank':1, 'score':inf[2], 'tag': 'COVID-19'}, ignore_index=True)

In [30]:
result_te

Unnamed: 0,tweet_id,Q0,vclaim_id,rank,score,tag
0,999,Q0,6094,1,-0.884481,COVID-19
1,1000,Q0,6094,1,-0.908192,COVID-19
2,1001,Q0,582,1,-0.915467,COVID-19
3,1002,Q0,8005,1,-0.911853,COVID-19
4,1003,Q0,8005,1,-0.914699,COVID-19
...,...,...,...,...,...,...
195,1194,Q0,9974,1,-0.978785,COVID-19
196,1195,Q0,6672,1,-0.980529,COVID-19
197,1196,Q0,8855,1,-0.967455,COVID-19
198,1197,Q0,5553,1,-0.897356,COVID-19


In [31]:
result_te.to_csv('test_set_results/golf_system_result_te_1.csv', header=False, index=False, sep='\t')

In [32]:
# python3 evaluate.py -s golf_system_result_dev_1.csv -g data/dev/tweet-vclaim-pairs.qrels