In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

## Reading Dataset:

In [2]:
tweets_tr = pd.read_csv('data/train/tweets.queries.tsv', sep='\t', header=0, index_col=0)
tweets_tr.sort_index(inplace=True)

tweets_te = pd.read_csv('data/dev/tweets.queries.tsv', sep='\t', header=0, index_col=0)
tweets_te.sort_index(inplace=True)

In [3]:
tweets_tr

Unnamed: 0,tweet_content
1,Trump needs to immediately divest from his bus...
2,A number of fraudulent text messages informing...
3,Fact check: The U.S. Army is NOT contacting an...
4,The US drone attack on #Soleimani caught on ca...
5,1. To the dim witted reporters like @dmedin11:...
...,...
997,@HillaryClinton Wikileaks exposed the fact tha...
998,WIKILEAKS BOMBSHELL: Hillary Clinton Smeared P...
999,Clown Mask To Be Banned In USA; Wearing One Co...
1001,"The snowfall this year is supposed to be ""reco..."


In [4]:
vclaims = pd.read_csv('data/verified_claims.docs.tsv', sep='\t', header=0, index_col=0)
vclaims.sort_index(inplace=True)

In [5]:
vclaims

Unnamed: 0,vclaim,title
0,122 detainees released from confinement at Gua...,Did 122 Prisoners Released from Guantanamo by ...
1,"A ""Trump and Obama by the Numbers"" meme recoun...",Does This Meme Accurately Show ‘Trump and Obam...
2,"A ""large-scale killing"" of white farmers is ta...",Is a ‘Large-Scale Killing’ of White Farmers Un...
3,"A ""law to separate families"" was enacted prior...",Was the ‘Law to Separate Families’ Passed in 1...
4,"A ""newly uncovered"" photograph reveals Alexand...",Does an Image Show Ocasio-Cortez Fake-Crying a...
...,...,...
779,WikiLeaks was caught by Newsweek fabricating e...,Newsweek Proves That WikiLeaks Is Leaking Phon...
780,"Wikileaks released a trove of ""deep state file...",Did WikiLeaks Release a Trove of ‘Deep State F...
781,Wombats are herding animals and inviting them ...,Are Wombats Inviting Animals Into Their Burrow...
782,YETI told the NRA the brand no longer wishes t...,Did YETI Brand Coolers Cut Ties with the NRA?


In [6]:
qrels_tr = pd.read_csv('data/train/tweet-vclaim-pairs.qrels', sep='\t', 
                       header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])

qrels_te = pd.read_csv('data/dev/tweet-vclaim-pairs.qrels', sep='\t', 
                       header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])

In [7]:
qrels_tr

Unnamed: 0,tweet_id,0,vclaim_id,relevance
0,1,0,395,1
1,2,0,669,1
2,3,0,669,1
3,4,0,144,1
4,5,0,84,1
...,...,...,...,...
799,997,0,197,1
800,998,0,462,1
801,999,0,260,1
802,1001,0,207,1


## Pre-processing Raw Text:

In [8]:
stopwords = stopwords.words('english')
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
username_pattern = r"@[^\s]+"
hashtag_pattern = r"\B#\w\w+"
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

In [9]:
# function of preprocessing tweet
def preprocess_tweet(tweet, 
                     url_pattern=url_pattern, username_pattern=username_pattern, 
                     hashtag_pattern=hashtag_pattern, token_pattern=token_pattern, 
                     remove_url=True, remove_username=True, remove_hashtag=True,
                     stopwords=stopwords, with_stopwordsrm=True, with_stemming=True):
    # remove content after '—'
    tweet = tweet.split('—')[0]
    
    # remove url
    if remove_url == True:
        tweet = re.sub(url_pattern, "", tweet)
        
    # remove @username 
    if remove_username == True:
        tweet = re.sub(username_pattern, "", tweet)
        
    # remove #hashtag
    if remove_hashtag == True:
        tweet = re.sub(hashtag_pattern, "", tweet)
    
    # lower case 
    tweet_lower = tweet.lower()
    
    # tokenization 
    words = re.findall(token_pattern, tweet_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    tweet_processed = " ".join(words)
    
    return tweet_processed


In [10]:
# dict tweets_tr: tweet_id -> tweet_content
tweets_tr_prep = {}
for tweet_id in tweets_tr.index:
    tweets_tr_prep[tweet_id] = preprocess_tweet(tweets_tr.loc[tweet_id, 'tweet_content'])

# dict tweets_te: tweet_id -> tweet_content
tweets_te_prep = {}
for tweet_id in tweets_te.index:
    tweets_te_prep[tweet_id] = preprocess_tweet(tweets_te.loc[tweet_id, 'tweet_content'])

In [11]:
tweets_tr_prep[106]

'biden grand wizard kkk play lie use vote creator kkk oppos civil right black yup democrat parti'

In [12]:
# function of preprocessing vclaim
def preprocess_text(text, token_pattern=token_pattern, stopwords=stopwords, 
                      with_stopwordsrm=True, with_stemming=True):
    # lower case 
    text_lower = text.lower()
    
    # tokenization 
    words = re.findall(token_pattern, text_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    text_processed = " ".join(words)
    
    return text_processed

In [13]:
# dict vclaim_prep: vlciam_id -> [vlciam_content, vclaim_title]
vclaims_prep = {}
for vclaim_id in vclaims.index:
    vclaims_prep[vclaim_id] = []
    vclaims_prep[vclaim_id].append(preprocess_text(vclaims.loc[vclaim_id, 'vclaim']))
    vclaims_prep[vclaim_id].append(preprocess_text(vclaims.loc[vclaim_id, 'title']))

## Computing Cosine Similarity:

In [14]:
# function of computing consine similarity 
def compute_cs(tweets, vclaims):
    vectorizer = TfidfVectorizer()
    vclaims_tfidf = vectorizer.fit_transform(vclaims)
    
    cosine_sims = {}
    for (tweet_id, tweet_content) in tweets.items():
        tweet_tfidf = vectorizer.transform([tweet_content])
        cs = cosine_similarity(tweet_tfidf, vclaims_tfidf).flatten()
        cosine_sims[tweet_id] = cs
    
    return cosine_sims


In [15]:
vclaims_contents = [vclaim[0] for vclaim in vclaims_prep.values()]
vclaims_titles = [vclaim[1] for vclaim in vclaims_prep.values()]

cs_tvc_tr = compute_cs(tweets_tr_prep, vclaims_contents)
cs_tvt_tr = compute_cs(tweets_tr_prep, vclaims_titles)

cs_tvc_te = compute_cs(tweets_te_prep, vclaims_contents)
cs_tvt_te = compute_cs(tweets_te_prep, vclaims_titles)

In [16]:
df_cs_tvc_tr = pd.DataFrame.from_dict(cs_tvc_tr, orient='index')
df_cs_tvt_tr = pd.DataFrame.from_dict(cs_tvt_tr, orient='index')

df_cs_tvc_te = pd.DataFrame.from_dict(cs_tvc_te, orient='index')
df_cs_tvt_te = pd.DataFrame.from_dict(cs_tvt_te, orient='index')

## Computing BM25 Score:

In [17]:
# function of computing BM25 score
def compute_bm25(tokenized_tweets, tokenized_vclaims):
    bm25kapi = BM25Okapi(tokenized_vclaims)
    
    bm25s = {}
    for (tweet_id, tweet_content) in tokenized_tweets.items():
        bm25s[tweet_id] = bm25kapi.get_scores(tweet_content)
        
    return bm25s


In [18]:
tokenized_vclaims_contents = [vclaim[0].split(" ") for vclaim in vclaims_prep.values()]
tokenized_vclaims_titles = [vclaim[1].split(" ") for vclaim in vclaims_prep.values()]

tokenized_tweets_tr = {}
for (tweet_id, tweet_content) in tweets_tr_prep.items():
    tokenized_tweets_tr[tweet_id] = tweet_content.split(" ")
    
tokenized_tweets_te = {}
for (tweet_id, tweet_content) in tweets_te_prep.items():
    tokenized_tweets_te[tweet_id] = tweet_content.split(" ")

In [19]:
bm25_tvc_tr = compute_bm25(tokenized_tweets_tr, tokenized_vclaims_contents)
bm25_tvt_tr = compute_bm25(tokenized_tweets_tr, tokenized_vclaims_titles)

bm25_tvc_te = compute_bm25(tokenized_tweets_te, tokenized_vclaims_contents)
bm25_tvt_te = compute_bm25(tokenized_tweets_te, tokenized_vclaims_titles)

In [20]:
df_bm25_tvc_tr = pd.DataFrame.from_dict(bm25_tvc_tr, orient='index')
df_bm25_tvt_tr = pd.DataFrame.from_dict(bm25_tvt_tr, orient='index')

df_bm25_tvc_te = pd.DataFrame.from_dict(bm25_tvc_te, orient='index')
df_bm25_tvt_te = pd.DataFrame.from_dict(bm25_tvt_te, orient='index')

## construct dataset with 4 features and labels:

In [21]:
data_tr_tids = []
for tweet_id in tweets_tr_prep.keys():
    data_tr_tids.extend((np.ones(vclaims.shape[0]) * tweet_id).tolist())
    
data_tr_vids = []
for i in range(tweets_tr.shape[0]):
    data_tr_vids.extend(list(vclaims_prep.keys()))
    
data_tr_labels = []
for tweet_id in tweets_tr_prep.keys():
    labels = np.zeros(vclaims.shape[0])
    if tweet_id in qrels_tr['tweet_id'].values:
        for index in qrels_tr[qrels_tr['tweet_id'] == tweet_id]['vclaim_id'].values:
            labels[index] = 1
    data_tr_labels.extend(labels)

In [22]:
data_tr = pd.DataFrame(columns=['tid', 'vid', 'cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt', 'label'])
data_tr['tid'] = data_tr_tids
data_tr['vid'] = data_tr_vids
data_tr['cs_tvc'] = df_cs_tvc_tr.values.flatten()
data_tr['cs_tvt'] = df_cs_tvt_tr.values.flatten()
data_tr['bm25_tvc'] = df_bm25_tvc_tr.values.flatten()
data_tr['bm25_tvt'] = df_bm25_tvt_tr.values.flatten()
data_tr['label'] = data_tr_labels

In [23]:
data_te_tids = []
for tweet_id in tweets_te_prep.keys():
    data_te_tids.extend((np.ones(vclaims.shape[0]) * tweet_id).tolist())
    
data_te_vids = []
for i in range(tweets_te.shape[0]):
    data_te_vids.extend(list(vclaims_prep.keys()))
    
data_te_labels = []
for tweet_id in tweets_te_prep.keys():
    labels = np.zeros(vclaims.shape[0])
    if tweet_id in qrels_te['tweet_id'].values:
        for index in qrels_te[qrels_te['tweet_id'] == tweet_id]['vclaim_id'].values:
            labels[index] = 1
    data_te_labels.extend(labels)

In [24]:
data_te = pd.DataFrame(columns=['tid', 'vid', 'cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt', 'label'])
data_te['tid'] = data_te_tids
data_te['vid'] = data_te_vids
data_te['cs_tvc'] = df_cs_tvc_te.values.flatten()
data_te['cs_tvt'] = df_cs_tvt_te.values.flatten()
data_te['bm25_tvc'] = df_bm25_tvc_te.values.flatten()
data_te['bm25_tvt'] = df_bm25_tvt_te.values.flatten()
data_te['label'] = data_te_labels

## Linear SVC:

In [25]:
X_tr, y_tr = data_tr[['cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt']].values, data_tr['label'].values
X_te, y_te = data_te[['cs_tvc', 'cs_tvt', 'bm25_tvc', 'bm25_tvt']].values, data_te['label'].values

In [26]:
class_weight = {}
for label in set(y_te):
    class_weight[label] = np.sum(y_te == label)

In [27]:
from sklearn.svm import LinearSVC

clf = LinearSVC(C=0.1, random_state=0, class_weight=class_weight, max_iter=20000).fit(X_tr, y_tr)



In [28]:
score = np.sum(X_te * clf.coef_, axis=1) + clf.intercept_

In [29]:
result_score = pd.DataFrame(columns=['tid','vid', 'score'])
result_score['tid'] = data_te['tid']
result_score['vid'] = data_te['vid']
result_score['score'] = score

In [30]:
result = pd.DataFrame(columns=['tweet_id', 'Q0', 'vclaim_id', 'rank', 'score', 'tag'])

tid = list(result_score.groupby(by='tid').groups.keys())
for i in tid:
    idx = result_score[result_score['tid'] == i]['score'].idxmax()
    inf = result_score.iloc[idx]
    result = result.append({'tweet_id':int(inf[0]), 'Q0':'Q0', 'vclaim_id':int(inf[1]),
                            'rank':1, 'score':inf[2], 'tag': 'COVID-19'}, ignore_index=True)

In [32]:
result

Unnamed: 0,tweet_id,Q0,vclaim_id,rank,score,tag
0,0,Q0,456,1,-1.057749,COVID-19
1,12,Q0,640,1,-0.410604,COVID-19
2,23,Q0,109,1,-0.788070,COVID-19
3,30,Q0,223,1,-0.542566,COVID-19
4,33,Q0,171,1,-0.944743,COVID-19
...,...,...,...,...,...,...
195,971,Q0,38,1,-0.179441,COVID-19
196,983,Q0,439,1,-0.215035,COVID-19
197,984,Q0,439,1,-0.018836,COVID-19
198,989,Q0,668,1,-1.131253,COVID-19


In [31]:
result.to_csv('golf_system_result_0.csv', header=False, index=False, sep='\t')

In [None]:
# python3 evaluate.py -s golf_system_result_0.csv -g data/dev/tweet-vclaim-pairs.qrels