## task2:

In [1]:
import pandas as pd
# use in both train and dev.
vclaims = pd.read_csv('data/verified_claims.docs.tsv', sep='\t').rename(columns = {"Unnamed: 0": "id"})
# training dataset.
tweets = pd.read_csv('data/train/tweets.queries.tsv', sep='\t').rename(columns = {"Unnamed: 0": "id"})
tweet_vclaim = pd.read_csv('data/train/tweet-vclaim-pairs.qrels', sep='\t', header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])
# development dataset.
dev_tweets = pd.read_csv('data/dev/tweets.queries.tsv', sep='\t').rename(columns = {"Unnamed: 0": "id"})
dev_tweet_vclaim = pd.read_csv('data/dev/tweet-vclaim-pairs.qrels', sep='\t', header=None, names=['tweet_id', '0', 'vclaim_id', 'relevance'])

In [2]:
vclaims[:10]

Unnamed: 0,id,vclaim,title
0,0,122 detainees released from confinement at Gua...,Did 122 Prisoners Released from Guantanamo by ...
1,1,"A ""Trump and Obama by the Numbers"" meme recoun...",Does This Meme Accurately Show ‘Trump and Obam...
2,2,"A ""large-scale killing"" of white farmers is ta...",Is a ‘Large-Scale Killing’ of White Farmers Un...
3,3,"A ""law to separate families"" was enacted prior...",Was the ‘Law to Separate Families’ Passed in 1...
4,4,"A ""newly uncovered"" photograph reveals Alexand...",Does an Image Show Ocasio-Cortez Fake-Crying a...
5,5,A 13 percent increase in police-recorded crime...,Is an Increase in Recorded Crime in England an...
6,6,"A 13-year-old girl was beheaded by an ""illegal...",Police: 13-Year-Old Girl Was Beheaded After Se...
7,7,A Broward County employee witnessed elections ...,Did a Broward County Employee Witness Election...
8,8,A California couple gave their newborn child a...,Did a California Newborn Become the First Chil...
9,9,A Cleveland-area Fox affiliate described local...,Did a Cleveland Fox Affiliate Label a Black Mu...


In [3]:
dev_tweets[:10]

Unnamed: 0,id,tweet_content
0,454,Pence’s Brother sells engines to Russia. Manaf...
1,276,Billions of dollars are sent to the State of C...
2,849,I am being proven right about massive vaccinat...
3,789,Little Girl To Trump: You´re A Disgrace To The...
4,295,A knife hidden in a baguette is the most Frenc...
5,661,#MANDALAY BAY SHOOTER IDENTIFIED AS 32 YEAR OL...
6,870,Tweeting again: wh aide confirms the MLK bust ...
7,234,WATCH: President Trump met with loud boos as h...
8,557,"“These are not hoax devices,” FBI Director Chr..."
9,67,At the opening of the South Korean baseball ch...


In [4]:
dev_tweet_vclaim[:10]

Unnamed: 0,tweet_id,0,vclaim_id,relevance
0,0,0,781,1
1,12,0,640,1
2,23,0,259,1
3,30,0,223,1
4,33,0,171,1
5,36,0,238,1
6,39,0,727,1
7,49,0,253,1
8,51,0,644,1
9,62,0,734,1


### following data_preprocessing.ipynb

In [5]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import sys

In [6]:
stopwords = stopwords.words('english')
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
username_pattern = r"@[^\s]+"
hashtag_pattern = r"\B#\w\w+"
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

In [7]:
# function of preprocessing tweet
def preprocess_tweet(tweet, 
                     url_pattern=url_pattern, username_pattern=username_pattern, 
                     hashtag_pattern=hashtag_pattern, token_pattern=token_pattern, 
                     remove_url=True, remove_username=True, remove_hashtag=True,
                     stopwords=stopwords, with_stopwordsrm=True, with_stemming=True):
    # remove content after '—'
    tweet = tweet.split('—')[0]
    
    # remove url
    if remove_url == True:
        tweet = re.sub(url_pattern, "", tweet)
        
    # remove @username 
    if remove_username == True:
        tweet = re.sub(username_pattern, "", tweet)
        
    # remove #hashtag
    if remove_hashtag == True:
        tweet = re.sub(hashtag_pattern, "", tweet)
    
    # lower case 
    tweet_lower = tweet.lower()
    
    # tokenization 
    words = re.findall(token_pattern, tweet_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    tweet_processed = " ".join(words)
    
    return tweet_processed

In [8]:
# dict tweets_prep: tweet_id -> tweet_content
tweets_prep = {}

for i, tid in (tweets.id).items():
    tweets_prep[tid] = preprocess_tweet(tweets.loc[i, 'tweet_content'])
    

dev_tweets_prep = {}

for i, tid in (dev_tweets.id).items():
    dev_tweets_prep[tid] = preprocess_tweet(tweets.loc[i, 'tweet_content'])

In [9]:
print(len(tweets_prep))

print(len(dev_tweets_prep))

803
200


In [10]:
# function of preprocessing vclaim
def preprocess_text(text, token_pattern=token_pattern, stopwords=stopwords, 
                      with_stopwordsrm=True, with_stemming=True):
    # lower case 
    text_lower = text.lower()
    
    # tokenization 
    words = re.findall(token_pattern, text_lower)
    
    # stopwords removal
    if with_stopwordsrm == True:
        words = [word for word in words if word not in stopwords]
        
    # stemming 
    if with_stemming == True:
        ps = PorterStemmer() 
        words = [ps.stem(word) for word in words]
        
    text_processed = " ".join(words)
    
    return text_processed

In [11]:
# dict vclaim_prep: vlciam_id -> [vlciam_content, vclaim_title]
vclaim_prep = {}
for i, vid in (vclaims.id).items():
    vclaim_prep[vid] = []
    vclaim_prep[vid].append(preprocess_text(vclaims.loc[i, 'vclaim']))
    vclaim_prep[vid].append(preprocess_text(vclaims.loc[i, 'title']))

In [12]:
len(vclaim_prep)

784

### using word embedding

In [13]:
from gensim.models import KeyedVectors
# need to download GoogleNews-vectors-negative300.bin. 
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [14]:
# doc vector of training set.
tweets_vec = {}

for tid, text in tweets_prep.items():
    train_text = [model[x] for x in text.split(' ') if x in model]
    if len(train_text) == 0: continue
    vector = np.array(train_text).mean(axis=0)
    tweets_vec[tid] = vector
    
vclaim_vec = {}

for vid, text in vclaim_prep.items():
    train_text = [model[x] for x in text[0].split(' ') if x in model]
    train_title = [model[x] for x in text[1].split(' ') if x in model]
    if len(train_title) == 0 or len(train_text) == 0: continue
    vector_text = np.array(train_text).mean(axis=0)
    vector_title = np.array(train_title).mean(axis=0)
    vclaim_vec[vid] = [vector_text, vector_title]

In [15]:
# doc vector of development dataset.
dev_tweets_vec = {}

for tid, text in dev_tweets_prep.items():
    train_text = [model[x] for x in text.split(' ') if x in model]
    if len(train_text) == 0: continue
    vector = np.array(train_text).mean(axis=0)
    dev_tweets_vec[tid] = vector  

In [16]:
# some tweets whose len is zero has been removed.
print(len(tweets_vec))
print(len(vclaim_vec))
print(len(dev_tweets_vec))

800
783
200


In [17]:
def rand_vid(vid):
    cond = True
    while cond:
        rand_vid = random.randint(0, len(vclaim_vec))
        if (rand_vid != vid) and (rand_vid in vclaim_vec):
            cond = False
    return rand_vid

### construct training dataset with 4 features:

In [18]:
import torch
import random

t_v = dict(zip(list(tweet_vclaim['tweet_id']), list(tweet_vclaim['vclaim_id'])))
tr_X = pd.DataFrame(columns=['tid','vid','cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t', 'label'])

for tid, tweet in tweets_vec.items():
    if tid not in t_v: continue
    arr = [t_v[tid], rand_vid(t_v[tid]), rand_vid(t_v[tid])] # make dataset more balance.
    for vid in arr:
        claim = vclaim_vec[vid]
        x = torch.from_numpy(tweet)
        y1, y2 = torch.from_numpy(claim[0]), torch.from_numpy(claim[1])
        s1, s2 = torch.cosine_similarity(x, y1, dim=0).item(), torch.cosine_similarity(x, y2, dim=0).item()
        e1, e2 = torch.dist(x, y1, p=2).item(), torch.dist(x, y2, p=2).item()
        label = 1 if t_v[tid] == vid else 0
        tr_X=tr_X.append({'tid':tid, 'vid':vid, 'cos_c':s1, 'Euclidean_c':e1, 'cos_t':s2, 'Euclidean_t':e2, 'label':label},ignore_index=True)


In [19]:
tr_X[:10]

Unnamed: 0,tid,vid,cos_c,Euclidean_c,cos_t,Euclidean_t,label
0,106.0,58.0,0.713943,0.839961,0.740772,0.942683,1.0
1,106.0,214.0,0.548175,1.174231,0.324234,1.801337,0.0
2,106.0,513.0,0.489609,1.024165,0.403861,1.227743,0.0
3,274.0,350.0,0.847545,0.701128,0.676729,0.930777,1.0
4,274.0,271.0,0.409497,1.318618,0.448777,1.273453,0.0
5,274.0,205.0,0.358569,1.337553,0.226823,1.964043,0.0
6,871.0,290.0,0.672887,0.876105,0.66931,0.948188,1.0
7,871.0,143.0,0.465234,1.207725,0.451669,1.262996,0.0
8,871.0,332.0,0.488354,1.101877,0.507434,1.384898,0.0
9,876.0,579.0,0.615748,1.064329,0.533191,1.670064,1.0


### construct dev dataset:

In [20]:
dev_t_v = dict(zip(list(dev_tweet_vclaim['tweet_id']), list(dev_tweet_vclaim['vclaim_id'])))
dev_X = pd.DataFrame(columns=['tid','vid','cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t', 'label'])

for tid, tweet in dev_tweets_vec.items():
    if tid not in dev_t_v: continue
    for vid, claim in vclaim_vec.items():
        x = torch.from_numpy(tweet)
        y1, y2 = torch.from_numpy(claim[0]), torch.from_numpy(claim[1])
        s1, s2 = torch.cosine_similarity(x, y1, dim=0).item(), torch.cosine_similarity(x, y2, dim=0).item()
        e1, e2 = torch.dist(x, y1, p=2).item(), torch.dist(x, y2, p=2).item()
        label = 1 if dev_t_v[tid] == vid else 0
        dev_X=dev_X.append({'tid':tid, 'vid':vid, 'cos_c':s1, 'Euclidean_c':e1, 'cos_t':s2, 'Euclidean_t':e2, 'label':label},ignore_index=True)

In [21]:
dev_X[:10] 

Unnamed: 0,tid,vid,cos_c,Euclidean_c,cos_t,Euclidean_t,label
0,454.0,0.0,0.576904,1.084875,0.557494,1.147098,0.0
1,454.0,1.0,0.66409,1.10999,0.584389,1.405756,0.0
2,454.0,2.0,0.548836,1.077028,0.502754,1.195149,0.0
3,454.0,3.0,0.516021,1.145715,0.613488,1.20985,0.0
4,454.0,4.0,0.394544,1.322593,0.416105,1.289802,0.0
5,454.0,5.0,0.554041,1.164412,0.566732,1.284574,0.0
6,454.0,6.0,0.35875,1.391273,0.379697,1.472143,0.0
7,454.0,7.0,0.415876,1.391316,0.415876,1.391316,0.0
8,454.0,8.0,0.468508,1.25786,0.467076,1.244315,0.0
9,454.0,9.0,0.60062,1.088224,0.539521,1.262613,0.0


### LR classifier:

In [22]:
X_tr, y_tr = tr_X[['cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t']].values, tr_X['label'].values

X_dev, y_dev = dev_X[['cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t']].values, dev_X['label'].values

In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs',class_weight={0:0.1, 1:0.9}).fit(X_tr, y_tr)

In [24]:
prob = clf.predict_proba(X_dev)[:, 1]

In [25]:
dev_tweet_vclaim.loc[0, 'tweet_id']

result = pd.DataFrame(columns=['tid','vid', 'prob'])
result['tid'] = dev_X['tid']
result['vid'] = dev_X['vid']
result['prob'] = prob

In [26]:
result[:10]

Unnamed: 0,tid,vid,prob
0,454.0,0.0,0.853368
1,454.0,1.0,0.925609
2,454.0,2.0,0.800305
3,454.0,3.0,0.857619
4,454.0,4.0,0.552265
5,454.0,5.0,0.856948
6,454.0,6.0,0.49565
7,454.0,7.0,0.591994
8,454.0,8.0,0.691299
9,454.0,9.0,0.867164


In [27]:
result_new = pd.DataFrame(columns=['tweet_id', 'Q0', 'vclaim_id', 'rank', 'score', 'tag'])

In [28]:
# according to tweet id, finding vclaim id with highest probability.

tid = list(result.groupby(by='tid').groups.keys())
for i in tid:
    idx = result[result['tid'] == i]['prob'].idxmax()
    inf = result.iloc[idx]
    result_new = result_new.append({'tweet_id':inf[0], 'Q0': 0, 'vclaim_id': inf[1], 'rank': 1, 'score': inf[2], 'tag': 'COVID-19'},ignore_index=True)
     

In [29]:
result_new[:10]

Unnamed: 0,tweet_id,Q0,vclaim_id,rank,score,tag
0,0.0,0,642.0,1,0.937844,COVID-19
1,12.0,0,642.0,1,0.965666,COVID-19
2,23.0,0,642.0,1,0.914748,COVID-19
3,30.0,0,71.0,1,0.893767,COVID-19
4,33.0,0,207.0,1,0.947455,COVID-19
5,36.0,0,25.0,1,0.950482,COVID-19
6,39.0,0,626.0,1,0.81445,COVID-19
7,49.0,0,642.0,1,0.872749,COVID-19
8,51.0,0,169.0,1,0.854087,COVID-19
9,62.0,0,360.0,1,0.961052,COVID-19


In [30]:
result_new.to_csv('golf_system_result_0.csv', header=True)