## task2:

In [1]:
import pandas as pd
def _read_tsv_with_id(path):
    """Read a tab-separated file and rename its 'Unnamed: 0' column to 'id'."""
    return pd.read_csv(path, sep='\t').rename(columns={"Unnamed: 0": "id"})


def _read_qrels(path):
    """Read a header-less, tab-separated qrels file into four named columns."""
    qrel_columns = ['tweet_id', '0', 'vclaim_id', 'relevance']
    return pd.read_csv(path, sep='\t', header=None, names=qrel_columns)


# Verified claims: shared by the training and development splits.
vclaims = _read_tsv_with_id('data/verified_claims.docs.tsv')

# Training split: tweets plus tweet -> verified-claim relevance pairs.
tweets = _read_tsv_with_id('data/train/tweets.queries.tsv')
tweet_vclaim = _read_qrels('data/train/tweet-vclaim-pairs.qrels')

# Development split: same layout as the training split.
dev_tweets = _read_tsv_with_id('data/dev/tweets.queries.tsv')
dev_tweet_vclaim = _read_qrels('data/dev/tweet-vclaim-pairs.qrels')

In [2]:
# Preview the first ten verified claims.
vclaims.head(10)

Unnamed: 0,id,vclaim,title
0,0,122 detainees released from confinement at Gua...,Did 122 Prisoners Released from Guantanamo by ...
1,1,"A ""Trump and Obama by the Numbers"" meme recoun...",Does This Meme Accurately Show ‘Trump and Obam...
2,2,"A ""large-scale killing"" of white farmers is ta...",Is a ‘Large-Scale Killing’ of White Farmers Un...
3,3,"A ""law to separate families"" was enacted prior...",Was the ‘Law to Separate Families’ Passed in 1...
4,4,"A ""newly uncovered"" photograph reveals Alexand...",Does an Image Show Ocasio-Cortez Fake-Crying a...
5,5,A 13 percent increase in police-recorded crime...,Is an Increase in Recorded Crime in England an...
6,6,"A 13-year-old girl was beheaded by an ""illegal...",Police: 13-Year-Old Girl Was Beheaded After Se...
7,7,A Broward County employee witnessed elections ...,Did a Broward County Employee Witness Election...
8,8,A California couple gave their newborn child a...,Did a California Newborn Become the First Chil...
9,9,A Cleveland-area Fox affiliate described local...,Did a Cleveland Fox Affiliate Label a Black Mu...


In [3]:
# Preview the first ten development tweets.
dev_tweets.head(10)

Unnamed: 0,id,tweet_content
0,454,Pence’s Brother sells engines to Russia. Manaf...
1,276,Billions of dollars are sent to the State of C...
2,849,I am being proven right about massive vaccinat...
3,789,Little Girl To Trump: You´re A Disgrace To The...
4,295,A knife hidden in a baguette is the most Frenc...
5,661,#MANDALAY BAY SHOOTER IDENTIFIED AS 32 YEAR OL...
6,870,Tweeting again: wh aide confirms the MLK bust ...
7,234,WATCH: President Trump met with loud boos as h...
8,557,"“These are not hoax devices,” FBI Director Chr..."
9,67,At the opening of the South Korean baseball ch...


In [4]:
# NOTE: a notebook cell only renders its last expression, so this preview is
# evaluated but never displayed.
dev_tweet_vclaim.head(10)

# Number of qrel rows in the development and training splits, one per line.
print(len(dev_tweet_vclaim), len(tweet_vclaim), sep='\n')

199
804


### following data_preprocessing.ipynb

In [5]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import sys

In [6]:
# Import the corpus reader under an alias: the name `stopwords` is rebound to
# the word list below, and reading the list from that same name made this cell
# fail with AttributeError (list has no `.words`) whenever it was re-run.
from nltk.corpus import stopwords as nltk_stopwords

# English stop-word list; used as the default `stopwords` argument of the
# preprocessing functions.
stopwords = nltk_stopwords.words('english')
# Case-insensitive URL matcher: accepts an http(s):// prefix, a www. prefix or
# a bare "domain.tld/" prefix, allows balanced parentheses inside the URL and
# refuses to end on trailing punctuation.
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
# "@" followed by any run of non-whitespace characters.
username_pattern = r"@[^\s]+"
# "#" that does not follow a word character, then at least two word characters.
hashtag_pattern = r"\B#\w\w+"
# Purely alphabetic (ASCII) words of at least two letters.
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

In [7]:
# Tweet preprocessing.
def preprocess_tweet(tweet,
                     url_pattern=url_pattern, username_pattern=username_pattern,
                     hashtag_pattern=hashtag_pattern, token_pattern=token_pattern,
                     remove_url=True, remove_username=True, remove_hashtag=True,
                     stopwords=stopwords, with_stopwordsrm=True, with_stemming=True):
    """Normalise a raw tweet into a space-separated string of tokens.

    Steps, in order: drop everything after the first em dash, optionally strip
    URLs / @usernames / #hashtags, lower-case, tokenise with `token_pattern`,
    optionally drop stop words, optionally Porter-stem each token.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    url_pattern, username_pattern, hashtag_pattern : str
        Regular expressions whose matches are deleted when the corresponding
        `remove_*` flag is truthy.
    token_pattern : str
        Regular expression passed to `re.findall` to extract tokens.
    remove_url, remove_username, remove_hashtag : bool
        Enable the respective removal step.
    stopwords : iterable of str
        Words discarded when `with_stopwordsrm` is truthy.
    with_stopwordsrm : bool
        Enable stop-word removal.
    with_stemming : bool
        Enable Porter stemming.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Keep only the text in front of the first em dash.
    # NOTE(review): presumably this drops a trailing attribution -- confirm
    # against the raw tweet format.
    tweet = tweet.split('—')[0]

    # Flags are tested for truthiness rather than compared with `== True`.
    if remove_url:
        tweet = re.sub(url_pattern, "", tweet)

    if remove_username:
        tweet = re.sub(username_pattern, "", tweet)

    if remove_hashtag:
        tweet = re.sub(hashtag_pattern, "", tweet)

    # Lower-case before tokenising so tokens can be matched against stop words.
    words = re.findall(token_pattern, tweet.lower())

    if with_stopwordsrm:
        # A set makes each membership test O(1) instead of scanning the list.
        stopword_set = frozenset(stopwords)
        words = [word for word in words if word not in stopword_set]

    if with_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return " ".join(words)

In [8]:
# dict tweets_prep: tweet_id -> preprocessed tweet_content (training split)
tweets_prep = {
    tid: preprocess_tweet(content)
    for tid, content in zip(tweets['id'], tweets['tweet_content'])
}

# dict dev_tweets_prep: tweet_id -> preprocessed tweet_content (development split)
# Bug fix: the text used to be read from `tweets` (the training frame) while
# iterating over `dev_tweets` ids, so every dev tweet id was paired with the
# text of an unrelated training tweet. Pairing id and text from the same frame
# makes that mix-up impossible.
# NOTE: anything derived from the old mapping (e.g. a cached
# 'dev_X_four_features.csv') has to be regenerated after this fix.
dev_tweets_prep = {
    tid: preprocess_tweet(content)
    for tid, content in zip(dev_tweets['id'], dev_tweets['tweet_content'])
}

In [9]:
# Number of preprocessed training and development tweets, one per line.
print(len(tweets_prep), len(dev_tweets_prep), sep='\n')

803
200


In [10]:
# Verified-claim (plain text) preprocessing.
def preprocess_text(text, token_pattern=token_pattern, stopwords=stopwords,
                      with_stopwordsrm=True, with_stemming=True):
    """Normalise plain text into a space-separated string of tokens.

    Steps, in order: lower-case, tokenise with `token_pattern`, optionally
    drop stop words, optionally Porter-stem each token.

    Parameters
    ----------
    text : str
        Raw text (the notebook passes verified-claim bodies and titles).
    token_pattern : str
        Regular expression passed to `re.findall` to extract tokens.
    stopwords : iterable of str
        Words discarded when `with_stopwordsrm` is truthy.
    with_stopwordsrm : bool
        Enable stop-word removal.
    with_stemming : bool
        Enable Porter stemming.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Lower-case before tokenising so tokens can be matched against stop words.
    words = re.findall(token_pattern, text.lower())

    # Flags are tested for truthiness rather than compared with `== True`.
    if with_stopwordsrm:
        # A set makes each membership test O(1) instead of scanning the list.
        stopword_set = frozenset(stopwords)
        words = [word for word in words if word not in stopword_set]

    if with_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return " ".join(words)

In [11]:
# dict vclaim_prep: vclaim_id -> [preprocessed vclaim text, preprocessed title]
vclaim_prep = {
    vid: [preprocess_text(claim), preprocess_text(title)]
    for vid, claim, title in zip(vclaims['id'], vclaims['vclaim'], vclaims['title'])
}

In [12]:
# Number of verified claims held in the preprocessed mapping.
len(vclaim_prep)

784

### using word embedding

In [13]:
from gensim.models import KeyedVectors
# The pretrained binary must be downloaded separately and placed in the
# working directory under this file name.
WORD2VEC_PATH = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [14]:
# Document vectors for the training tweets: the mean embedding of every token
# found in the word2vec vocabulary. Tweets with no known token are skipped.
# NOTE(review): the tokens are lower-cased Porter stems, which may often be
# missing from the pretrained vocabulary -- worth checking the coverage.
tweets_vec = {}

for tid, text in tweets_prep.items():
    known = [model[token] for token in text.split(' ') if token in model]
    if known:
        tweets_vec[tid] = np.mean(known, axis=0)

# Same idea for verified claims: one mean vector for the claim text and one
# for the title; a claim is skipped unless both have at least one known token.
vclaim_vec = {}

for vid, fields in vclaim_prep.items():
    claim_known = [model[token] for token in fields[0].split(' ') if token in model]
    title_known = [model[token] for token in fields[1].split(' ') if token in model]
    if not title_known or not claim_known:
        continue
    vclaim_vec[vid] = [np.mean(claim_known, axis=0), np.mean(title_known, axis=0)]

In [15]:
# Document vectors for the development tweets: mean embedding of the tokens
# present in the word2vec vocabulary; tweets without any known token are skipped.
dev_tweets_vec = {}

for tid, text in dev_tweets_prep.items():
    embeddings = np.array([model[token] for token in text.split(' ') if token in model])
    if embeddings.shape[0] == 0:
        continue
    dev_tweets_vec[tid] = embeddings.mean(axis=0)

In [16]:
# Texts with no in-vocabulary token were skipped above, so these counts can be
# smaller than the number of preprocessed texts.
print(len(tweets_vec), len(vclaim_vec), len(dev_tweets_vec), sep='\n')

800
783
200


### construct training dataset with 4 features:

In [18]:
# import torch
# import random

# t_v = dict(zip(list(tweet_vclaim['tweet_id']), list(tweet_vclaim['vclaim_id'])))
# tr_X = pd.DataFrame(columns=['tid','vid','cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t', 'label'])

# for tid, tweet in tweets_vec.items():
#     if tid not in t_v: continue
#     for vid, claim in vclaim_vec.items():
#         x = torch.from_numpy(tweet)
#         y1, y2 = torch.from_numpy(claim[0]), torch.from_numpy(claim[1])
#         s1, s2 = torch.cosine_similarity(x, y1, dim=0).item(), torch.cosine_similarity(x, y2, dim=0).item()
#         e1, e2 = torch.dist(x, y1, p=2).item(), torch.dist(x, y2, p=2).item()
#         label = 1 if t_v[tid] == vid else 0
#         tr_X=tr_X.append({'tid':tid, 'vid':vid, 'cos_c':s1, 'Euclidean_c':e1, 'cos_t':s2, 'Euclidean_t':e2, 'label':label},ignore_index=True)

# tr_X.to_csv('tr_X_four_features.csv', header=True, index=False)

### construct dev dataset:

In [19]:
# dev_t_v = dict(zip(list(dev_tweet_vclaim['tweet_id']), list(dev_tweet_vclaim['vclaim_id'])))
# dev_X = pd.DataFrame(columns=['tid','vid','cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t', 'label'])

# for tid, tweet in dev_tweets_vec.items():
#     if tid not in dev_t_v: continue
#     for vid, claim in vclaim_vec.items():
#         x = torch.from_numpy(tweet)
#         y1, y2 = torch.from_numpy(claim[0]), torch.from_numpy(claim[1])
#         s1, s2 = torch.cosine_similarity(x, y1, dim=0).item(), torch.cosine_similarity(x, y2, dim=0).item()
#         e1, e2 = torch.dist(x, y1, p=2).item(), torch.dist(x, y2, p=2).item()
#         label = 1 if dev_t_v[tid] == vid else 0
#         dev_X=dev_X.append({'tid':tid, 'vid':vid, 'cos_c':s1, 'Euclidean_c':e1, 'cos_t':s2, 'Euclidean_t':e2, 'label':label},ignore_index=True)
        

# dev_X.to_csv('dev_X_four_features.csv', header=True, index=False)

In [20]:
# Load the cached feature tables (written by the commented-out
# feature-construction cells) instead of recomputing them.
raw_tr_X, raw_dev_X = (
    pd.read_csv(path)
    for path in ('tr_X_four_features.csv', 'dev_X_four_features.csv')
)

raw_tr_X.head(10)

Unnamed: 0,tid,vid,cos_c,Euclidean_c,cos_t,Euclidean_t,label
0,106.0,0.0,0.576904,1.084875,0.557494,1.147098,0.0
1,106.0,1.0,0.66409,1.10999,0.584389,1.405756,0.0
2,106.0,2.0,0.548836,1.077028,0.502754,1.195149,0.0
3,106.0,3.0,0.516021,1.145715,0.613488,1.20985,0.0
4,106.0,4.0,0.394544,1.322593,0.416105,1.289802,0.0
5,106.0,5.0,0.554041,1.164412,0.566732,1.284574,0.0
6,106.0,6.0,0.35875,1.391273,0.379697,1.472143,0.0
7,106.0,7.0,0.415876,1.391316,0.415876,1.391316,0.0
8,106.0,8.0,0.468508,1.25786,0.467076,1.244315,0.0
9,106.0,9.0,0.60062,1.088224,0.539521,1.262613,0.0


### scoring model

In [21]:
# The four similarity/distance features fed to the scoring models.
FEATURE_COLUMNS = ['cos_c', 'Euclidean_c', 'cos_t', 'Euclidean_t']

# Training matrices.
X_tr = raw_tr_X[FEATURE_COLUMNS].values
y_tr = raw_tr_X['label'].values

# Development matrices.
X_dev = raw_dev_X[FEATURE_COLUMNS].values
y_dev = raw_dev_X['label'].values

In [22]:
# Gold tweet -> verified-claim pairs, read from the header-less qrels files.
QREL_COLUMNS = ['tweet_id', '0', 'vclaim_id', 'relevance']
QREL_PATHS = {
    'dev': 'data/dev/tweet-vclaim-pairs.qrels',
    'train': 'data/train/tweet-vclaim-pairs.qrels',
}
dev_qrel = pd.read_csv(QREL_PATHS['dev'], names=QREL_COLUMNS, sep='\t')
tr_qrel = pd.read_csv(QREL_PATHS['train'], names=QREL_COLUMNS, sep='\t')

In [23]:
# from sklearn.linear_model import LogisticRegression

# clf = LogisticRegression(random_state=0, solver='lbfgs',class_weight={0:0.0001, 1:0.999}).fit(X_tr, y_tr)

# prob_tr = clf.predict_proba(X_tr)[:, 1]

# score_tr = []
# for v in prob_tr:
#     if v >= 0 and v < 0.2:
#         score_tr.append(0)
#     elif v >= 0.2 and v < 0.4:
#         score_tr.append(1)
#     elif v >= 0.4 and v < 0.6:
#         score_tr.append(2)
#     elif v >= 0.6 and v < 0.8:
#         score_tr.append(3)
#     else:
#         score_tr.append(4)

# raw_tr_X['prob'] = prob_tr
# raw_tr_X['class'] = score_tr

# tr_qrel_dict = dict(zip(list(tr_qrel['tweet_id']), list(tr_qrel['vclaim_id'])))

# for t, v in tr_qrel_dict.items():
#     temp = raw_tr_X[raw_tr_X['tid'] == t]
#     index = temp[temp['vid'] == v].index
#     raw_tr_X.loc[index, 'class'] = 5

In [24]:
# raw_tr_X[:10]

In [26]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with explicit per-class weights.
clf = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    class_weight={0: 0.0001, 1: 0.999},
)
clf.fit(X_tr, y_tr)

# Ranking score: coefficient-weighted sum of the features (the intercept is
# left out). NOTE: the LinearSVC cell further down rebinds `score`.
score = np.sum(X_dev * clf.coef_, axis=1)

In [None]:
# lr_y_tr = raw_tr_X['class'].values

# from sklearn.linear_model import LinearRegression

# regr = LinearRegression().fit(X_tr, lr_y_tr)

# score = (X_dev * regr.coef_).sum(axis=1)

# print(regr.coef_)

In [41]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the four features. `random_state` is pinned (as in the
# logistic-regression cell) so the fitted coefficients do not depend on the
# solver's random number generator from run to run.
svc = LinearSVC(random_state=0).fit(X_tr, y_tr)

# Ranking score: coefficient-weighted sum of the features (the intercept is
# left out, which does not change the ordering of candidates).
score = (X_dev * svc.coef_[0]).sum(axis=1)

print(svc.coef_)

[[0.67734033 0.04293891 0.65394966 0.04453936]]


In [42]:
# Attach the current model's ranking score to every (tweet, claim) dev row.
raw_dev_X = raw_dev_X.assign(score=score)

raw_dev_X.head(10)

Unnamed: 0,tid,vid,cos_c,Euclidean_c,cos_t,Euclidean_t,label,score
0,454.0,0.0,0.576904,1.084875,0.557494,1.147098,0.0,0.853008
1,454.0,1.0,0.66409,1.10999,0.584389,1.405756,0.0,0.942249
2,454.0,2.0,0.548836,1.077028,0.502754,1.195149,0.0,0.800002
3,454.0,3.0,0.516021,1.145715,0.613488,1.20985,0.0,0.853794
4,454.0,4.0,0.394544,1.322593,0.416105,1.289802,0.0,0.65359
5,454.0,5.0,0.554041,1.164412,0.566732,1.284574,0.0,0.853101
6,454.0,6.0,0.35875,1.391273,0.379697,1.472143,0.0,0.616607
7,454.0,7.0,0.415876,1.391316,0.415876,1.391316,0.0,0.675362
8,454.0,8.0,0.468508,1.25786,0.467076,1.244315,0.0,0.732216
9,454.0,9.0,0.60062,1.088224,0.539521,1.262613,0.0,0.862607


In [28]:
# For every gold (tweet, claim) pair: rank all candidate claims of the tweet by
# score and add the reciprocal rank of the gold claim.
# `DataFrame.append` was removed in pandas 2.0 and re-copied the growing frame
# on every iteration, so the ranked frames are collected in a list and
# concatenated once at the end.
ranked_frames = []
MAP = 0
# Iterating over the columns avoids label-based `dev_qrel.loc[i, ...]` lookups,
# which only worked for a default RangeIndex.
for tid, vid in zip(dev_qrel['tweet_id'], dev_qrel['vclaim_id']):
    ranked = (
        raw_dev_X[raw_dev_X['tid'] == tid]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )
    gold_positions = ranked[ranked['vid'] == vid].index
    # The pair has no scored row (tweet or claim without a vector): it adds
    # nothing to the sum but still counts in the denominator below.
    if len(gold_positions) == 0:
        continue
    MAP += 1 / (gold_positions[0] + 1)
    ranked_frames.append(ranked)

result = pd.concat(ranked_frames) if ranked_frames else pd.DataFrame()

# Guard against an empty qrels file instead of dividing by zero.
print("MAP without cutoff: ", MAP / len(dev_qrel) if len(dev_qrel) else 0.0)

MAP without cutoff:  0.0123198888095347


In [43]:
# Build the submission table: for every dev tweet keep only its single
# best-scoring verified claim (rank 1).
# Changes from the previous loop:
#   * no `DataFrame.append` (removed in pandas 2.0),
#   * `idxmax` returns index *labels*, so rows are fetched with `.loc`
#     rather than the positional `.iloc`,
#   * columns are read by name instead of by position (`inf[0]`, `inf[7]`),
#     which silently depended on the column order of `raw_dev_X`.
best_row_labels = raw_dev_X.groupby(by='tid')['score'].idxmax()
best_rows = raw_dev_X.loc[best_row_labels]

result_new = pd.DataFrame({
    'tweet_id': best_rows['tid'].astype(int),
    'Q0': 'Q0',
    'vclaim_id': best_rows['vid'].astype(int),
    'rank': 1,
    'score': best_rows['score'],
    'tag': 'COVID-19',
}).reset_index(drop=True)
      

In [44]:
result_new[:10]

Unnamed: 0,tweet_id,Q0,vclaim_id,rank,score,tag
0,0,Q0,507,1,0.930955,COVID-19
1,12,Q0,603,1,1.003638,COVID-19
2,23,Q0,460,1,0.847953,COVID-19
3,30,Q0,71,1,0.906689,COVID-19
4,33,Q0,207,1,0.977914,COVID-19
5,36,Q0,25,1,1.003693,COVID-19
6,39,Q0,626,1,0.816548,COVID-19
7,49,Q0,105,1,0.873709,COVID-19
8,51,Q0,453,1,0.850258,COVID-19
9,62,Q0,518,1,1.024553,COVID-19


In [45]:
from pathlib import Path

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing the header-less, tab-separated file.
output_path = Path('dev_set_results/golf_system_result_0.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
result_new.to_csv(output_path, header=False, index=False, sep='\t')

In [40]:
"""
Format check: Passed
          metric @depth  score
             map      1  0.000
             map      3  0.000
             map      5  0.000
             map     10  0.000
             map     20  0.000
             map    all  0.000
       precision      1  0.000
       precision      3  0.000
       precision      5  0.000
       precision     10  0.000
       precision     20  0.000
       precision    all  0.000
 reciprocal_rank      1  0.000
 reciprocal_rank      3  0.000
 reciprocal_rank      5  0.000
 reciprocal_rank     10  0.000
 reciprocal_rank     20  0.000
 reciprocal_rank    all  0.000
"""

'\nFormat check: Passed\n          metric @depth  score\n             map      1  0.000\n             map      3  0.000\n             map      5  0.000\n             map     10  0.000\n             map     20  0.000\n             map    all  0.000\n       precision      1  0.000\n       precision      3  0.000\n       precision      5  0.000\n       precision     10  0.000\n       precision     20  0.000\n       precision    all  0.000\n reciprocal_rank      1  0.000\n reciprocal_rank      3  0.000\n reciprocal_rank      5  0.000\n reciprocal_rank     10  0.000\n reciprocal_rank     20  0.000\n reciprocal_rank    all  0.000\n'

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

# clf = RandomForestClassifier(max_depth=20, random_state=0, n_estimators=100).fit(X_tr, y_tr)
# prob = clf.predict_proba(X_dev)[:, 1]
# result_new = constructor(dev_tweet_vclaim, dev_X, prob)
# score = scorer(dev_qrel, result_new)
# print("2. Random Forest, MAP: ", score)

# from sklearn.neighbors import KNeighborsClassifier
# neigh = KNeighborsClassifier(n_neighbors=3).fit(X_tr, y_tr)
# prob = neigh.predict_proba(X_dev)[:, 1]
# result_new = constructor(dev_tweet_vclaim, dev_X, prob)
# score = scorer(dev_qrel, result_new)
# print("3. K neighbours, MAP: ", score)

# from sklearn.ensemble import GradientBoostingClassifier

# clf = GradientBoostingClassifier(random_state=0).fit(X_tr, y_tr)
# prob = clf.predict_proba(X_dev)[:, 1]
# result_new = constructor(dev_tweet_vclaim, dev_X, prob)
# score = scorer(dev_qrel, result_new)
# print("4. Gradient Boosting, MAP: ", score)