# 数据预处理

In [2]:
import pandas as pd
import json
pd.set_option('display.max_colwidth', 500)

In [3]:
verified_df = pd.read_table('../data/snopes/verified-claims.tsv')
query_df = pd.read_table(
    '../data/snopes/fact-checking/tweet-veclaim-pairs.tsv')
len(verified_df), len(query_df)

(10381, 999)

In [4]:
verified_df.head()

Unnamed: 0,title,fact
0,#-9-0 Phone Scam,Pressing #-9-0 on your telephone will allow scammers to make long-distance calls and charge them to your phone bill.
1,#Boycott36: Do Clinton and Sanders Support Late-Term Abortions?,"Bernie Sanders and Hillary Clinton both seek an abortion 'cutoff date' of 36 weeks, and both claim that late-stage fetuses feel no pain and have no rights."
2,$1 McDonald's Any Size Fries,McDonald’s restaurants are offering $1 Any Size Fries between May 18 and 31.
3,$1 Real Chicken Deal Boston Market Coupon,Boston Market is offering coupons good for a $1 chicken meal.
4,$10 Bill Printing Error,New U.S. $10 bills released in 2006 contain a printing error.


In [5]:
query_df.head()

Unnamed: 0,claim,fact
0,"How are butterflies surviving the #AustralianFires? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies (Heteronympha merope) fluttering in a moist wombat hole. Footage by Julie Favell pic.twitter.com/0eJtwJyS1J — Center for Bio Div (@CenterForBioDiv) January 14, 2020",Wombats are herding animals and inviting them into their burrows in order to escape the wildfires in Australia.
1,"Trump needs to immediately divest from his businesses and comply with the emoluments clause. Iran could threaten Trump hotels *worldwide* and he could provoke war over the loss of revenue from skittish guests. His business interests should not be driving military decisions. — Ilhan Omar (@IlhanMN) January 6, 2020","In January 2020, U.S. Rep. Ilhan Omar advised Iran to attack Trump-branded hotels in the world, thus committing treason."
2,A number of fraudulent text messages informing individuals they have been selected for a military draft have circulated throughout the country this week.,The U.S. Army is sending text messages informing people they've been selected for the military draft.
3,"Fact check: The U.S. Army is NOT contacting anyone regarding the draft. If you are receiving texts, phone calls or direct messages about a military draft, they are not official communications from the U.S. Army pic.twitter.com/3S32De8ekP — U.S. Army CGSC (@USACGSC) January 8, 2020",The U.S. Army is sending text messages informing people they've been selected for the military draft.
4,"The US drone attack on #Soleimani caught on camera.#IranUsapic.twitter.com/TvRkHvlgby — Olaudah Equiano® (@RealOlaudah) January 6, 2020",A video shows the U.S.-ordered drone strike that killed Iran Gen. Qassem Soleimani.


## 英文分词

In [6]:
import re
import nltk


def del_url_at(text):
    pattern = re.compile(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    urls = re.findall(pattern, text)
    for url in urls:
        text = text.replace(url, '')

    pattern = re.compile(
        '@(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    ats = re.findall(pattern, text)
    for at in ats:
        text = text.replace(at, '')

    text = text.replace('\n', '').replace('\r', '').replace('\t', '')
    return text


def cut_words(text):
    pattern = r"""(?x)                   # set flag to allow verbose regexps 
                  (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
                  |\d+(?:\.\d+)?%?       # numbers, incl. currency and percentages 
                  |\w+(?:[-']\w+)*       # words w/ optional internal hyphens/apostrophe 
                  |\.\.\.                # ellipsis 
                  |(?:[.,;"'?():-_`!])    # special characters with meanings 
                """

    return nltk.regexp_tokenize(del_url_at(text), pattern)

In [7]:
query_claims = query_df['claim'].tolist()
query_claims_cut = [cut_words(t) for t in query_claims]
query_df['claim_words'] = query_claims_cut

In [8]:
query_df.head()

Unnamed: 0,claim,fact,claim_words
0,"How are butterflies surviving the #AustralianFires? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies (Heteronympha merope) fluttering in a moist wombat hole. Footage by Julie Favell pic.twitter.com/0eJtwJyS1J — Center for Bio Div (@CenterForBioDiv) January 14, 2020",Wombats are herding animals and inviting them into their burrows in order to escape the wildfires in Australia.,"[How, are, butterflies, surviving, the, AustralianFires, ?, Julie, Favell, was, putting, out, water, for, wildlife, that, survived, the, fires, when, she, witnessed, common, brown, butterflies, (, Heteronympha, merope, ), fluttering, in, a, moist, wombat, hole, ., Footage, by, Julie, Favell, pic, ., twitter, ., com, 0, eJtwJyS1J, Center, for, Bio, Div, (, January, 14, ,, 2020]"
1,"Trump needs to immediately divest from his businesses and comply with the emoluments clause. Iran could threaten Trump hotels *worldwide* and he could provoke war over the loss of revenue from skittish guests. His business interests should not be driving military decisions. — Ilhan Omar (@IlhanMN) January 6, 2020","In January 2020, U.S. Rep. Ilhan Omar advised Iran to attack Trump-branded hotels in the world, thus committing treason.","[Trump, needs, to, immediately, divest, from, his, businesses, and, comply, with, the, emoluments, clause, ., Iran, could, threaten, Trump, hotels, worldwide, and, he, could, provoke, war, over, the, loss, of, revenue, from, skittish, guests, ., His, business, interests, should, not, be, driving, military, decisions, ., Ilhan, Omar, (, January, 6, ,, 2020]"
2,A number of fraudulent text messages informing individuals they have been selected for a military draft have circulated throughout the country this week.,The U.S. Army is sending text messages informing people they've been selected for the military draft.,"[A, number, of, fraudulent, text, messages, informing, individuals, they, have, been, selected, for, a, military, draft, have, circulated, throughout, the, country, this, week, .]"
3,"Fact check: The U.S. Army is NOT contacting anyone regarding the draft. If you are receiving texts, phone calls or direct messages about a military draft, they are not official communications from the U.S. Army pic.twitter.com/3S32De8ekP — U.S. Army CGSC (@USACGSC) January 8, 2020",The U.S. Army is sending text messages informing people they've been selected for the military draft.,"[Fact, check, :, The, U.S., Army, is, NOT, contacting, anyone, regarding, the, draft, ., If, you, are, receiving, texts, ,, phone, calls, or, direct, messages, about, a, military, draft, ,, they, are, not, official, communications, from, the, U.S., Army, pic, ., twitter, ., com, 3, S32De8ekP, U.S., Army, CGSC, (, January, 8, ,, 2020]"
4,"The US drone attack on #Soleimani caught on camera.#IranUsapic.twitter.com/TvRkHvlgby — Olaudah Equiano® (@RealOlaudah) January 6, 2020",A video shows the U.S.-ordered drone strike that killed Iran Gen. Qassem Soleimani.,"[The, US, drone, attack, on, Soleimani, caught, on, camera, ., IranUsapic, ., twitter, ., com, TvRkHvlgby, Olaudah, Equiano, (, January, 6, ,, 2020]"


In [9]:
verified_titles = verified_df['title'].tolist()
verified_titles_cut = [cut_words(t) for t in verified_titles]
verified_df['title_words'] = verified_titles_cut

verified_facts = verified_df['fact'].tolist()
verified_facts_cut = [cut_words(t) for t in verified_facts]
verified_df['fact_words'] = verified_facts_cut

In [10]:
verified_df.head()

Unnamed: 0,title,fact,title_words,fact_words
0,#-9-0 Phone Scam,Pressing #-9-0 on your telephone will allow scammers to make long-distance calls and charge them to your phone bill.,"[9, 0, Phone, Scam]","[Pressing, 9, 0, on, your, telephone, will, allow, scammers, to, make, long-distance, calls, and, charge, them, to, your, phone, bill, .]"
1,#Boycott36: Do Clinton and Sanders Support Late-Term Abortions?,"Bernie Sanders and Hillary Clinton both seek an abortion 'cutoff date' of 36 weeks, and both claim that late-stage fetuses feel no pain and have no rights.","[Boycott36, :, Do, Clinton, and, Sanders, Support, Late-Term, Abortions, ?]","[Bernie, Sanders, and, Hillary, Clinton, both, seek, an, abortion, ', cutoff, date, ', of, 36, weeks, ,, and, both, claim, that, late-stage, fetuses, feel, no, pain, and, have, no, rights, .]"
2,$1 McDonald's Any Size Fries,McDonald’s restaurants are offering $1 Any Size Fries between May 18 and 31.,"[1, McDonald's, Any, Size, Fries]","[McDonald, s, restaurants, are, offering, 1, Any, Size, Fries, between, May, 18, and, 31, .]"
3,$1 Real Chicken Deal Boston Market Coupon,Boston Market is offering coupons good for a $1 chicken meal.,"[1, Real, Chicken, Deal, Boston, Market, Coupon]","[Boston, Market, is, offering, coupons, good, for, a, 1, chicken, meal, .]"
4,$10 Bill Printing Error,New U.S. $10 bills released in 2006 contain a printing error.,"[10, Bill, Printing, Error]","[New, U.S., 10, bills, released, in, 2006, contain, a, printing, error, .]"


## query-verified 数据对应

In [11]:
pairs_index = []
for fact in query_df['fact'].tolist():
    qdf = verified_df[verified_df['fact'] == fact]
    pairs_index.append(qdf.index.tolist())
    
len(pairs_index)

999

In [12]:
for pairs in pairs_index:
    if len(pairs) != 1:
        print(pairs)

[2740, 3028]
[2740, 3028]
[2740, 3028]


In [13]:
query_df['verified_index'] = pairs_index

In [14]:
query_df.head()

Unnamed: 0,claim,fact,claim_words,verified_index
0,"How are butterflies surviving the #AustralianFires? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies (Heteronympha merope) fluttering in a moist wombat hole. Footage by Julie Favell pic.twitter.com/0eJtwJyS1J — Center for Bio Div (@CenterForBioDiv) January 14, 2020",Wombats are herding animals and inviting them into their burrows in order to escape the wildfires in Australia.,"[How, are, butterflies, surviving, the, AustralianFires, ?, Julie, Favell, was, putting, out, water, for, wildlife, that, survived, the, fires, when, she, witnessed, common, brown, butterflies, (, Heteronympha, merope, ), fluttering, in, a, moist, wombat, hole, ., Footage, by, Julie, Favell, pic, ., twitter, ., com, 0, eJtwJyS1J, Center, for, Bio, Div, (, January, 14, ,, 2020]",[563]
1,"Trump needs to immediately divest from his businesses and comply with the emoluments clause. Iran could threaten Trump hotels *worldwide* and he could provoke war over the loss of revenue from skittish guests. His business interests should not be driving military decisions. — Ilhan Omar (@IlhanMN) January 6, 2020","In January 2020, U.S. Rep. Ilhan Omar advised Iran to attack Trump-branded hotels in the world, thus committing treason.","[Trump, needs, to, immediately, divest, from, his, businesses, and, comply, with, the, emoluments, clause, ., Iran, could, threaten, Trump, hotels, worldwide, and, he, could, provoke, war, over, the, loss, of, revenue, from, skittish, guests, ., His, business, interests, should, not, be, driving, military, decisions, ., Ilhan, Omar, (, January, 6, ,, 2020]",[7346]
2,A number of fraudulent text messages informing individuals they have been selected for a military draft have circulated throughout the country this week.,The U.S. Army is sending text messages informing people they've been selected for the military draft.,"[A, number, of, fraudulent, text, messages, informing, individuals, they, have, been, selected, for, a, military, draft, have, circulated, throughout, the, country, this, week, .]",[6074]
3,"Fact check: The U.S. Army is NOT contacting anyone regarding the draft. If you are receiving texts, phone calls or direct messages about a military draft, they are not official communications from the U.S. Army pic.twitter.com/3S32De8ekP — U.S. Army CGSC (@USACGSC) January 8, 2020",The U.S. Army is sending text messages informing people they've been selected for the military draft.,"[Fact, check, :, The, U.S., Army, is, NOT, contacting, anyone, regarding, the, draft, ., If, you, are, receiving, texts, ,, phone, calls, or, direct, messages, about, a, military, draft, ,, they, are, not, official, communications, from, the, U.S., Army, pic, ., twitter, ., com, 3, S32De8ekP, U.S., Army, CGSC, (, January, 8, ,, 2020]",[6074]
4,"The US drone attack on #Soleimani caught on camera.#IranUsapic.twitter.com/TvRkHvlgby — Olaudah Equiano® (@RealOlaudah) January 6, 2020",A video shows the U.S.-ordered drone strike that killed Iran Gen. Qassem Soleimani.,"[The, US, drone, attack, on, Soleimani, caught, on, camera, ., IranUsapic, ., twitter, ., com, TvRkHvlgby, Olaudah, Equiano, (, January, 6, ,, 2020]",[4142]


## Dataframe -> json

In [21]:
def transfer(df):
    cols = df.columns.tolist()
    
    values = dict()
    for c in cols:
        values[c] = df[c].tolist()
    
    df_list = []
    for i in range(len(values[cols[0]])):
        items = dict()
        for c in cols:
            items[c] = values[c][i]
        df_list.append(items)
    
    return df_list

In [24]:
query_json = transfer(query_df)
verified_json = transfer(verified_df)

len(query_json), len(verified_json)

(999, 10381)

In [26]:
with open('./query_{}.json'.format(len(query_json)), 'w') as f:
    json.dump(query_json, f, indent=4)

with open('./verified_{}.json'.format(len(verified_json)), 'w') as f:
    json.dump(verified_json, f, indent=4)

# BM25 检索

In [1]:
from gensim.summarization import bm25
import numpy as np
import json

In [2]:
with open('./query_999.json', 'r') as f:
    queries = json.load(f)
with open('./verified_10381.json', 'r') as f:
    facts = json.load(f)
    
len(queries), len(facts)

(999, 10381)

In [3]:
# print(queries[100]['claim'], '\n')
# facts[queries[100]['verified_index'][0]]

In [4]:
queries[0].keys(), facts[0].keys()

(dict_keys(['claim', 'fact', 'claim_words', 'verified_index']),
 dict_keys(['title', 'fact', 'title_words', 'fact_words']))

In [5]:
search_words = [q['claim_words'] for q in queries]
len(search_words)

999

In [6]:
' '.join(search_words[0])

'How are butterflies surviving the AustralianFires ? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies ( Heteronympha merope ) fluttering in a moist wombat hole . Footage by Julie Favell pic . twitter . com 0 eJtwJyS1J Center for Bio Div ( January 14 , 2020'

In [7]:
ground_truth = [q['verified_index'] for q in queries]
len(ground_truth)

999

In [8]:
ground_truth[0]

[563]

## Title

In [9]:
facts[0]['title']

'#-9-0 Phone Scam'

In [10]:
corpus = [f['title_words'] for f in facts]
len(corpus)

10381

In [11]:
bm_model = bm25.BM25(corpus)

In [12]:
bm_results = []
for i, search in enumerate(search_words):
    scores = np.array(bm_model.get_scores(search))
    ranks = scores.argsort()[::-1]

    result = []
    for gt in ground_truth[i]:
        result.append(np.where(ranks == gt)[0][0])

    bm_results.append(result)

In [24]:
bm_results[:5]

[[6094], [0], [3782], [46], [245]]

In [27]:
for i, q in enumerate(queries):
    q['bm25_rank_title'] = bm_results[i]

In [29]:
# queries[0]

## VerClaim

In [31]:
facts[0]['fact_words']

['Pressing',
 '9',
 '0',
 'on',
 'your',
 'telephone',
 'will',
 'allow',
 'scammers',
 'to',
 'make',
 'long-distance',
 'calls',
 'and',
 'charge',
 'them',
 'to',
 'your',
 'phone',
 'bill',
 '.']

In [32]:
corpus = [f['fact_words'] for f in facts]
len(corpus)

10381

In [33]:
bm_model = bm25.BM25(corpus)

In [34]:
bm_results = []
for i, search in enumerate(search_words):
    scores = np.array(bm_model.get_scores(search))
    ranks = scores.argsort()[::-1]

    result = []
    for gt in ground_truth[i]:
        result.append(np.where(ranks == gt)[0][0])

    bm_results.append(result)

In [37]:
bm_results[:5]

[[2473], [0], [0], [0], [0]]

In [38]:
for i, q in enumerate(queries):
    q['bm25_rank_verclaim'] = bm_results[i]

In [40]:
# queries[0]

## Title + VerClaim

In [42]:
facts[0]['title_words']+facts[0]['fact_words']

['9',
 '0',
 'Phone',
 'Scam',
 'Pressing',
 '9',
 '0',
 'on',
 'your',
 'telephone',
 'will',
 'allow',
 'scammers',
 'to',
 'make',
 'long-distance',
 'calls',
 'and',
 'charge',
 'them',
 'to',
 'your',
 'phone',
 'bill',
 '.']

In [43]:
corpus = [f['title_words']+f['fact_words'] for f in facts]
len(corpus)

10381

In [44]:
bm_model = bm25.BM25(corpus)

In [45]:
bm_results = []
for i, search in enumerate(search_words):
    scores = np.array(bm_model.get_scores(search))
    ranks = scores.argsort()[::-1]

    result = []
    for gt in ground_truth[i]:
        result.append(np.where(ranks == gt)[0][0])

    bm_results.append(result)

In [46]:
bm_results[:5]

[[4703], [0], [0], [0], [0]]

In [47]:
for i, q in enumerate(queries):
    q['bm25_rank_verclaim+title'] = bm_results[i]

In [52]:
# queries[2]

## 结果导出

In [58]:
for q in queries:
#     del q['claim_words']
    for k in ['bm25_rank_title', 'bm25_rank_verclaim', 'bm25_rank_verclaim+title']:
        q[k] = [int(v) for v in q[k]]

In [59]:
queries[0]

{'claim': 'How are butterflies surviving the #AustralianFires? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies (Heteronympha merope) fluttering in a moist wombat hole. Footage by Julie Favell pic.twitter.com/0eJtwJyS1J — Center for Bio Div (@CenterForBioDiv) January 14, 2020',
 'fact': 'Wombats are herding animals and inviting them into their burrows in order to escape the wildfires in Australia.',
 'verified_index': [563],
 'bm25_rank_title': [6094],
 'bm25_rank_verclaim': [2473],
 'bm25_rank_verclaim+title': [4703]}

In [60]:
with open('./bm25_results.json', 'w') as f:
    json.dump(queries, f, indent=4)

# Evaluation

In [1]:
import numpy as np
import json

In [2]:
with open('./bm25_results.json', 'r') as f:
    results = json.load(f)
    
len(results)

999

In [3]:
results[0]

{'claim': 'How are butterflies surviving the #AustralianFires? Julie Favell was putting out water for wildlife that survived the fires when she witnessed common brown butterflies (Heteronympha merope) fluttering in a moist wombat hole. Footage by Julie Favell pic.twitter.com/0eJtwJyS1J — Center for Bio Div (@CenterForBioDiv) January 14, 2020',
 'fact': 'Wombats are herding animals and inviting them into their burrows in order to escape the wildfires in Australia.',
 'verified_index': [563],
 'bm25_rank_title': [6094],
 'bm25_rank_verclaim': [2473],
 'bm25_rank_verclaim+title': [4703]}

In [4]:
ranks_title = [r['bm25_rank_title'] for r in results]
ranks_verclaim = [r['bm25_rank_verclaim'] for r in results]
ranks_both = [r['bm25_rank_verclaim+title'] for r in results]

len(ranks_title), len(ranks_verclaim), len(ranks_both)

(999, 999, 999)

In [5]:
def MRR(ranks):
    assert type(ranks) == list

    first_ranks = []
    for r in ranks:
        first_ranks.append(min(r))

    mrr = np.mean([1/(r+1) for r in first_ranks])
    return mrr


def MAP(ranks, k):
    assert type(ranks) == list

    APs = []
    for rank_results in ranks:
        AP = 0
        i = 1

        rank_results.sort()
        for r in rank_results:
            if r+1 <= k:
                AP += i / (r + 1)
                i += 1

        AP /= len(rank_results)
        APs.append(AP)

    return np.mean(APs)


def HasPositives(ranks, k):
    assert type(ranks) == list

    Has = []
    for rank_results in ranks:
        rank_results.sort()
        if rank_results[0] + 1 <= k:
            Has.append(1)
        else:
            Has.append(0)

    return np.mean(Has)

In [6]:
print('MRR\n-------------------------')
print('IR-Title:\t{}'.format(MRR(ranks_title)))
print('IR-VerClaim:\t{}'.format(MRR(ranks_verclaim)))
print('IR-VerClaim+Title:\t{}'.format(MRR(ranks_both)))

MRR
-------------------------
IR-Title:	0.20468404563323714
IR-VerClaim:	0.5473237121863617
IR-VerClaim+Title:	0.6197977745066475


In [7]:
for k in [1, 3, 5, 10, 20, 10381]:
    print('\nMAP@{}\n-----------------------'.format(k))
    print('IR-Title:\t{}'.format(MAP(ranks_title, k)))
    print('IR-VerClaim:\t{}'.format(MAP(ranks_verclaim, k)))
    print('IR-VerClaim+Title:\t{}'.format(MAP(ranks_both, k)))


MAP@1
-----------------------
IR-Title:	0.15265265265265265
IR-VerClaim:	0.46296296296296297
IR-VerClaim+Title:	0.53003003003003

MAP@3
-----------------------
IR-Title:	0.180347013680347
IR-VerClaim:	0.522022022022022
IR-VerClaim+Title:	0.5972639305972639

MAP@5
-----------------------
IR-Title:	0.18895562228895563
IR-VerClaim:	0.5335835835835835
IR-VerClaim+Title:	0.6076743410076743

MAP@10
-----------------------
IR-Title:	0.19600830989719878
IR-VerClaim:	0.5412694440472218
IR-VerClaim+Title:	0.6155341849786294

MAP@20
-----------------------
IR-Title:	0.2003622460450806
IR-VerClaim:	0.5448288357774081
IR-VerClaim+Title:	0.618109341271106

MAP@10381
-----------------------
IR-Title:	0.20468726478057028
IR-VerClaim:	0.5473865036470026
IR-VerClaim+Title:	0.6198888377921553


In [8]:
for k in [1, 3, 5, 10, 20, 50]:
    print('\nHasPositives@{}\n-----------------------'.format(k))
    print('IR-Title:\t{}'.format(HasPositives(ranks_title, k)))
    print('IR-VerClaim:\t{}'.format(HasPositives(ranks_verclaim, k)))
    print('IR-VerClaim+Title:\t{}'.format(HasPositives(ranks_both, k)))


HasPositives@1
-----------------------
IR-Title:	0.15315315315315314
IR-VerClaim:	0.46346346346346345
IR-VerClaim+Title:	0.5305305305305306

HasPositives@3
-----------------------
IR-Title:	0.21421421421421422
IR-VerClaim:	0.5885885885885885
IR-VerClaim+Title:	0.6716716716716716

HasPositives@5
-----------------------
IR-Title:	0.25325325325325326
IR-VerClaim:	0.6386386386386387
IR-VerClaim+Title:	0.7177177177177178

HasPositives@10
-----------------------
IR-Title:	0.30430430430430433
IR-VerClaim:	0.6956956956956957
IR-VerClaim+Title:	0.7747747747747747

HasPositives@20
-----------------------
IR-Title:	0.3663663663663664
IR-VerClaim:	0.7457457457457457
IR-VerClaim+Title:	0.8098098098098098

HasPositives@50
-----------------------
IR-Title:	0.43543543543543545
IR-VerClaim:	0.8018018018018018
IR-VerClaim+Title:	0.8408408408408409
