In [1]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import stopwords as stopwords_corpus
from nltk.stem import PorterStemmer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the training tweets from the tab-separated file; the first column becomes
# the index, and rows are ordered by that index.
tweets = pd.read_csv(
    'data/train/tweets.queries.tsv',
    sep='\t',
    header=0,
    index_col=0,
).sort_index()
tweets.head(5)

Unnamed: 0,tweet_content
1,Trump needs to immediately divest from his bus...
2,A number of fraudulent text messages informing...
3,Fact check: The U.S. Army is NOT contacting an...
4,The US drone attack on #Soleimani caught on ca...
5,1. To the dim witted reporters like @dmedin11:...


In [3]:
# Load the verified claims (columns `vclaim` and `title`) from the tab-separated
# file; the first column becomes the index, and rows are ordered by that index.
vclaims = pd.read_csv(
    'data/verified_claims.docs.tsv',
    sep='\t',
    header=0,
    index_col=0,
).sort_index()
vclaims.head(5)

Unnamed: 0,vclaim,title
0,122 detainees released from confinement at Gua...,Did 122 Prisoners Released from Guantanamo by ...
1,"A ""Trump and Obama by the Numbers"" meme recoun...",Does This Meme Accurately Show ‘Trump and Obam...
2,"A ""large-scale killing"" of white farmers is ta...",Is a ‘Large-Scale Killing’ of White Farmers Un...
3,"A ""law to separate families"" was enacted prior...",Was the ‘Law to Separate Families’ Passed in 1...
4,"A ""newly uncovered"" photograph reveals Alexand...",Does an Image Show Ocasio-Cortez Fake-Crying a...


In [4]:
# Load the tweet -> verified-claim relevance pairs. The file has no header row,
# so column names are supplied here; rows are indexed and ordered by tweet_id.
qrels = pd.read_table(
    'data/train/tweet-vclaim-pairs.qrels',
    names=['tweet_id', '0', 'vclaim_id', 'relevance'],
    index_col='tweet_id',
).sort_index()
qrels.head(5)

Unnamed: 0_level_0,0,vclaim_id,relevance
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,395,1
2,0,669,1
3,0,669,1
4,0,144,1
5,0,84,1


In [5]:
# English stop-word list, used as the default `stopwords` argument of the
# preprocessing functions.
# It is built from the aliased corpus reader (`stopwords_corpus`) rather than from
# `stopwords` itself: this assignment rebinds the name `stopwords` to a list, so
# calling `stopwords.words(...)` again on a re-run of this cell would raise
# AttributeError.
stopwords = stopwords_corpus.words('english')

# Regex matching URLs (scheme-prefixed, `www`-prefixed, or bare `domain.tld/` forms).
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
# `@` followed by any run of non-whitespace characters.
username_pattern = r"@[^\s]+"
# `#` (not preceded by a word character) followed by at least two word characters.
hashtag_pattern = r"\B#\w\w+"
# Tokens are runs of at least two ASCII letters; digits are not kept.
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"
# Alternative that would also keep digits:
# token_pattern = r"\b[A-Za-z0-9][A-Za-z0-9]+\b"

In [6]:
def preprocess_tweet(tweet,
                     url_pattern=url_pattern, username_pattern=username_pattern,
                     hashtag_pattern=hashtag_pattern, token_pattern=token_pattern,
                     remove_url=True, remove_username=True, remove_hashtag=True,
                     stopwords=stopwords, with_stopwordsrm=True, with_stemming=True):
    """Normalise a raw tweet into a single space-joined string of tokens.

    Steps, in order: keep only the text before the first em dash, optionally
    strip URLs, @usernames and #hashtags, lower-case, tokenise with
    ``token_pattern``, optionally drop stop words, optionally Porter-stem.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    url_pattern, username_pattern, hashtag_pattern : str
        Regexes whose matches are deleted when the corresponding
        ``remove_*`` flag is truthy.
    token_pattern : str
        Regex passed to ``re.findall`` to extract tokens from the
        lower-cased text.
    remove_url, remove_username, remove_hashtag : bool
        Toggle each removal step.
    stopwords : iterable of str
        Words dropped when ``with_stopwordsrm`` is truthy.
    with_stopwordsrm : bool
        Toggle stop-word removal.
    with_stemming : bool
        Toggle Porter stemming.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Keep only the text before the first em dash.
    # NOTE(review): presumably the part after the dash is an attribution/date
    # suffix in this dataset -- confirm against the raw data.
    tweet = tweet.split('—')[0]

    # Apply the enabled removals in the fixed order: URL, username, hashtag.
    removal_steps = (
        (remove_url, url_pattern),
        (remove_username, username_pattern),
        (remove_hashtag, hashtag_pattern),
    )
    for enabled, pattern in removal_steps:
        if enabled:
            tweet = re.sub(pattern, "", tweet)

    # Lower-case, then tokenise.
    words = re.findall(token_pattern, tweet.lower())

    if with_stopwordsrm:
        # A set gives constant-time membership tests instead of scanning the
        # stop-word list once per token.
        stopword_set = set(stopwords)
        words = [word for word in words if word not in stopword_set]

    if with_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return " ".join(words)


In [7]:
# tweets_prep maps tweet_id -> preprocessed tweet text
tweets_prep = {
    tweet_id: preprocess_tweet(tweets.loc[tweet_id, 'tweet_content'])
    for tweet_id in tweets.index
}

In [8]:
# Spot-check: show the preprocessed text stored for tweet id 106.
tweets_prep[106]

'biden grand wizard kkk play lie use vote creator kkk oppos civil right black yup democrat parti'

In [9]:
def preprocess_text(text, token_pattern=token_pattern, stopwords=stopwords,
                    with_stopwordsrm=True, with_stemming=True):
    """Normalise plain text (a verified claim or its title) into a token string.

    Steps, in order: lower-case, tokenise with ``token_pattern``, optionally
    drop stop words, optionally Porter-stem.

    Parameters
    ----------
    text : str
        Text to normalise.
    token_pattern : str
        Regex passed to ``re.findall`` to extract tokens from the
        lower-cased text.
    stopwords : iterable of str
        Words dropped when ``with_stopwordsrm`` is truthy.
    with_stopwordsrm : bool
        Toggle stop-word removal.
    with_stemming : bool
        Toggle Porter stemming.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Lower-case, then tokenise.
    words = re.findall(token_pattern, text.lower())

    if with_stopwordsrm:
        # A set gives constant-time membership tests instead of scanning the
        # stop-word list once per token.
        stopword_set = set(stopwords)
        words = [word for word in words if word not in stopword_set]

    if with_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return " ".join(words)

In [10]:
# vclaim_prep maps vclaim_id -> [preprocessed vclaim content, preprocessed vclaim title]
vclaim_prep = {
    vclaim_id: [
        preprocess_text(vclaims.loc[vclaim_id, column])
        for column in ('vclaim', 'title')
    ]
    for vclaim_id in vclaims.index
}

In [11]:
# Spot-check: show the [content, title] pair stored for vclaim id 0.
vclaim_prep[0]

['detaine releas confin guantanamo bay presid obama engag terrorist activ',
 'prison releas guantanamo presid obama return battlefield']

## Computing Cosine Similarity:

In [13]:
# Shared TF-IDF vectorizer; cosine_sim re-fits it on every call.
vectorizer = TfidfVectorizer()

def cosine_sim(text1, text2):
    """Return the cosine similarity between two texts under a pairwise TF-IDF model.

    The vectorizer is fitted on just these two texts, so vocabulary and IDF
    weights come from the pair alone, not from the whole corpus. With the
    default ``TfidfVectorizer`` settings each row is L2-normalised, so the dot
    product of the two rows is their cosine similarity.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare (here: already preprocessed, space-joined tokens).

    Returns
    -------
    float
        The similarity; 0.0 when either text is blank or when no token
        survives the vectorizer's tokenisation.
    """
    # A blank text has an all-zero vector, so the similarity is 0. Returning
    # early also avoids the "empty vocabulary" ValueError that fit_transform
    # raises when both texts are blank.
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        tfidf = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by scikit-learn when the fitted vocabulary is empty, e.g. when
        # every token is shorter than the vectorizer's minimum token length.
        return 0.0

    # `@` is always a matrix product and `.toarray()` is the documented dense
    # conversion; `*` and `.A` only behave this way on legacy sparse matrices.
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [16]:
# For every tweet, compute its cosine similarity against each verified claim's
# content and title. Each dict maps tweet_id -> list of scores, in the iteration
# order of vclaim_prep.
cos_sims_content = {
    tweet_id: [cosine_sim(tweet_text, texts[0]) for texts in vclaim_prep.values()]
    for tweet_id, tweet_text in tweets_prep.items()
}
cos_sims_title = {
    tweet_id: [cosine_sim(tweet_text, texts[1]) for texts in vclaim_prep.values()]
    for tweet_id, tweet_text in tweets_prep.items()
}

In [25]:
# One column per tweet id, one row per verified claim.
# NOTE(review): row labels are positional (0..N-1, in vclaim_prep order), not
# vclaim ids -- the two only coincide if the vclaim ids are exactly 0..N-1; verify.
df_cos_sims_content = pd.DataFrame(cos_sims_content)
df_cos_sims_title = pd.DataFrame(cos_sims_title)

In [26]:
# Preview the first five rows of the tweet-vs-claim-content cosine similarities
# (columns are tweet ids).
df_cos_sims_content.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,992,993,994,995,996,997,998,999,1001,1002
0,0.0,0.0,0.0,0.0,0.055436,0.03461,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.063355,0.049083,0.0,0.0,0.028343,0.0,0.0,0.0,0.0,0.0,...,0.0,0.063355,0.063355,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.028343,0.0,0.033715,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Persist both cosine-similarity tables as CSV files.
for csv_path, sims in (
    ('data/train/cosine_sim_tweet_vclaimcontent.csv', df_cos_sims_content),
    ('data/train/cosine_sim_tweet_vclaimtitle.csv', df_cos_sims_title),
):
    sims.to_csv(csv_path)

## Computing BM25 Score:

In [12]:
# Whitespace-tokenise the preprocessed texts for BM25.
# str.split() with no argument is used on purpose: for an empty string it returns
# [] whereas split(" ") returns [''], a phantom empty-string token that would be
# counted in document lengths and let empty tweets match empty documents.
# For the single-space-joined preprocessed text both calls otherwise agree.
tokenized_contents = [vclaim[0].split() for vclaim in vclaim_prep.values()]
tokenized_titles = [vclaim[1].split() for vclaim in vclaim_prep.values()]

# tokenized_tweets maps tweet_id -> list of tokens
tokenized_tweets = {
    tweet_id: tweet_content.split()
    for tweet_id, tweet_content in tweets_prep.items()
}

In [13]:
# Build one BM25 index over the claim contents and one over the claim titles.
bm25kapi_content = BM25Okapi(tokenized_contents)
bm25kapi_title = BM25Okapi(tokenized_titles)

# Score every tweet (as the query) against each index; each dict maps
# tweet_id -> the scores returned by get_scores for that tweet's tokens.
bm25s_content = {
    tweet_id: bm25kapi_content.get_scores(query_tokens)
    for tweet_id, query_tokens in tokenized_tweets.items()
}
bm25s_title = {
    tweet_id: bm25kapi_title.get_scores(query_tokens)
    for tweet_id, query_tokens in tokenized_tweets.items()
}

In [14]:
# One column per tweet id, one row per indexed document.
# NOTE(review): row labels are positional (0..N-1, in the order the documents
# were tokenised), not vclaim ids -- verify the two coincide.
df_bm25s_content = pd.DataFrame(bm25s_content)
df_bm25s_title = pd.DataFrame(bm25s_title)

In [16]:
# Preview the first five rows of the tweet-vs-claim-content BM25 scores
# (columns are tweet ids).
df_bm25s_content.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,992,993,994,995,996,997,998,999,1001,1002
0,0.0,0.0,0.0,0.0,4.508746,6.114621,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.200489,5.15244,0.0,0.0,3.385839,0.0,0.0,0.0,0.0,0.0,...,0.0,1.100244,1.100244,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,5.15244,0.0,4.335054,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Persist both BM25 score tables as CSV files.
for csv_path, scores in (
    ('data/train/bm25_tweet_vclaimcontent.csv', df_bm25s_content),
    ('data/train/bm25_tweet_vclaimtitle.csv', df_bm25s_title),
):
    scores.to_csv(csv_path)