
# Evaluation Notebook
---

In [1]:
import pandas as pd
import re
import string
import math
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

---
### Importing the Dataset
---

In [2]:
"https://www.kaggle.com/davidwallach/financial-tweets"

# Load the verified-tweets export, skipping malformed rows (with a warning)
# instead of aborting. `on_bad_lines` is the current spelling of the old
# `error_bad_lines=False` flag, which recent pandas releases reject; older
# pandas rejects the new keyword with a TypeError, so fall back in that case.
try:
    df = pd.read_csv('stockerbot-export(Verified Tweets).csv', on_bad_lines='warn')
except TypeError:
    df = pd.read_csv('stockerbot-export(Verified Tweets).csv', error_bad_lines=False)
df.head()

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1.0197e+18,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1.01971e+18,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1.01971e+18,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1.01972e+18,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1.01972e+18,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


---
### Text Processing
---

In [3]:
import emoji

def remove_emojis(text: str) -> str:
    """Return ``text`` with emoji removed.

    Uses ``emoji.replace_emoji`` when the installed ``emoji`` package offers
    it, because the ``UNICODE_EMOJI`` table the per-character filter relies on
    is not available in current releases of the package. On older installs the
    per-character filter is kept as a fallback.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without emoji.
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')

    emoji_table = emoji.UNICODE_EMOJI
    # NOTE(review): some releases nest the table per language code
    # ({'en': {...}, ...}); in that layout a membership test on the outer dict
    # never matches an emoji, so look inside the English table when present.
    emoji_table = emoji_table.get('en', emoji_table)
    return ''.join(c for c in text if c not in emoji_table)

# Regex clean-up passes, applied to every lower-cased tweet in exactly this order.
cleanup_passes = [
    (r'https?:\/\/\S+', ''),                          # http:// and https:// URLs
    (r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", ''),      # ".com" domain fragments
    (r'{link}', ''),                                  # the "{link}" placeholder
    (r"\[video\]", ''),                               # the "[video]" tag
    (r'&[a-z]+;', ''),                                # "&...;" entities
    (r'[@$#]+', ''),                                  # runs of @, $ and # characters
    ("([^\x00-\x7F])+", " "),                         # each run of non-ASCII characters -> one space
]

# Work on a deep copy so the raw frame `df` stays untouched.
df2 = df.copy(deep=True)

cleaned_tweets = df2['text'].str.lower()
for pattern, replacement in cleanup_passes:
    cleaned_tweets = cleaned_tweets.apply(lambda tweet: re.sub(pattern, replacement, tweet))

# Final pass: strip whatever the emoji helper still recognises.
df2['tokens'] = cleaned_tweets.apply(remove_emojis)

In [4]:
# Plain Python list holding one processed tweet string per document.
documents = df2['tokens'].tolist()

In [5]:
# Tokenize each processed tweet by splitting on single space characters.
document_terms = list(map(lambda doc: doc.split(' '), documents))


In [6]:
# Vectorize the processed tweets (English stop words removed) and collect the vocabulary.
vectorizer = CountVectorizer(stop_words='english')
documents_vectorized = vectorizer.fit_transform(documents)

# `get_feature_names()` is no longer available in recent scikit-learn releases;
# `get_feature_names_out()` is its replacement. Fall back to the old method so
# the cell still runs on installs that predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

In [7]:
# Rows of the document-term matrix are documents, columns are vocabulary terms.
n_documents, n_terms = documents_vectorized.shape
print ('We have a {} document corpus with a {} term vocabulary'.format(n_documents, n_terms))

We have a 363 document corpus with a 2563 term vocabulary


In [8]:
# Dense document-term count table: one row per tweet, one column per vocabulary term.
term_counts = documents_vectorized.toarray()
df3 = pd.DataFrame(data=term_counts, columns=vocabulary)
doc_ids = df3.index.values
df3.head(5)

Unnamed: 0,00,0000,01,02,03,04,05,06,06m,07,...,yoy,ytd,yum,zero,zim,zimbabwe,zolmaxnews,zto,zts,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
### Queries and Real Judgement
---

**Creating the queries**

In [9]:
# Evaluating a search engine over this data needs two ingredients:
#   1. queries
#   2. relevance judgements for those queries
#
# QUERIES: mapping of query id -> query text.
queries = {
    0: 'google amazon',
    1: 'stocks bitcoin',
}

**Creating the ground-truth relevance judgements: a tweet is judged relevant to a query when it contains at least one of the query's terms**

In [10]:
### Query 1 - 'google amazon'  ###

# Per-tweet counts of the two query terms, side by side.
df_test = df3[['google', 'amazon']].copy()

def relevance1(row):
    """Return 1 when the row's 'google' and 'amazon' counts sum to more than zero, else 0."""
    mentions = row.google + row.amazon
    return 1 if mentions > 0 else 0

    
# Label every tweet for query 0, then expose the row index as the document id.
q1_labels = df_test.apply(relevance1, axis=1)
df_test = (
    df_test
    .reset_index()
    .assign(q_id=0)
    .assign(relevance=q1_labels)
)
df_test = df_test.rename(columns={"index": "doc_id"})

# Keep only the judgement triple, in (q_id, doc_id, relevance) order.
df2_test = df_test.drop(columns=['google', 'amazon'])[["q_id", "doc_id", "relevance"]]
records = df2_test.to_records(index=False)

# Relevance judgements as [(q_id, doc_id, judgement), ...]; judgement is 0 or 1, 1 = relevant.
list1 = list(records)
list1[:5]

[(0, 0, 0), (0, 1, 0), (0, 2, 0), (0, 3, 0), (0, 4, 0)]

In [11]:
### Query 2 - 'stocks bitcoin'  ###

# Per-tweet counts of the two query terms, side by side.
df_test3 = df3[['stocks', 'bitcoin']].copy()

def relevance2(row):
    """Return 1 when the row's 'stocks' and 'bitcoin' counts sum to more than zero, else 0."""
    mentions = row.stocks + row.bitcoin
    return 1 if mentions > 0 else 0
    
# Label every tweet for query 1 and tag the rows with that query id.
q2_labels = df_test3.apply(relevance2, axis=1)
df_test3 = (
    df_test3
    .reset_index()
    .assign(q_id=1)
    .assign(relevance=q2_labels)
)

# Drop the raw term counts, expose the row index as the document id and
# keep only the judgement triple, in (q_id, doc_id, relevance) order.
df_q2 = df_test3.drop(columns=['stocks', 'bitcoin'])
df_q2 = df_q2.rename(columns={"index": "doc_id"})
df_q2 = df_q2[["q_id", "doc_id", "relevance"]]
records2 = df_q2.to_records(index=False)

# Relevance judgements as [(q_id, doc_id, judgement), ...]; judgement is 0 or 1, 1 = relevant.
list2 = list(records2)
list2[:5]

[(1, 0, 0), (1, 1, 0), (1, 2, 0), (1, 3, 1), (1, 4, 0)]

In [12]:
# Complete relevance-judgement list: query 0 judgements followed by query 1 judgements.
qrels = [*list1, *list2]
len(qrels)

726

---
### Implementing BM25
---

In [13]:
def BM25_IDF_df(df, k_1=1.2, b=0.8):
    """
    Weight a document-term count table with BM25 term-frequency saturation times IDF.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document, one column per term, raw term counts as values.
    k_1 : float, default 1.2
        Term-frequency saturation parameter.
    b : float, default 0.8
        Strength of the document-length normalisation.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``df``. Cell (d, t) holds
        ``idf(t) * (k_1 + 1) * tf / (k_1 * ((1 - b) + b * dl / avgdl) + tf)``
        where ``idf(t) = -log(df_t / N)``, ``dl`` is the row sum of document d
        and ``avgdl`` is the mean row sum. Terms that occur in no document get
        a weight of 0 rather than NaN.
    """
    tf = df.to_numpy(dtype=float)
    N = tf.shape[0]

    # Document frequency per term; a term absent from every document would give
    # log(0) = -inf and then 0 * inf = NaN, so its IDF is pinned to 0 instead.
    dfs = (tf > 0).sum(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        idfs = np.where(dfs > 0, -np.log(dfs / N), 0.0)

    # Document lengths relative to the average; fall back to a neutral ratio of 1
    # when there are no documents or every document is empty.
    dls = tf.sum(axis=1)
    avgdl = dls.mean() if N else 0.0
    length_ratio = dls / avgdl if avgdl > 0 else np.ones_like(dls)

    numerator = (k_1 + 1) * tf
    denominator = (k_1 * ((1 - b) + b * length_ratio)).reshape(N, 1) + tf

    BM25_tf = numerator / denominator
    BM25_score = BM25_tf * idfs

    # Label the result from the input frame itself rather than from a notebook global.
    return pd.DataFrame(BM25_score, index=df.index, columns=df.columns)


In [14]:
# Weight the raw term counts in df3 with BM25-IDF; the result is a DataFrame of weights.
bm25_df = BM25_IDF_df(df=df3)

In [15]:
#Function to return ranked documents
def retrieve_ranking(query, bm25_df):
    """Rank every document by the sum of its BM25 weights over the query terms.

    Parameters
    ----------
    query : str
        Whitespace-separated query terms. Terms that are not columns of
        ``bm25_df`` contribute nothing to the score instead of raising.
    bm25_df : pandas.DataFrame
        One row per document, one column per term, BM25 weights as values.

    Returns
    -------
    list of (doc_id, score)
        All documents, highest score first; documents with equal scores keep
        their order from ``bm25_df``.
    """
    # split() without an argument collapses repeated whitespace, so no empty
    # tokens are produced; unknown terms are dropped to avoid a KeyError.
    q_terms = [term for term in query.split() if term in bm25_df.columns]
    score_q_d = bm25_df[q_terms].sum(axis=1)
    return sorted(
        zip(bm25_df.index.values, score_q_d.values),
        key=lambda tup: tup[1],
        reverse=True,
    )

---
### Evaluation Functions
---

In [16]:
def precision_at_k(doc_ranking, qrels, query_id, k=5):
    """Compute precision over the top-``k`` documents of a ranking.

    Parameters
    ----------
    doc_ranking : sequence of (doc_id, score)
        Ranked documents, best first; only the first ``k`` entries are used.
    qrels : iterable of (q_id, doc_id, judgement)
        Relevance judgements; judgement is 1 for relevant and 0 for non-relevant.
    query_id
        Id of the query whose judgements should be used.
    k : int, default 5
        Cut-off rank.

    Returns
    -------
    (TP, FP, precision)
        Number of retrieved relevant documents, number of retrieved
        non-relevant documents, and ``TP / (TP + FP)``. Precision is 0.0 when
        none of the retrieved documents has a judgement for this query.
    """
    retrieved = {doc[0] for doc in doc_ranking[:k]}  # document ids only, scores dropped

    relevant_doc_ids = set()
    non_relevant_doc_ids = set()
    for qrel in qrels:
        if qrel[0] != query_id:
            continue
        if qrel[-1] == 1:
            relevant_doc_ids.add(qrel[1])
        elif qrel[-1] == 0:
            non_relevant_doc_ids.add(qrel[1])

    TP = len(retrieved & relevant_doc_ids)
    FP = len(retrieved & non_relevant_doc_ids)

    # Guard the division: with k == 0 or no judged document in the top k the
    # denominator is zero.
    judged = TP + FP
    precision = TP / judged if judged else 0.0

    return TP, FP, precision

In [17]:
def f1_score_at_k(doc_ranking, qrels, query_id, k=5):
    """Compute the F1-score over the top-``k`` documents of a ranking.

    Parameters
    ----------
    doc_ranking : sequence of (doc_id, score)
        Ranked documents, best first; only the first ``k`` entries are used.
    qrels : iterable of (q_id, doc_id, judgement)
        Relevance judgements; judgement is 1 for relevant and 0 for non-relevant.
    query_id
        Id of the query whose judgements should be used.
    k : int, default 5
        Cut-off rank.

    Returns
    -------
    float
        Harmonic mean of precision ``TP / (TP + FP)`` and recall
        ``TP / (TP + FN)``. Any ratio with a zero denominator counts as 0.0,
        so the result is 0.0 whenever no relevant document is retrieved.
    """
    retrieved = {doc[0] for doc in doc_ranking[:k]}  # document ids only, scores dropped

    relevant_doc_ids = set()
    non_relevant_doc_ids = set()
    for qrel in qrels:
        if qrel[0] != query_id:
            continue
        if qrel[-1] == 1:
            relevant_doc_ids.add(qrel[1])
        elif qrel[-1] == 0:
            non_relevant_doc_ids.add(qrel[1])

    TP = len(retrieved & relevant_doc_ids)
    FP = len(retrieved & non_relevant_doc_ids)
    FN = len(relevant_doc_ids - retrieved)

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0

    # With TP == 0 both precision and recall are zero and the harmonic mean is
    # undefined; report 0.0 instead of dividing by zero.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


In [21]:
def Recall(doc_ranking, qrels, query_id, k=None):
    """Compute recall of a ranking for one query.

    Parameters
    ----------
    doc_ranking : sequence of (doc_id, score)
        Ranked documents, best first.
    qrels : iterable of (q_id, doc_id, judgement)
        Relevance judgements; judgement is 1 for relevant.
    query_id
        Id of the query whose judgements should be used.
    k : int or None, default None
        Optional cut-off rank. ``None`` uses the whole ranking; note that a
        ranking listing every document then always has recall 1.0 for any
        query with at least one relevant document.

    Returns
    -------
    float
        ``TP / (TP + FN)``, or 0.0 when the query has no relevant documents.
    """
    retrieved = {doc[0] for doc in doc_ranking[:k]}  # document ids only, scores dropped

    relevant_doc_ids = {
        qrel[1] for qrel in qrels if qrel[0] == query_id and qrel[-1] == 1
    }

    TP = len(retrieved & relevant_doc_ids)
    FN = len(relevant_doc_ids - retrieved)

    # TP + FN is the number of relevant documents; guard the empty case.
    total_relevant = TP + FN
    return TP / total_relevant if total_relevant else 0.0

---
### Evaluation Calculations
---

In [19]:
# Rank the corpus for every query and report Precision@k next to the F1-score at the same cut-off.
k = 5
report_template = 'retrieved query "{}" with Precision@{} = {} and F1-score = {}'

for query_id in queries:
    query = queries[query_id]
    doc_ranking = retrieve_ranking(query, bm25_df)

    tp, fp, precision = precision_at_k(doc_ranking, qrels, query_id, k=k)
    f1_score = f1_score_at_k(doc_ranking, qrels, query_id, k=k)

    print(report_template.format(query, k, precision, f1_score))


retrieved query "google amazon" with Precision@5 = 1.0 and F1-score = 0.47619047619047616
retrieved query "stocks bitcoin" with Precision@5 = 1.0 and F1-score = 0.3448275862068966


In [22]:
# Rank the corpus for every query and report recall over the complete ranking.
recall_template = 'retrieved query "{}" with Recall = {}'

for query_id in queries:
    query = queries[query_id]
    doc_ranking = retrieve_ranking(query, bm25_df)

    recall = Recall(doc_ranking, qrels, query_id)
    print(recall_template.format(query, recall))


retrieved query "google amazon" with Recall = 1.0
retrieved query "stocks bitcoin" with Recall = 1.0
