In [1]:
# preprocess Links

In [36]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [37]:
import requests
from bs4 import BeautifulSoup
import time

def get_clean_data(url):
    try :
        content = requests.get(url)
        content = content.text
        
    except(ConnectionError, Exception):
        content = ''
        
    bs = BeautifulSoup(content,'html.parser')
    texts = bs.findAll(['title','p','strong','li',re.compile('^h[1-6]$')])
    return texts


In [38]:
import re
def remove_tags(content):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', content)
    
    return cleantext

In [39]:
def remove_punctuation(content):
    content = re.sub("_", "", content)
    content = re.sub("[^\w\s]", "", content)
    content = re.sub(' +', ' ', content)
    content = re.sub("\n", "", content)
    return content

In [40]:
def convert_to_vector(content,vocabulary):
    vector_dict = dict.fromkeys(vocabulary,0)
    doc = nlp(content)

    for token in doc:
        if token.text.lower() in vector_dict:
            vector_dict[token.text.lower()] += 1
    
    return vector_dict.values()

In [41]:
def create_vocabulary(list_of_queries):
    vocabulary = set()
    for query in list_of_queries:
        words = query.lower().split()
        vocabulary.update(words)
    
    return vocabulary   

In [42]:
import pandas as pd

In [43]:
dataset = pd.read_csv('Queries.csv', skipinitialspace = True, quotechar = '"')

In [44]:
vocab = create_vocabulary(list(set(dataset['Query'])))

In [45]:
from multiprocessing import  Pool
import numpy as np
def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [46]:
def clean_url(df):
    df['text'] = df['URL'].apply(lambda x : remove_punctuation(remove_tags(str(get_clean_data(x)))).lower())
    return df

In [48]:
dataset_with_text = parallelize_dataframe(dataset,clean_url)

In [49]:
def bag_of_word_df(dataset, vocab):
    dataset = pd.concat([dataset,pd.DataFrame(columns=list(vocab))], sort=False)
    for feature in vocab:
        dataset[feature] = dataset['text'].apply(lambda x: x.count(' ' + feature + ' '))
    return dataset

In [50]:
vector_df = bag_of_word_df(dataset_with_text, vocab)

In [93]:
queries = set(vector_df.Query)

# Query Likelihood Evaluation Baseline

In [94]:
queries

{"AJ Green's Health",
 'Abraham Lincoln Gettysburg',
 'Battles fought by Guru Gobind Singh',
 'Bill Simmons Roger Goodell',
 'Celtics 11/20/2019',
 'Elon musk interview',
 'Elon musk news',
 'Elon musk speech',
 'Facts about Sushruta',
 "Gandhi's views on truth",
 'Gladiator Russell Crowe Quotes',
 'JFK space race',
 "Jack Ma's\xa0 and Elon Musk's views on AI",
 'Jeff Bezos portrayed by top business magazines like Forbes',
 'Kim Jong-un relationship with China',
 'Leonardo da Vinci Quotes',
 'Mohammad Ali',
 'Mohammad Ali Sting like a bee',
 "Narendra Modi's view on Indian Economy",
 'President Obama Inauguration',
 "Sundar pichai's views on H-1B visa policy by the Trump Government",
 'Warren Buffet views and media interactions in regard to start-ups',
 'World Media views on Imran Khan',
 'elon musk interview with nyt',
 'elon musk latest',
 'elon musk on clean energy',
 'elon musk on self driving cars',
 'elon musk twitter',
 'quotes and opinions of elon musk, as attributed to them in

In [51]:
import numpy as np
def get_likelihood_score(row, D):
    tot = 0
    for term in row['Query'].lower().split():
        tot += np.log((row[term] + 1) /(D+ len(row.keys())-1))
    return tot

In [52]:
vector_df['score'] = [get_likelihood_score(row,len(vector_df)) for ind, row in vector_df.iterrows()]

In [None]:
"""
Note: This code is not necissary 
"""
# def score_fix(row):
#     score = row['score']
#     if vocab_df['score'].quantile(.2) > score:
#         return 0
#     elif vocab_df['score'].quantile(.4) > score:
#         return 1
#     elif vocab_df['score'].quantile(.6) > score:
#         return 2
#     elif vocab_df['score'].quantile(.8) > score:
#         return 3
#     else:
#         return 4
#
# vocab_df["fitted_score"]=vocab_df.apply(score_fix, axis=1)

In [92]:
temp

Unnamed: 0,Query,URL,score,Ranking,Place
0,Abraham Lincoln Gettysburg,http://www.abrahamlincolnonline.org/lincoln/sp...,-4.462883,1.0,0
12,Abraham Lincoln Gettysburg,https://quod.lib.umich.edu/j/jala/2629860.0016...,-8.004438,0.0,1
8,Abraham Lincoln Gettysburg,https://www.nationalaffairs.com/publications/d...,-10.390536,4.0,2
11,Abraham Lincoln Gettysburg,https://www.theatlantic.com/magazine/archive/2...,-11.171876,4.0,3
1,Abraham Lincoln Gettysburg,https://www.history.com/topics/american-civil-...,-11.535668,4.0,4
16,Abraham Lincoln Gettysburg,https://www.wuwm.com/post/seven-facts-you-didn...,-12.645569,0.0,5
14,Abraham Lincoln Gettysburg,https://dp.la/primary-source-sets/battle-of-ge...,-13.818051,4.0,6
6,Abraham Lincoln Gettysburg,https://usa.usembassy.de/etexts/democrac/25.htm,-14.041194,3.0,7
13,Abraham Lincoln Gettysburg,https://www.thedailybeast.com/how-abraham-linc...,-14.079415,0.0,8
17,Abraham Lincoln Gettysburg,http://gettysburg.stonesentinels.com/other-mon...,-14.146555,3.0,9


# NDCG Evaluation

In [179]:
def dcg(ranks, i):
    value = ranks[0]
    if i == 0:
        return value
    else:
        for x in range(1,i+1):
            value += (ranks[x])/(np.log2(x+1))
        return value

In [180]:
for query in queries:
    temp = vector_df[vector_df['Query'] == query].sort_values(by=['score'], ascending=False)[['Query','URL','score', 'Ranking']]
    
    # ----------------------------------
    y = [] 
    for x in range(len(temp)):
        y.append(x)
    temp['QL_rank'] = y
    
    # ----------------------------------
    dcg_values = [0]*len(temp)
    for i in range(len(temp)):
        dcg_values[i] = dcg(list(temp.Ranking), i)
    
    # ----------------------------------
    temp['DCG'] = dcg_values
    if query == 'Celtics 11/20/2019':
        temp.to_csv('Results/Celtic Query.csv')
    else:
        temp.to_csv('Results/' + str(query) + '.csv')


In [186]:
testing = list(temp.Ranking).sort()