In [1]:
# preprocess Links

In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [3]:
import requests
from bs4 import BeautifulSoup
import time

def get_clean_data(url):
    try :
        content = requests.get(url)
        content = content.text
        
    except(ConnectionError, Exception):
        content = ''
        
    bs = BeautifulSoup(content,'html.parser')
    texts = bs.findAll(['title','p','strong','li',re.compile('^h[1-6]$')])
    return texts


In [4]:
import re
def remove_tags(content):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', content)
    
    return cleantext

In [5]:
def remove_punctuation(content):
    content = re.sub("_", "", content)
    content = re.sub("[^\w\s]", "", content)
    content = re.sub(' +', ' ', content)
    content = re.sub("\n", "", content)
    return content

In [6]:
def convert_to_vector(content,vocabulary):
    vector_dict = dict.fromkeys(vocabulary,0)
    doc = nlp(content)

    for token in doc:
        if token.text.lower() in vector_dict:
            vector_dict[token.text.lower()] += 1
    
    return vector_dict.values()

In [7]:
def create_vocabulary(list_of_queries):
    vocabulary = set()
    for query in list_of_queries:
        words = query.lower().split()
        vocabulary.update(words)
    
    return vocabulary   

In [8]:
import pandas as pd

In [10]:
dataset = pd.read_csv('Queries.csv', skipinitialspace = True, quotechar = '"')

In [12]:
vocab = create_vocabulary(list(set(dataset['Query'])))

In [14]:
from multiprocessing import  Pool
import numpy as np
def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [15]:
def clean_url(df):
    df['text'] = df['URL'].apply(lambda x : remove_punctuation(remove_tags(str(get_clean_data(x)))).lower())
    return df

In [16]:
dataset_with_text = parallelize_dataframe(dataset,clean_url)

In [20]:
def bag_of_word_df(dataset, vocab):
    dataset = pd.concat([dataset,pd.DataFrame(columns=list(vocab))], sort=False)
    for feature in vocab:
        dataset[feature] = dataset['text'].apply(lambda x: x.count(' ' + feature + ' '))
    return dataset

In [21]:
vector_df = bag_of_word_df(dataset_with_text, vocab)

# Query Likelihood Evaluation Baseline

In [28]:
import numpy as np
def get_likelihood_score(row, D):
    tot = 0
    for term in row['Query'].lower().split():
        tot += np.log((row[term] + 1) /(D+ len(row.keys())-1))
    return tot

In [29]:
vector_df['score'] = [get_likelihood_score(row,len(vector_df)) for ind, row in vector_df.iterrows()]

In [34]:
vector_df[vector_df['Query'] == 'Abraham Lincoln Gettysburg'].sort_values(by=['score'], ascending=False)[['Query','URL','score', 'Ranking']]

Unnamed: 0,Query,URL,score,Ranking
0,Abraham Lincoln Gettysburg,http://www.abrahamlincolnonline.org/lincoln/sp...,-3.405624,1.0
12,Abraham Lincoln Gettysburg,https://quod.lib.umich.edu/j/jala/2629860.0016...,-6.947179,0.0
8,Abraham Lincoln Gettysburg,https://www.nationalaffairs.com/publications/d...,-9.333277,4.0
11,Abraham Lincoln Gettysburg,https://www.theatlantic.com/magazine/archive/2...,-10.114617,4.0
1,Abraham Lincoln Gettysburg,https://www.history.com/topics/american-civil-...,-10.478409,4.0
16,Abraham Lincoln Gettysburg,https://www.wuwm.com/post/seven-facts-you-didn...,-11.58831,0.0
14,Abraham Lincoln Gettysburg,https://dp.la/primary-source-sets/battle-of-ge...,-12.760792,4.0
6,Abraham Lincoln Gettysburg,https://usa.usembassy.de/etexts/democrac/25.htm,-12.983935,3.0
13,Abraham Lincoln Gettysburg,https://www.thedailybeast.com/how-abraham-linc...,-13.022157,0.0
17,Abraham Lincoln Gettysburg,http://gettysburg.stonesentinels.com/other-mon...,-13.089296,3.0


In [None]:
"""
Note: This code is not necissary 
"""
# def score_fix(row):
#     score = row['score']
#     if vocab_df['score'].quantile(.2) > score:
#         return 0
#     elif vocab_df['score'].quantile(.4) > score:
#         return 1
#     elif vocab_df['score'].quantile(.6) > score:
#         return 2
#     elif vocab_df['score'].quantile(.8) > score:
#         return 3
#     else:
#         return 4
#
# vocab_df["fitted_score"]=vocab_df.apply(score_fix, axis=1)