# A. Crawler

In [142]:
import feedparser
import pandas as pd

In [143]:
LINK = "https://sports.yahoo.com/nba/rss/"
feed = feedparser.parse(LINK)

feed_len = len(feed.entries) #number of news in feed
old_news = 0  # count how many news in feed were already scraped

print("There are {} news in the RSS feed." .format(feed_len))

There are 49 news in the RSS feed.


In [144]:
#meta_data = pd.read_csv(PATH + "database.csv", index_col = 'Unnamed: 0')
final_data = pd.DataFrame(columns=['ID', 'Title', 'Summary', 'Link', 'Published'])
final_data.head()

Unnamed: 0,ID,Title,Summary,Link,Published


In [145]:
def article_entry(entry, ID):
  ID = ID
  Title = entry.title
  Summary = entry.summary
  Link = entry.link
  Published = str(entry.published_parsed.tm_mday) + '/' + \
              str(entry.published_parsed.tm_mon) + '/' + \
              str(entry.published_parsed.tm_year)
  return [ID, Title, Summary, Link, Published]

In [146]:
test_entry = feed.entries[0]
article_entry(test_entry, 1)

[1,
 'King of Texas: Doncic drops 41 in Houston',
 'Every Mavericks game brings a new milestone for Luka Doncic. Sunday he became the second-youngest player in NBA history to top 30 points in 4 straight games.',
 'https://sports.yahoo.com/luka-doncic-mavericks-rockets-durant-harden-westbrook-235024032.html?src=rss',
 '24/11/2019']

In [147]:
data = [] #dataframe for saving the entries
n = len(final_data)+1 #ID value based on the highest ID value in database
for i in range(len(feed.entries)):
  entry = feed.entries[i]
  
  #check that link isn't in the database yet
  if entry.link not in final_data['Link'].values:
    processed = article_entry(entry = entry, ID=n)
    data.append(processed)
    n += 1 #increase the ID value
  else: old_news += 1 #count already scraped entries

In [148]:
if len(data) > 0:
  #transform data to pandas DataFrame
  news_extracted = pd.DataFrame(data, columns=['ID', 'Title', 'Summary', 'Link', 'Published'])

  #add new news to the database
  final_data = pd.concat([final_data, news_extracted], axis = 0)

  #write database to a csv file
  #meta_data.to_csv(PATH + "database.csv")

In [149]:
final_data.head(10)

Unnamed: 0,ID,Title,Summary,Link,Published
0,1,King of Texas: Doncic drops 41 in Houston,Every Mavericks game brings a new milestone fo...,https://sports.yahoo.com/luka-doncic-mavericks...,24/11/2019
1,2,Kelly Oubre dunks on Mason Plumlee with flair ...,"The last time the Suns played the Nuggets, Kel...",https://sports.yahoo.com/kelly-oubre-dunks-mas...,25/11/2019
2,3,Spencer Dinwiddie thriving as starter with Kyr...,Dinwiddie is averaging 25 points and 6.2 assis...,https://sports.yahoo.com/spencer-dinwiddie-thr...,25/11/2019
3,4,Luka Doncic takes playful shot at Dirk Nowitzk...,Doncic joked no Nowitzki helps the defense.,https://sports.yahoo.com/luka-doncic-takes-pla...,25/11/2019
4,5,Three Things to Know: Luka Doncic is putting u...,"Doncic is averaging 30.6 points, 10.1 rebounds...",https://sports.yahoo.com/three-things-know-luk...,25/11/2019
5,6,Luka Doncic shines ... again,"While Luka Doncic went off in Houston, Spencer...",https://sports.yahoo.com/luka-doncic-shines-ag...,25/11/2019
6,7,"Nets face Cavaliers, hope to keep rolling with...",As the Brooklyn Nets wait for Kyrie Irving to ...,https://sports.yahoo.com/nets-face-cavaliers-h...,25/11/2019
7,8,"Losers of 3 of 4, Celtics face rematch with Kings",Few times is a doubtful designation for a game...,https://sports.yahoo.com/losers-3-4-celtics-fa...,25/11/2019
8,9,Must-See: Devin Booker prints poster on Paul M...,,https://sports.yahoo.com/dunk-devin-booker-sec...,25/11/2019
9,10,Harrell ties career high with 34 as Clippers r...,Doc Rivers has seen the Los Angeles Clippers r...,https://sports.yahoo.com/harrell-ties-career-h...,25/11/2019


In [150]:
final_data.tail(5)

Unnamed: 0,ID,Title,Summary,Link,Published
44,45,Watch Luka Doncic drop 41 on Rockets in 137-12...,James Harden scored 32 points.,https://sports.yahoo.com/watch-luka-doncic-dro...,25/11/2019
45,46,"GAME RECAP: Mavericks 137, Rockets 123","Luka Doncic drops 41 points, adds 10 assists a...",https://sports.yahoo.com/game-recap-mavericks-...,24/11/2019
46,47,Jordan McRae explains how he plays with the lo...,McRae caused the pin in his surgically repaire...,https://sports.yahoo.com/jordan-mcrae-explains...,24/11/2019
47,48,Jarrett Allen goes up to slam the alley-oop in...,,https://sports.yahoo.com/jarrett-allen-goes-sl...,24/11/2019
48,49,Highlights: Luka Doncic | Mavericks vs. Rockets,,https://sports.yahoo.com/highlights-luka-donci...,24/11/2019


In [151]:
print("{} % of article entries have already scraped." .format((old_news/feed_len)*100))

0.0 % of article entries have already scraped.


# B. Indexer

In [152]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 
import string

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/qab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/qab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/qab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [153]:
entry = final_data.loc[0,:].copy()

In [154]:
entry

ID                                                           1
Title                King of Texas: Doncic drops 41 in Houston
Summary      Every Mavericks game brings a new milestone fo...
Link         https://sports.yahoo.com/luka-doncic-mavericks...
Published                                           24/11/2019
Name: 0, dtype: object

## Text Preprocessing

In [155]:
def article_string(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [156]:
article_string(entry.Summary)

'every mavericks game brings a new milestone for luka doncic sunday he became the secondyoungest player in nba history to top 30 points in 4 straight games'

In [157]:
def find_wordnet_point(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [158]:
print ("Camelo: {}\n Swimming: {}\n Angry: {}".format(find_wordnet_point("camelo"), find_wordnet_point("swimming"), find_wordnet_point("angry")))

Camelo: n
 Swimming: v
 Angry: a


In [159]:
stop_words = stopwords.words("english")

In [160]:
lem = WordNetLemmatizer()

def stop_lemmatize(doc):
    tokens = nltk.word_tokenize(doc)
    tmp = ""
    for w in tokens:
        if w not in stop_words:
            tmp += lem.lemmatize(w, find_wordnet_point(w)) + " "
    return tmp

In [161]:
stop_lemmatize(doc = entry.Title)

'King Texas : Doncic drop 41 Houston '

In [162]:
def article_string(text):
  text = text.lower() #to lowercase
  text = text.translate(str.maketrans('', '', string.punctuation)) #strip punctuation
  text = stop_lemmatize(text)
  return text

In [163]:
%time article_string(entry.Title)

CPU times: user 2.57 ms, sys: 1.31 ms, total: 3.87 ms
Wall time: 3.15 ms


'king texas doncic drop 41 houston '

In [164]:
final_processed = final_data.copy()

In [165]:
def change_df(df):
  df['Title'] = df['Title'].apply(article_string)
  df['Summary'] = df['Summary'].apply(article_string)

In [166]:
%time change_df(final_processed)

CPU times: user 195 ms, sys: 31.7 ms, total: 227 ms
Wall time: 239 ms


In [167]:
final_processed.head(10)

Unnamed: 0,ID,Title,Summary,Link,Published
0,1,king texas doncic drop 41 houston,every maverick game brings new milestone luka ...,https://sports.yahoo.com/luka-doncic-mavericks...,24/11/2019
1,2,kelly oubre dunk mason plumlee flair video,last time sun played nugget kelly oubre get fi...,https://sports.yahoo.com/kelly-oubre-dunks-mas...,25/11/2019
2,3,spencer dinwiddie thrive starter kyrie irving,dinwiddie average 25 point 62 assist game irving,https://sports.yahoo.com/spencer-dinwiddie-thr...,25/11/2019
3,4,luka doncic take playful shot dirk nowitzki ta...,doncic joked nowitzki help defense,https://sports.yahoo.com/luka-doncic-takes-pla...,25/11/2019
4,5,three thing know luka doncic put age 20 stats ...,doncic average 306 point 101 rebound 98 assist...,https://sports.yahoo.com/three-things-know-luk...,25/11/2019
5,6,luka doncic shine,luka doncic go houston spencer dinwiddie montr...,https://sports.yahoo.com/luka-doncic-shines-ag...,25/11/2019
6,7,net face cavalier hope keep roll without irving,brooklyn net wait kyrie irving recover right s...,https://sports.yahoo.com/nets-face-cavaliers-h...,25/11/2019
7,8,loser 3 4 celtic face rematch king,time doubtful designation game deem miracle th...,https://sports.yahoo.com/losers-3-4-celtics-fa...,25/11/2019
8,9,mustsee devin booker print poster paul millsap,,https://sports.yahoo.com/dunk-devin-booker-sec...,25/11/2019
9,10,harrell tie career high 34 clipper rout pelican,doc river see los angeles clipper roster under...,https://sports.yahoo.com/harrell-ties-career-h...,25/11/2019


In [168]:
final_processed['text'] = final_processed['Title'] + " " + final_processed['Summary']
drop_cols = ['Title', 'Summary', 'Published', 'Link']
final_processed = final_processed.drop(drop_cols, axis=1)

In [169]:
final_processed.head(10)

Unnamed: 0,ID,text
0,1,king texas doncic drop 41 houston every maver...
1,2,kelly oubre dunk mason plumlee flair video la...
2,3,spencer dinwiddie thrive starter kyrie irving ...
3,4,luka doncic take playful shot dirk nowitzki ta...
4,5,three thing know luka doncic put age 20 stats ...
5,6,luka doncic shine luka doncic go houston spen...
6,7,net face cavalier hope keep roll without irvin...
7,8,loser 3 4 celtic face rematch king time doubt...
8,9,mustsee devin booker print poster paul millsap
9,10,harrell tie career high 34 clipper rout pelica...


In [170]:
def change_df(df):
  df = df
  df['Title'] = df['Title'].apply(article_string)
  df['Summary'] = df['Summary'].apply(article_string)
  df['text'] = df['Title'] + " " + df['Summary']
  drop_cols = ['Title', 'Summary', 'Published', 'Link']
  df = df.drop(drop_cols, axis=1)
  return df

## Build Index

In [171]:
entry = final_processed.loc[0,:].copy()
print(entry)
index_test = {}

ID                                                      1
text    king texas doncic drop 41 houston  every maver...
Name: 0, dtype: object


In [172]:
words = entry.text.split()
ID = entry.ID

In [173]:
word = words[0]
sample = {word: [ID]}
print(sample)

{'king': [1]}


In [174]:
for word in words:
  if word in index_test.keys():
    if ID not in index_test[word]:
      index_test[word].append(ID)
  else:
    index_test[word] = [ID]

In [175]:
print(index_test)

{'king': [1], 'texas': [1], 'doncic': [1], 'drop': [1], '41': [1], 'houston': [1], 'every': [1], 'maverick': [1], 'game': [1], 'brings': [1], 'new': [1], 'milestone': [1], 'luka': [1], 'sunday': [1], 'become': [1], 'secondyoungest': [1], 'player': [1], 'nba': [1], 'history': [1], 'top': [1], '30': [1], 'point': [1], '4': [1], 'straight': [1]}


In [176]:
def index_it(entry, index):
  words = entry.text.split()
  ID = entry.ID
  for word in words:
    if word in index.keys():
      if ID not in index[word]:
        index[word].append(ID)
    else:
      index[word] = [ID]
  return index

In [177]:
ind = index_it(entry=entry, index= {})
print(ind)

{'king': [1], 'texas': [1], 'doncic': [1], 'drop': [1], '41': [1], 'houston': [1], 'every': [1], 'maverick': [1], 'game': [1], 'brings': [1], 'new': [1], 'milestone': [1], 'luka': [1], 'sunday': [1], 'become': [1], 'secondyoungest': [1], 'player': [1], 'nba': [1], 'history': [1], 'top': [1], '30': [1], 'point': [1], '4': [1], 'straight': [1]}


In [178]:
def index_all(df, index):
  for i in range(len(df)):
    entry = df.loc[i,:]
    index = index_it(entry = entry, index = index)
  return index

In [179]:
index = index_all(final_processed, index = {})
len(index)

483

In [180]:
def build_index(df, index):
    to_add = change_df(df)
    index = index_all(df = to_add, index = index)
    return index

In [181]:
idx = build_index(df = final_data, index = {})

In [182]:
len(idx)

483

In [183]:
import json

with open('index.json', 'w') as fp:
    json.dump(idx, fp, sort_keys=True, indent=4)

In [184]:
with open('index.json', 'r') as f:
    data = json.load(f)

## Ranked Retrieval

In [185]:
import gensim
import numpy as np
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

--2019-11-25 15:59:41--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.38.150
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.38.150|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz.2’


2019-11-25 16:31:10 (415 KB/s) - Read error at byte 674229465/1647046227 (Connection reset by peer). Retrying.

--2019-11-25 16:31:11--  (try: 2)  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.38.150|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 1647046227 (1.5G), 972816762 (928M) remaining [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz.2’


2019-11-25 17:01:08 (372 KB/s) - Read error at byte 1243845784/1647046227 (Connection reset by pee

In [212]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [213]:
print(words)

['king', 'texas', 'doncic', 'drop', '41', 'houston', 'every', 'maverick', 'game', 'brings', 'new', 'milestone', 'luka', 'doncic', 'sunday', 'become', 'secondyoungest', 'player', 'nba', 'history', 'top', '30', 'point', '4', 'straight', 'game']


In [214]:
def average_vectors(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    if len(doc) == 0:
      return np.zeros(300)
    else:
      return np.mean(word2vec_model[doc], axis=0)

In [215]:
%time test_vec = average_vectors(word2vec, words)

CPU times: user 1.23 ms, sys: 12.9 ms, total: 14.1 ms
Wall time: 39.2 ms


In [216]:
def prepare_ranking(df):
  corpus = df[['ID', 'text']].copy()
  doc_vecs = {}
  for i in range(len(corpus)):
    row = corpus.loc[i,:]
    text = row.text.split()
    doc_vecs[row.ID]=average_vectors(word2vec, text)
  doc_vecs = pd.DataFrame.from_dict(data=doc_vecs, orient="index")
  doc_vecs['ID'] = doc_vecs.index
  return doc_vecs

In [217]:
doc_vecs = prepare_ranking(df=final_data)

# Query Processing

In [218]:
test = "Doncic 41 Texas"

In [219]:
print("User query: {}." .format(test))
test_norm = article_string(test)
print("Normalized query: {}." .format(test_norm))

User query: Doncic 41 Texas.
Normalized query: doncic 41 texas .


In [220]:
test_split = test_norm.split()

In [221]:
def article_query(query):
    norm = article_string(query)
    return norm.split()

### Retrieve from the index

In [222]:
retrieved = []
for word in test_split:
  if word in index.keys():
    retrieved.append(index[word])

In [223]:
def lists_intersection(lists):
  intersect = list(set.intersection(*map(set, lists)))
  intersect.sort()
  return intersect

In [224]:
result = lists_intersection(retrieved)
print(result)

[1]


In [225]:
def search_vengine(query, index=idx):
  query_split = article_query(query)
  retrieved = []
  for word in query_split:
    if word in index.keys():
      retrieved.append(index[word])
  if len(retrieved)>0:
    result = lists_intersection(retrieved)
  else:
      result = [0]
  return result

In [226]:
result_IDs = search_vengine("Mavericks", index)
print(result_IDs)

[1, 41, 45, 46, 49]


In [227]:
#in real setting we'll read the database from file here
#meta = pd.read_csv("database.csv")

#this is our database
final = final_data.drop(['text'], axis=1).copy()
final.head(5)

Unnamed: 0,ID,Title,Summary,Link,Published
0,1,king texas doncic drop 41 houston,every maverick game brings new milestone luka ...,https://sports.yahoo.com/luka-doncic-mavericks...,24/11/2019
1,2,kelly oubre dunk mason plumlee flair video,last time sun played nugget kelly oubre get fi...,https://sports.yahoo.com/kelly-oubre-dunks-mas...,25/11/2019
2,3,spencer dinwiddie thrive starter kyrie irving,dinwiddie average 25 point 62 assist game irving,https://sports.yahoo.com/spencer-dinwiddie-thr...,25/11/2019
3,4,luka doncic take playful shot dirk nowitzki ta...,doncic joked nowitzki help defense,https://sports.yahoo.com/luka-doncic-takes-pla...,25/11/2019
4,5,three thing know luka doncic put age 20 stats ...,doncic average 306 point 101 rebound 98 assist...,https://sports.yahoo.com/three-things-know-luk...,25/11/2019


In [228]:
def connect_id_df(retrieved_id, df):
    return df[df.ID.isin(retrieved_id)].reset_index(drop=True)

In [229]:
result_final = connect_id_df(result_IDs, final)
result_final.head(5)

Unnamed: 0,ID,Title,Summary,Link,Published
0,1,king texas doncic drop 41 houston,every maverick game brings new milestone luka ...,https://sports.yahoo.com/luka-doncic-mavericks...,24/11/2019
1,41,doncic hardaway jr lead mavs 137123 rout rocket,dallas coach rick carlisle ’ want team get ove...,https://sports.yahoo.com/doncic-hardaway-jr-le...,25/11/2019
2,45,watch luka doncic drop 41 rocket 137123 maveri...,james harden score 32 point,https://sports.yahoo.com/watch-luka-doncic-dro...,25/11/2019
3,46,game recap maverick 137 rocket 123,luka doncic drop 41 point add 10 assist 6 rebo...,https://sports.yahoo.com/game-recap-mavericks-...,24/11/2019
4,49,highlight luka doncic maverick v rocket,,https://sports.yahoo.com/highlights-luka-donci...,24/11/2019


### Ranked Retrieval

In [230]:
query_vec = average_vectors(word2vec, test_split)

In [231]:
result_vecs = connect_id_df(result_IDs, doc_vecs)

In [232]:
def cos_similarity(a, b):
  dot = np.dot(a, b)
  norma = np.linalg.norm(a)
  normb = np.linalg.norm(b)
  cos = dot / (norma * normb)
  return(cos)

In [233]:
cos_sim = []
for i in range(len(result_vecs)):
  doc_vec = result_vecs.loc[i,:].drop(['ID'])
  cos_sim.append(cos_similarity(doc_vec, query_vec))
result_final['rank'] = cos_sim

In [234]:
result_final.sort_values('rank', axis = 0)

Unnamed: 0,ID,Title,Summary,Link,Published,rank
4,49,highlight luka doncic maverick v rocket,,https://sports.yahoo.com/highlights-luka-donci...,24/11/2019,0.140618
2,45,watch luka doncic drop 41 rocket 137123 maveri...,james harden score 32 point,https://sports.yahoo.com/watch-luka-doncic-dro...,25/11/2019,0.261327
1,41,doncic hardaway jr lead mavs 137123 rout rocket,dallas coach rick carlisle ’ want team get ove...,https://sports.yahoo.com/doncic-hardaway-jr-le...,25/11/2019,0.358058
3,46,game recap maverick 137 rocket 123,luka doncic drop 41 point add 10 assist 6 rebo...,https://sports.yahoo.com/game-recap-mavericks-...,24/11/2019,0.375821
0,1,king texas doncic drop 41 houston,every maverick game brings new milestone luka ...,https://sports.yahoo.com/luka-doncic-mavericks...,24/11/2019,0.517244


In [248]:
def rank_results(query, results):
  query_norm = article_query(query)
  query_vec = average_vectors(word2vec, query_norm)
  result_vecs = connect_id_df(results.ID, doc_vecs)
  cos_sim = []
  for i in range(len(result_vecs)):
    doc_vec = result_vecs.loc[i,:].drop(['ID'])
    cos_sim.append(cos_similarity(doc_vec, query_vec))
  results['rank'] = cos_sim
  results = results.sort_values('rank', axis=0)
  return results

In [249]:
final_result = rank_results("Mavericks", result_final)

### Date filtering

In [250]:
test = "25/11/2019"

#get news published on "test"
results_single = result_final[result_final.Published==test].reset_index(drop=True)
results_single.head()

Unnamed: 0,ID,Title,Summary,Link,Published,rank
0,41,doncic hardaway jr lead mavs 137123 rout rocket,dallas coach rick carlisle ’ want team get ove...,https://sports.yahoo.com/doncic-hardaway-jr-le...,25/11/2019,0.275744
1,45,watch luka doncic drop 41 rocket 137123 maveri...,james harden score 32 point,https://sports.yahoo.com/watch-luka-doncic-dro...,25/11/2019,0.32524


In [253]:
#get today's date
from datetime import date, timedelta

def find_today():
  today = date.today()
  today = today.strftime("%d/%m/%Y")
  return [today]

results_today = result_final[result_final.Published.isin(get_today())].reset_index(drop=True)
results_today.head()

Unnamed: 0,ID,Title,Summary,Link,Published,rank
0,41,doncic hardaway jr lead mavs 137123 rout rocket,dallas coach rick carlisle ’ want team get ove...,https://sports.yahoo.com/doncic-hardaway-jr-le...,25/11/2019,0.275744
1,45,watch luka doncic drop 41 rocket 137123 maveri...,james harden score 32 point,https://sports.yahoo.com/watch-luka-doncic-dro...,25/11/2019,0.32524


In [254]:
find_today()

['25/11/2019']

In [255]:
def daterange(start, end):
    for n in range(int ((end - start).days)+1):
        yield start + timedelta(n)

def format_date(dt):
  dt = dt.split("/")
  dt = date(int(dt[2]), int(dt[1]), int(dt[0]))
  return(dt)

def date_interval(interval):
  interval = interval.split("-")
  start = format_date(interval[0])
  end = format_date(interval[1])
  interval = []
  for dt in daterange(start, end):
      interval.append(dt.strftime("%d/%m/%Y"))
  return interval

In [256]:
date_interval("24/11/2019 - 25/11/2019")

['24/11/2019', '25/11/2019']

In [258]:
s = "25/11/2019"
len(s)

10

In [259]:
def filter_date(dat, df):
  if dat == "today":
    dat = get_today()
  if len(dat) == 10:
    dat = [dat]
  if len(dat) > 11:
    dat = date_interval(dat)
  if len(dat) is 0:
    return(df)

  result = df[df.Published.isin(dat)].reset_index(drop=True)
  return result

### Printing results to user

In [270]:
def print_results(result_df):
  for i in range(len(result_df)):
    res = result_df.loc[i, :]
    print(res.Title)
    print(res.Summary)
    if i == len(result_df):
        print(res.Link)
    else:
        print("{}\n" .format(res.Link))

In [271]:
print_results(final_result)

king texas doncic drop 41 houston 
every maverick game brings new milestone luka doncic sunday become secondyoungest player nba history top 30 point 4 straight game 
https://sports.yahoo.com/luka-doncic-mavericks-rockets-durant-harden-westbrook-235024032.html?src=rss

doncic hardaway jr lead mavs 137123 rout rocket 
dallas coach rick carlisle ’ want team get overexcited convincing victory houston rocket sunday luka doncic score 41 point tim hardaway jr add seasonhigh 31 maverick never trail 137123 romp maverick score 45 point first 
https://sports.yahoo.com/doncic-hardaway-jr-lead-mavs-137-123-rout-230403018--nba.html?src=rss

watch luka doncic drop 41 rocket 137123 maverick win 
james harden score 32 point 
https://sports.yahoo.com/watch-luka-doncic-drop-41-000433140.html?src=rss

game recap maverick 137 rocket 123 
luka doncic drop 41 point add 10 assist 6 rebound dallas beat houston 
https://sports.yahoo.com/game-recap-mavericks-137-rockets-235458823.html?src=rss

highlight luka don

### Put it all together

In [272]:
def search(query, dat=None):
  result = search_vengine(query)
  result = connect_id_df(result, final)
  result = rank_results(query, result)

  if dat is not None:
    result = filter_date(dat, result)

  print_results(result)

In [273]:
query = input("What are you looking for:")
dat = input("Date:")
search(query, dat)

What are you looking for:james harden
Date:today
watch luka doncic drop 41 rocket 137123 maverick win 
james harden score 32 point 
https://sports.yahoo.com/watch-luka-doncic-drop-41-000433140.html?src=rss



# Automated crawling for every hour

In [275]:
URLS = ["http://feeds.bbci.co.uk/sport/football/rss.xml?edition=uk",
       "https://www.espn.com/espn/rss/nba/news",
       "https://www.skysports.com/rss/12040",
       "https://www.cbssports.com/rss/headlines/nba/",
       "https://api.foxsports.com/v1/rss?partnerKey=zBaFxRyGKCfxBagJG9b8pqLyndmvo7UU"]

In [278]:
for url in URLS:
  print ("Crawling {}" .format(url))
  #perc, added = crawl(URL=url, PATH=OUTPUT_DIR)
  print("XXX % of entries were already scraped.\n")
  #|update_index_vecs(df=added, index_path=INDEX_PATH, vec_path=VECTOR_PATH)

Crawling http://feeds.bbci.co.uk/sport/football/rss.xml?edition=uk
XXX % of entries were already scraped.

Crawling https://www.espn.com/espn/rss/nba/news
XXX % of entries were already scraped.

Crawling https://www.skysports.com/rss/12040
XXX % of entries were already scraped.

Crawling https://www.cbssports.com/rss/headlines/nba/
XXX % of entries were already scraped.

Crawling https://api.foxsports.com/v1/rss?partnerKey=zBaFxRyGKCfxBagJG9b8pqLyndmvo7UU
XXX % of entries were already scraped.



In [282]:
import time

#while True:
#  for url in URLS:
#   print ("Crawling {}" .format(url))
    #perc, added = crawl(URL=url, PATH=OUTPUT_DIR)
#   print("XXX % of entries were already scraped.\n")
    #update_index_vecs(df=added, index_path=INDEX_PATH, vec_path=VECTOR_PATH)

  #pause for an hour
#  time.sleep(3600)

In [283]:
#At this point I don't know what I'm doing ahaha