In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def retrieve_docs_and_clean():

  r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')
  soup = BeautifulSoup(r.content, 'html.parser')

  #THE FOLLOWING CODE NEED TO BE MODIFIED TO SUITE FOR THE ABOVE URL
  link = []
  for i in soup.find('div', {'class':'lst-pg_hd'}).find_all('a',{'class':'lst-pg_ttl'}):
      i['href'] ='https://sports.ndtv.com/'+ i['href'] + '?page=all'
      link.append(i['href'])
  

  # Retrieve Paragraphs
  documents = []
  for i in link:
      r = requests.get(i)
      soup = BeautifulSoup(r.content, 'html.parser')

      sen = []
      for i in soup.find('div', {'class':'sp-cn pg-str-com js-ad-section'}).find_all('p'):
          sen.append(i.text)
      documents.append(' '.join(sen))

  # Clean Paragraphs
  documents_clean = []
  for d in documents:
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      documents_clean.append(document_test)

  return documents_clean

In [3]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096285,0.0,0.0,0.0,0.0
about,0.020478,0.0,0.0,0.0,0.0,0.0,0.0,0.058954,0.056906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
above,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
absent,0.0,0.0,0.0,0.002417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
accepting,0.0,0.043506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.shape

(1824, 18)

In [5]:
def get_similar_articles(q, df):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  print()
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  for i in range(10):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
  for k, v in sim_sorted:
    if v != 0.0:
      print("Similarity Values:", v)
      print(docs[k])
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 

Similarity Values: 0.003643523535500569
poland captain robert lewandowski refused to confirm if he had played his last ever game at the world cup after his side were knocked out of the tournament in qatar in a last defeat by france on sunday barcelona striker lewandowski scored a late consolation from the penalty spot for a poland side who were outclassed by the fearsome french attack in doha he will be almost by the time the next world cup comes around in north america in but he suggested that issues beyond his physical condition were more likely to see him end his international career window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js physically i m not afraid of this but we have so many different things outside of football whether

In [6]:
from gensim.summarization.bm25 import BM25

def simple_tok(sent:str):
    return sent.split()

def bm25_similar_articles(query):
  print("query:", query)
  print("The following are articles with the highest BM25 scores: ")
  print()
  tok_corpus = [simple_tok(s) for s in docs]
  query = simple_tok(query)
  bm25 = BM25(tok_corpus)
  scores = bm25.get_scores(query, average_idf = 100)
  best_docs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
  for i, b in enumerate(best_docs):
      print(f"rank {i+1}: {docs[b]}")
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'


bm25_similar_articles(q1)
print('-'*100)
bm25_similar_articles(q2)
print('-'*100)
bm25_similar_articles(q3)
print('-'*100)

query: barcelona
The following are articles with the highest BM25 scores: 

rank 1: france s bid to retain the world cup continues against poland on sunday while england s pursuit of a first major trophy in years will be tested by african champions senegal didier deschamps french side are aiming to become the first team to win successive world cups since brazil in and after winning group d in qatar window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js led by electric paris saint germain forward kylian mbappe the french are heavy favourites to progress past poland at doha s al thumama stadium but in a world cup packed with shocks like the group stage exits of germany and belgium subduing barcelona striker robert lewandowski is the key to france s hopes of avoiding another upset poland s all time leading scorer will be hoping