# Cricket Content Recommendation System

### 1) Scraping ESPN Cricinfo - DO NOT RUN

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [5]:
def get_articles(url, base_url = None, start_page = 1, num_pages = 1):
    articles = []
    for page_num in range(start_page, start_page + num_pages + 1):
        if page_num % 10 == 0:
            print(page_num)
        new_url = f"{url}?page={page_num}"
        response = requests.get(new_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        page_articles = scrape_page(soup, base_url)
        articles.extend(page_articles)
    
    return articles

def scrape_page(soup, base_url = None):
    articles = []
    titles = []
    urls = []
    summaries = []
    dates = []
    
    # getting title and link
    for article in soup.find_all('h2', class_ = 'ds-text-title-s ds-font-bold ds-text-typo'):
        #print(article)
        title = article.text.strip()
        titles.append(title)

    
        link_tag = article.find_parent('a')
        link = link_tag['href'] if link_tag else ""
        if base_url:
            link = base_url + link
        urls.append(link)
        
    
    # getting summaries
    for article in soup.find_all('p', class_ = 'ds-text-compact-s ds-text-typo-mid2 ds-mt-1'):
        #print(article)
        summary = article.text.strip()
        summaries.append(summary)
        
    #getting publication date
    for article in soup.find_all('div', class_ = 'ds-leading-[0] ds-text-typo-mid3 ds-mt-1'):
        #print((article.text.split())[0])
        date_text = article.text.strip()
        date = date_text.split('•')[0]
        dates.append(date)
        
    for title, url, summary, date in zip(titles, urls, summaries, dates):
        articles.append({
          'title': title,
          'link': url,
          'summary': summary,
          'date': date
        })
    
    return articles    

In [78]:
url = 'https://www.espncricinfo.com/ci/content/story/news.html'
base_url = 'https://www.espncricinfo.com'
#url = 'https://www.espncricinfo.com/cricket-news'
articles = get_articles(url, base_url, 1, 1000)

In [79]:
print(len(articles))
display(articles[100:120])

19992


[{'title': "Green confident of 'plugging holes' with versatile role for Australia",
  'link': 'https://www.espncricinfo.com/story/t20-world-cup-2024-australia-s-cameron-green-confident-of-plugging-holes-with-versatile-role-1436699',
  'summary': 'The allrounder may not make the Australia XI initially but could potentially squeeze Marcus Stoinis out of the side',
  'date': '04-Jun-2024'},
 {'title': 'New Zealand battle rain and jetlag ahead of crunch World Cup opener',
  'link': 'https://www.espncricinfo.com/story/t20-world-cup-2024-new-zealand-battle-rain-and-jetlag-ahead-of-crunch-world-cup-opener-1436685',
  'summary': "Coach Gary Stead hopes the team's late start to the tournament will enable them to learn about conditions",
  'date': '04-Jun-2024'},
 {'title': 'Gurbaz, Ibrahim and Farooqi hand Uganda a thrashing',
  'link': 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/afghanistan-vs-uganda-5th-match-group-c-1415705/match-report',
  'summary': 'Gurbaz an

In [80]:
articles_df = pd.DataFrame(articles)
articles_df.to_csv('cricinfo_articles.csv', index = False)

In [8]:
url = 'https://www.espncricinfo.com/ci/content/story/news.html'
base_url = 'https://www.espncricinfo.com'
more_articles = get_articles(url, base_url, 1001, 300)

1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300


In [9]:
more_articles_df = pd.DataFrame(more_articles)
more_articles_df.to_csv('cricinfo_articles_2.csv', index = False)

In [10]:
print(len(more_articles))

6000


In [11]:
articles_3 = get_articles(url, base_url, 1301, 200)

1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500


In [12]:
articles_3_df = pd.DataFrame(articles_3)
articles_3_df.to_csv('cricinfo_articles_3.csv', index = False)

In [3]:
df_1 = pd.read_csv('cricinfo_articles.csv')
df_2 = pd.read_csv('cricinfo_articles_2.csv')
df_3 = pd.read_csv('cricinfo_articles_3.csv')

In [7]:
display(df_1.head())
print(len(df_1))

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-2024
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-2024
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",09-Jun-2024
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,09-Jun-2024
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,09-Jun-2024


19992


In [6]:
display(df_2.head())
print(len(df_2))

Unnamed: 0,title,link,summary,date
0,Alex Carey for 'injured' Rishabh Pant not a st...,https://www.espncricinfo.com/story/alex-carey-...,Absence of a back-up Indian wicketkeeper has a...,12-Oct-2020
1,Queensland hunt innings victory after Michael ...,https://www.espncricinfo.com/series/sheffield-...,Tasmania face a tough task to save the game on...,12-Oct-2020
2,Michael Neser and Ashton Agar achieve rare dou...,https://www.espncricinfo.com/story/sheffield-s...,Neser followed a five-wicket haul with a centu...,12-Oct-2020
3,Alistair Nicholson steps down as Australian Cr...,https://www.espncricinfo.com/story/alistair-ni...,Nicholson's six-year stint included the Newlan...,11-Oct-2020
4,Rishabh Pant sidelined from IPL 2020 'for a we...,https://www.espncricinfo.com/story/ipl-2020-de...,"'He is going to be resting for a week,' says t...",11-Oct-2020


6000


In [8]:
df = pd.concat([df_1, df_2, df_3])
display(df.head())
print(len(df))

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-2024
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-2024
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",09-Jun-2024
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,09-Jun-2024
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,09-Jun-2024


29988


In [9]:
df.to_csv('articles_full.csv')

### 2) Data Inspection - Don't need to run

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('articles_full.csv')

In [3]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


In [4]:
len(df)

29988

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29988 entries, 0 to 29987
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    29988 non-null  object
 1   link     29988 non-null  object
 2   summary  29988 non-null  object
 3   date     29982 non-null  object
dtypes: object(4)
memory usage: 937.2+ KB
None


In [6]:
display(df.describe())

Unnamed: 0,title,link,summary,date
count,29988,29988,29988,29982
unique,29943,29923,29876,2047
top,Mohammad Nabi's 12-ball 43* maintains Kent's p...,https://www.espncricinfo.com/series/ipl-2020-2...,"Updates, colour and analysis with ESPNcricinfo...",30-Jul-19
freq,2,2,8,36


In [7]:
print(df.isnull().sum())

title      0
link       0
summary    0
date       6
dtype: int64


In [32]:
df = df.drop_duplicates()
df = df.dropna().reset_index()

In [33]:
df = df.drop('index', axis = 1)

In [10]:
print(df.shape)

(29943, 4)


In [11]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


In [12]:
# converting all text to lower case
df['title'] = df['title'].str.lower()
df['summary'] = df['summary'].str.lower()

### 3) Text Preprocessing - Don't need to run

In [15]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet

In [17]:
nltk.download("stopwords", quiet = True)
nltk.download("wordnet", quiet = True)
nltk.download("punkt", quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)
lemmatizer = WordNetLemmatizer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [21]:
def clean_text(text):
    # clean the input string by converting it to lower case, removing 's and apostrophe.
    
    text = text.lower()
    # Remove 's 
    text = re.sub(r"'s(\s|$)", r'\1', text)
    # Remove '
    text = text.replace("'", "")
    # whitespace
    text = text.strip()
    
    return text

def tokenize(cleaned_text):
    # Tokenize the input string.
    
    tokens = nltk.word_tokenize(cleaned_text)
    # tokenizing non alpha-numeric characters
    new_tokens = []
    for token in tokens:
        split_token = re.split(r'[^0-9a-zA-Z]+', token)
        # remove empty strings
        split_token = [token for token in split_token if token]
        new_tokens.extend(split_token)
    
    return new_tokens

def lemmatize(tokens, stopwords = {}):
    # Lemmatize each token in an input list of tokens
    
    lemmatized_tokens = []
    for token in tokens:
        # getting the first POS tag and checking if it is a Noun
        tag = nltk.pos_tag([token])[0][1]
        if tag.startswith('J'):
            tag = wordnet.ADJ
        elif tag.startswith('V'):
            tag = wordnet.VERB
        elif tag.startswith('R'):
            tag = wordnet.ADV
        else:
            tag = wordnet.NOUN
        
        # lemmatizing
        lemmatized = lemmatizer.lemmatize(token, pos = tag) # first letter
        if (lemmatized not in stopwords) and (len(lemmatized) >= 2):
            lemmatized_tokens.append(lemmatized)
            
    return lemmatized_tokens

def preprocess_text(text, stopwords = {}):
    # preprocessing text
    cleaned_text = clean_text(text)
    tokens = tokenize(cleaned_text)
    return lemmatize(tokens, stopwords)
    

In [22]:
df['title'] = df['title'].apply(preprocess_text)
df['summary'] = df['summary'].apply(preprocess_text)

In [26]:
def join_tokens(tokens):
    return ' '.join(tokens)

In [27]:
df['title'] = df['title'].apply(join_tokens)
df['summary'] = df['summary'].apply(join_tokens)

In [28]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,lamichhane land in west indie will play nepal ...,https://www.espncricinfo.com/story/t20-world-c...,the icc be yet to release statement about whet...,10-Jun-24
1,kirsten bemoans pakistan poor decision make,https://www.espncricinfo.com/story/icc-men-s-t...,player of the match bumrah lauds india bowler ...,10-Jun-24
2,rohit on bumrah he genius with the ball,https://www.espncricinfo.com/story/t20-world-c...,whoever have the ball in hand want to make con...,9-Jun-24
3,babar we be not up to the mark,https://www.espncricinfo.com/story/t20-world-c...,pakistan captain say lack of rhythm upfront wi...,9-Jun-24
4,bumrah spearhead india defence of 119 pakistan...,https://www.espncricinfo.com/series/icc-men-s-...,the trend of low score continued in new york i...,9-Jun-24


In [29]:
df.to_csv('tokenized_articles.csv')

### 4) Feature Extraction

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation

from gensim.models import Word2Vec

import pandas as pd
import numpy as np
import scipy.sparse as sp



In [4]:
tokenized_df = pd.read_csv('tokenized_articles.csv')

In [5]:
display(df.head())
display(tokenized_df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


Unnamed: 0.1,Unnamed: 0,title,link,summary,date
0,0,lamichhane land in west indie will play nepal ...,https://www.espncricinfo.com/story/t20-world-c...,the icc be yet to release statement about whet...,10-Jun-24
1,1,kirsten bemoans pakistan poor decision make,https://www.espncricinfo.com/story/icc-men-s-t...,player of the match bumrah lauds india bowler ...,10-Jun-24
2,2,rohit on bumrah he genius with the ball,https://www.espncricinfo.com/story/t20-world-c...,whoever have the ball in hand want to make con...,9-Jun-24
3,3,babar we be not up to the mark,https://www.espncricinfo.com/story/t20-world-c...,pakistan captain say lack of rhythm upfront wi...,9-Jun-24
4,4,bumrah spearhead india defence of 119 pakistan...,https://www.espncricinfo.com/series/icc-men-s-...,the trend of low score continued in new york i...,9-Jun-24


#### Count Vectorizer

In [6]:
def dummy(x):
    return x

def count_vectorizer(text):
    # Computes the term-frequency matrices for the text
    vectorizer = CountVectorizer(analyzer = str.split, tokenizer = str.split, preprocessor = dummy)
    tf_text = vectorizer.fit_transform(text)
    features = vectorizer.get_feature_names_out().tolist()
    
    return tf_text, features

In [8]:
tf_title, features_title_tf = count_vectorizer(tokenized_df['title'])
tf_summary, features_summary_tf = count_vectorizer(tokenized_df['summary'])

In [9]:
display(tokenized_df.shape)

(29943, 5)

In [10]:
print(tf_title.shape, len(features_title_tf), tf_summary.shape, len(features_summary_tf))

(29943, 11922) 11922 (29943, 13162) 13162


In [11]:
display(features_title_tf[-5:])

['zouks', 'zoysa', 'zubayr', 'zulfiqar', 'zyl']

#### TF-IDF Vectorizer

In [7]:
def tfidf_vectorizer(text):
    # Computes the TF-IDF matrices for the text
    vectorizer = TfidfVectorizer(analyzer = str.split, tokenizer = str.split, preprocessor = dummy)
    tf_text = vectorizer.fit_transform(text)
    features = vectorizer.get_feature_names_out().tolist()
    
    return tf_text, features

In [8]:
tfidf_title, features_title_tfidf = tfidf_vectorizer(tokenized_df['title'])
tfidf_summary, features_summary_tfidf = tfidf_vectorizer(tokenized_df['summary'])

In [9]:
print(tfidf_title.shape, len(features_title_tfidf), tfidf_summary.shape, len(features_summary_tfidf))

(29943, 11922) 11922 (29943, 13162) 13162


In [10]:
# converting tf-idf matrix from sparse to dense
tfidf_title_dense = tfidf_title.toarray()
tfidf_summary_dense = tfidf_summary.toarray()

In [11]:
# making dfs for titles and summaries
df_title_tfidf = pd.DataFrame(tfidf_title_dense, columns=features_title_tfidf)
df_summary_tfidf = pd.DataFrame(tfidf_summary_dense, columns=features_summary_tfidf)

In [12]:
vectorized_df = pd.concat([df_title_tfidf, df_summary_tfidf], axis=1)

In [13]:
vectorized_df['original_title'] = df['title']
vectorized_df['original_summary'] = df['summary']
vectorized_df['link'] = df['link']

In [14]:
display(vectorized_df.head())

Unnamed: 0,00,000,07,09,10,100,1000,1000th,100k,100th,...,zorzi,zouks,zoysa,zubayr,zulfiqar,zulu,zunaid,zyl,original_title,original_summary
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Lamichhane lands in West Indies, will play Nep...",The ICC is yet to release a statement about wh...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kirsten bemoans Pakistan's 'poor decision-making',Player of the Match Bumrah lauds India's bowle...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Rohit on Bumrah: 'He's a genius with the ball',"""Whoever had the ball in hand wanted to make a..."
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Babar: 'We were not up to the mark',Pakistan captain says lack of rhythm upfront w...
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bumrah spearheads India's defence of 119; Paki...,The trend of low scores continued in New York ...


In [15]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


In [None]:
np.save('vectorized_df.npy', vectorized_df)

In [None]:
print(vectorized_df.shape)

#### Topic Modeling

In [17]:
def top_words_by_topic(text, n_topics = 10, n_top_words = 20, seed = 42):
    # performing topic modeling and returning the most frequent words in each topic
    
    tf_text, features = count_vectorizer(text)
    
    # performing LDA
    lda = LatentDirichletAllocation(n_components = n_topics, random_state = seed, learning_method = 'online')
    lda.fit(tf_text)
    topics = lda.components_
    
    # identifying n top words
    top_words = []
    for i in range(n_topics):
        # sorting elements of the topic vector in ascending order
        indices = topics[i].argsort()
        # extracting n_top_words 
        top_indices = indices[-n_top_words:]
        words = [features[j] for j in top_indices]
        top_words.append(words)
    
    return top_words

In [None]:
top_words_title = top_words_by_topic(tokenized_df['title'])
top_words_summary = top_words_by_topic(tokenized_df['summary'])

In [20]:
print(top_words_title)

[['warner', 'want', 'root', 'put', 'cricket', 'and', 'top', 'covid', 'joe', 'season', 'ipl', 'at', 'in', 'after', 'of', 'the', 'day', 'final', 'on', 'to'], ['claim', 'plan', 'babar', 'out', 'international', 'be', 'change', 'short', 'lose', 'can', 'join', '2021', 'could', 'essex', '2020', 'from', 'call', 'injury', 'by', 'cricket'], ['title', 'up', 'all', 'take', 'round', 'chase', 'seal', 'fifty', 'star', 'ton', 'over', 'and', 'into', 'australia', 'the', 'victory', 'in', 'lead', 'win', 'to'], ['mohammad', 'knight', 'bcci', 'two', 'team', 'of', 'name', 'mumbai', 'head', 'royal', 'and', 'england', 'player', 'but', 'no', 'woman', 'in', 'coach', 'to', 'for'], ['big', 'up', 'first', 'australia', 'on', 'series', 'it', 'england', 'squad', 'set', 'indie', 'west', 'india', 'africa', 'and', 'south', 'in', 'test', 'for', 'to'], ['pant', 'ahmed', 'zimbabwe', 'ranking', 'replaces', 'among', 'work', 'injured', 'taylor', 'tim', 'shakib', 'and', 'gloucestershire', 'hit', 'sussex', 'in', 'icc', 'wicket',

In [21]:
print(top_words_summary)

[['all', 'not', 'they', 'at', 'team', 'that', 'their', 'say', 'will', 'of', 'but', 'for', 'have', 'on', 'it', 'and', 'in', 'to', 'be', 'the'], ['team', 'series', 'player', 'india', 'his', 'cup', 'from', 'new', 'with', 'world', 'on', 'will', 'and', 'for', 'have', 'to', 'in', 'of', 'be', 'the'], ['host', 'out', 'an', 'at', 'by', 'inning', 'of', 'the', 'with', 'lead', 'first', 'back', 'after', 'wicket', 'day', 'run', 'to', 'on', 'in', 'for'], ['blast', 'and', 'fixture', 'summer', 'surrey', 'ecb', 'suffer', 'white', 'available', 'championship', 'ash', 'county', 'through', 'add', 'after', 'month', 'expect', 'for', 'england', 'tour'], ['before', 'lose', 'win', 'end', 'only', 'after', 'left', 'over', 'score', 'their', 'chase', 'the', 'and', 'with', 'up', 'wicket', 'ball', 'in', 'for', 'to'], ['home', 'ball', 'need', 'from', 'win', 'south', 'side', 'over', 'but', 'with', 'indie', 'west', 'go', 'and', 'of', 'batsman', 'three', 'to', 'in', 'the'], ['mohammad', 'fight', 'offspinner', 'fell', 'win

### 5) Recommendation - Content-Based Filtering

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# computing cosine similarity
numerical_df = vectorized_df.drop(['link', 'original_title', 'original_summary'], axis = 1)
numerical_df.head()

Unnamed: 0,00,000,07,09,10,100,1000,1000th,100k,100th,...,zone,zoom,zorzi,zouks,zoysa,zubayr,zulfiqar,zulu,zunaid,zyl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# caching for the future
np.save('numerical_df.npy', numerical_df)

In [21]:
cosine_sim = cosine_similarity(numerical_df)

In [22]:
# caching for the future
np.save('cosine_sim.npy', cosine_sim)

In [16]:
import numpy as np
cosine_sim = np.load('cosine_sim.npy')

In [25]:
numerical_df = np.load('numerical_df.npy')

#### Generating Recommendations

In [44]:
def get_recommendations(article_id, cosine_sim = cosine_sim, top_n = 5):
    scores = list(enumerate(cosine_sim[article_id]))
    scores = sorted(scores, key = lambda x: x[1], reverse = True)
    scores = scores[1 : top_n + 1]
    article_indices = [i[0] for i in scores]
    return article_indices

In [42]:
rec = get_recommendations(4212)
print(rec)

[2541, 6135, 3964, 26239, 17781]


In [43]:
rec_titles = df.loc[rec, 'title']
display(rec_titles)

2541     Harris and Bancroft audition for Australia's o...
6135     Renshaw's chance to push Ashes claims, eyes al...
3964     An opportunity for Cameron Bancroft to improve...
26239      Bancroft, Labuschagne get chances for Ashes bid
17781        David Warner returns to New South Wales squad
Name: title, dtype: object

In [41]:
display(df.loc[4212, 'title'])

"Bancroft gets chance to audition for Warner's position"

In [46]:
# caching the top 20 articles for each article
n = cosine_sim.shape[0]
top_20_recs = np.zeros((n, 20), dtype = int)

for i in range(n):
    if i % 100 == 0:
        print(i)
    top_20_recs[i] = get_recommendations(i, top_n = 20)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [47]:
print(top_20_recs[:5])

[[  178 22940  8923  6777   977   299 13111  6829 24205  6536 27497   626
  22340 19207  3649  4133 16253  4970 20822  6591]
 [27980   419  6367 18316 27327  5512 17469 19643 25006 12951  8209   678
  27422    22 25817 18116  7847 12553 13608 13560]
 [21203 19178  7234  3298 28259  2030 26575 27844 27360  8349  7993 19574
   8548 13814 26848  2019  8507  1744 24044 27195]
 [   58 29123 22922  7732  4263 27737  7544  2777  2750  3510 17582  2837
    850 29238  7320  9780 16504  3278  7993 28762]
 [ 5494 24622  6507 27612  4866  3045  3063 14639 24227 13809  2790  6825
  28464  1088  6522   169 21209  3044  7066 11778]]


In [48]:
np.save('top_20_recommendations.npy', top_20_recs)