# Cricket Content Recommendation System

### 1) Scraping ESPN Cricinfo - DO NOT RUN

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [5]:
def get_articles(url, base_url = None, start_page = 1, num_pages = 1):
    articles = []
    for page_num in range(start_page, start_page + num_pages + 1):
        if page_num % 10 == 0:
            print(page_num)
        new_url = f"{url}?page={page_num}"
        response = requests.get(new_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        page_articles = scrape_page(soup, base_url)
        articles.extend(page_articles)
    
    return articles

def scrape_page(soup, base_url = None):
    articles = []
    titles = []
    urls = []
    summaries = []
    dates = []
    
    # getting title and link
    for article in soup.find_all('h2', class_ = 'ds-text-title-s ds-font-bold ds-text-typo'):
        #print(article)
        title = article.text.strip()
        titles.append(title)

    
        link_tag = article.find_parent('a')
        link = link_tag['href'] if link_tag else ""
        if base_url:
            link = base_url + link
        urls.append(link)
        
    
    # getting summaries
    for article in soup.find_all('p', class_ = 'ds-text-compact-s ds-text-typo-mid2 ds-mt-1'):
        #print(article)
        summary = article.text.strip()
        summaries.append(summary)
        
    #getting publication date
    for article in soup.find_all('div', class_ = 'ds-leading-[0] ds-text-typo-mid3 ds-mt-1'):
        #print((article.text.split())[0])
        date_text = article.text.strip()
        date = date_text.split('•')[0]
        dates.append(date)
        
    for title, url, summary, date in zip(titles, urls, summaries, dates):
        articles.append({
          'title': title,
          'link': url,
          'summary': summary,
          'date': date
        })
    
    return articles    

In [78]:
url = 'https://www.espncricinfo.com/ci/content/story/news.html'
base_url = 'https://www.espncricinfo.com'
#url = 'https://www.espncricinfo.com/cricket-news'
articles = get_articles(url, base_url, 1, 1000)

In [79]:
print(len(articles))
display(articles[100:120])

19992


[{'title': "Green confident of 'plugging holes' with versatile role for Australia",
  'link': 'https://www.espncricinfo.com/story/t20-world-cup-2024-australia-s-cameron-green-confident-of-plugging-holes-with-versatile-role-1436699',
  'summary': 'The allrounder may not make the Australia XI initially but could potentially squeeze Marcus Stoinis out of the side',
  'date': '04-Jun-2024'},
 {'title': 'New Zealand battle rain and jetlag ahead of crunch World Cup opener',
  'link': 'https://www.espncricinfo.com/story/t20-world-cup-2024-new-zealand-battle-rain-and-jetlag-ahead-of-crunch-world-cup-opener-1436685',
  'summary': "Coach Gary Stead hopes the team's late start to the tournament will enable them to learn about conditions",
  'date': '04-Jun-2024'},
 {'title': 'Gurbaz, Ibrahim and Farooqi hand Uganda a thrashing',
  'link': 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/afghanistan-vs-uganda-5th-match-group-c-1415705/match-report',
  'summary': 'Gurbaz an

In [80]:
articles_df = pd.DataFrame(articles)
articles_df.to_csv('cricinfo_articles.csv', index = False)

In [8]:
url = 'https://www.espncricinfo.com/ci/content/story/news.html'
base_url = 'https://www.espncricinfo.com'
more_articles = get_articles(url, base_url, 1001, 300)

1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300


In [9]:
more_articles_df = pd.DataFrame(more_articles)
more_articles_df.to_csv('cricinfo_articles_2.csv', index = False)

In [10]:
print(len(more_articles))

6000


In [11]:
articles_3 = get_articles(url, base_url, 1301, 200)

1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500


In [12]:
articles_3_df = pd.DataFrame(articles_3)
articles_3_df.to_csv('cricinfo_articles_3.csv', index = False)

In [3]:
df_1 = pd.read_csv('cricinfo_articles.csv')
df_2 = pd.read_csv('cricinfo_articles_2.csv')
df_3 = pd.read_csv('cricinfo_articles_3.csv')

In [7]:
display(df_1.head())
print(len(df_1))

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-2024
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-2024
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",09-Jun-2024
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,09-Jun-2024
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,09-Jun-2024


19992


In [6]:
display(df_2.head())
print(len(df_2))

Unnamed: 0,title,link,summary,date
0,Alex Carey for 'injured' Rishabh Pant not a st...,https://www.espncricinfo.com/story/alex-carey-...,Absence of a back-up Indian wicketkeeper has a...,12-Oct-2020
1,Queensland hunt innings victory after Michael ...,https://www.espncricinfo.com/series/sheffield-...,Tasmania face a tough task to save the game on...,12-Oct-2020
2,Michael Neser and Ashton Agar achieve rare dou...,https://www.espncricinfo.com/story/sheffield-s...,Neser followed a five-wicket haul with a centu...,12-Oct-2020
3,Alistair Nicholson steps down as Australian Cr...,https://www.espncricinfo.com/story/alistair-ni...,Nicholson's six-year stint included the Newlan...,11-Oct-2020
4,Rishabh Pant sidelined from IPL 2020 'for a we...,https://www.espncricinfo.com/story/ipl-2020-de...,"'He is going to be resting for a week,' says t...",11-Oct-2020


6000


In [8]:
df = pd.concat([df_1, df_2, df_3])
display(df.head())
print(len(df))

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-2024
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-2024
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",09-Jun-2024
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,09-Jun-2024
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,09-Jun-2024


29988


In [9]:
df.to_csv('articles_full.csv')

### 2) Data Inspection

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('articles_full.csv')

In [3]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


In [4]:
len(df)

29988

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29988 entries, 0 to 29987
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    29988 non-null  object
 1   link     29988 non-null  object
 2   summary  29988 non-null  object
 3   date     29982 non-null  object
dtypes: object(4)
memory usage: 937.2+ KB
None


In [6]:
display(df.describe())

Unnamed: 0,title,link,summary,date
count,29988,29988,29988,29982
unique,29943,29923,29876,2047
top,Mohammad Nabi's 12-ball 43* maintains Kent's p...,https://www.espncricinfo.com/series/ipl-2020-2...,"Updates, colour and analysis with ESPNcricinfo...",30-Jul-19
freq,2,2,8,36


In [7]:
print(df.isnull().sum())

title      0
link       0
summary    0
date       6
dtype: int64


In [8]:
df = df.drop_duplicates()
df = df.dropna().reset_index()

In [9]:
df = df.drop('index', axis = 1)

In [10]:
print(df.shape)

(29943, 4)


In [11]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,"Lamichhane lands in West Indies, will play Nep...",https://www.espncricinfo.com/story/t20-world-c...,The ICC is yet to release a statement about wh...,10-Jun-24
1,Kirsten bemoans Pakistan's 'poor decision-making',https://www.espncricinfo.com/story/icc-men-s-t...,Player of the Match Bumrah lauds India's bowle...,10-Jun-24
2,Rohit on Bumrah: 'He's a genius with the ball',https://www.espncricinfo.com/story/t20-world-c...,"""Whoever had the ball in hand wanted to make a...",9-Jun-24
3,Babar: 'We were not up to the mark',https://www.espncricinfo.com/story/t20-world-c...,Pakistan captain says lack of rhythm upfront w...,9-Jun-24
4,Bumrah spearheads India's defence of 119; Paki...,https://www.espncricinfo.com/series/icc-men-s-...,The trend of low scores continued in New York ...,9-Jun-24


In [12]:
# converting all text to lower case
df['title'] = df['title'].str.lower()
df['summary'] = df['summary'].str.lower()

### 3) Text Preprocessing

In [15]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet

In [17]:
nltk.download("stopwords", quiet = True)
nltk.download("wordnet", quiet = True)
nltk.download("punkt", quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)
lemmatizer = WordNetLemmatizer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [21]:
def clean_text(text):
    # clean the input string by converting it to lower case, removing 's and apostrophe.
    
    text = text.lower()
    # Remove 's 
    text = re.sub(r"'s(\s|$)", r'\1', text)
    # Remove '
    text = text.replace("'", "")
    # whitespace
    text = text.strip()
    
    return text

def tokenize(cleaned_text):
    # Tokenize the input string.
    
    tokens = nltk.word_tokenize(cleaned_text)
    # tokenizing non alpha-numeric characters
    new_tokens = []
    for token in tokens:
        split_token = re.split(r'[^0-9a-zA-Z]+', token)
        # remove empty strings
        split_token = [token for token in split_token if token]
        new_tokens.extend(split_token)
    
    return new_tokens

def lemmatize(tokens, stopwords = {}):
    # Lemmatize each token in an input list of tokens
    
    lemmatized_tokens = []
    for token in tokens:
        # getting the first POS tag and checking if it is a Noun
        tag = nltk.pos_tag([token])[0][1]
        if tag.startswith('J'):
            tag = wordnet.ADJ
        elif tag.startswith('V'):
            tag = wordnet.VERB
        elif tag.startswith('R'):
            tag = wordnet.ADV
        else:
            tag = wordnet.NOUN
        
        # lemmatizing
        lemmatized = lemmatizer.lemmatize(token, pos = tag) # first letter
        if (lemmatized not in stopwords) and (len(lemmatized) >= 2):
            lemmatized_tokens.append(lemmatized)
            
    return lemmatized_tokens

def preprocess_text(text, stopwords = {}):
    # preprocessing text
    cleaned_text = clean_text(text)
    tokens = tokenize(cleaned_text)
    return lemmatize(tokens, stopwords)
    

In [22]:
df['title'] = df['title'].apply(preprocess_text)
df['summary'] = df['summary'].apply(preprocess_text)

In [26]:
def join_tokens(tokens):
    return ' '.join(tokens)

In [27]:
df['title'] = df['title'].apply(join_tokens)
df['summary'] = df['summary'].apply(join_tokens)

In [28]:
display(df.head())

Unnamed: 0,title,link,summary,date
0,lamichhane land in west indie will play nepal ...,https://www.espncricinfo.com/story/t20-world-c...,the icc be yet to release statement about whet...,10-Jun-24
1,kirsten bemoans pakistan poor decision make,https://www.espncricinfo.com/story/icc-men-s-t...,player of the match bumrah lauds india bowler ...,10-Jun-24
2,rohit on bumrah he genius with the ball,https://www.espncricinfo.com/story/t20-world-c...,whoever have the ball in hand want to make con...,9-Jun-24
3,babar we be not up to the mark,https://www.espncricinfo.com/story/t20-world-c...,pakistan captain say lack of rhythm upfront wi...,9-Jun-24
4,bumrah spearhead india defence of 119 pakistan...,https://www.espncricinfo.com/series/icc-men-s-...,the trend of low score continued in new york i...,9-Jun-24


In [29]:
df.to_csv('tokenized_articles.csv')

### 4) Feature Extraction

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation

from gensim.models import Word2Vec

import pandas as pd
import numpy as np
import scipy.sparse as sp



In [31]:
tokenized_df = pd.read_csv('tokenized_articles.csv')

#### Count Vectorizer

In [32]:
def dummy(x):
    return x

def count_vectorizer(text):
    # Computes the term-frequency matrices for the text
    vectorizer = CountVectorizer(analyzer = str.split, tokenizer = str.split, preprocessor = dummy)
    tf_text = vectorizer.fit_transform(text)
    features = vectorizer.get_feature_names_out().tolist()
    
    return tf_text, features

In [51]:
tf_title, features_title_tf = count_vectorizer(tokenized_df['title'])
tf_summary, features_summary_tf = count_vectorizer(tokenized_df['summary'])

In [48]:
display(tokenized_df.shape)

(29943, 5)

In [52]:
print(tf_title.shape, len(features_title_tf), tf_summary.shape, len(features_summary_tf))

(29943, 11922) 11922 (29943, 13162) 13162


In [50]:
display(features_title[-5:])

['zouks', 'zoysa', 'zubayr', 'zulfiqar', 'zyl']

#### TF-IDF Vectorizer

In [55]:
def tfidf_vectorizer(text):
    # Computes the TF-IDF matrices for the text
    vectorizer = TfidfVectorizer(analyzer = str.split, tokenizer = str.split, preprocessor = dummy)
    tf_text = vectorizer.fit_transform(text)
    features = vectorizer.get_feature_names_out().tolist()
    
    return tf_text, features

In [56]:
tfidf_title, features_title_tfidf = tfidf_vectorizer(tokenized_df['title'])
tfidf_summary, features_summary_tfidf = tfidf_vectorizer(tokenized_df['summary'])

In [57]:
print(tfidf_title.shape, len(features_title_tfidf), tfidf_summary.shape, len(features_summary_tfidf))

(29943, 11922) 11922 (29943, 13162) 13162


In [66]:
# converting tf-idf matrix from sparse to dense
tfidf_title_dense = tfidf_title.toarray()
tfidf_summary_dense = tfidf_summary.toarray()

In [70]:
# making dfs for titles and summaries
df_title_tfidf = pd.DataFrame(tfidf_title_dense, columns=features_title_tfidf)
df_summary_tfidf = pd.DataFrame(tfidf_summary_dense, columns=features_summary_tfidf)

In [71]:
vectorized_df = pd.concat([df_title_tfidf, df_summary_tfidf], axis=1)

In [72]:
vectorized_df['original_title'] = tokenized_df['title']
vectorized_df['original_summary'] = tokenized_df['summary']

In [73]:
display(vectorized_df.head())

Unnamed: 0,00,000,07,09,10,100,1000,1000th,100k,100th,...,zorzi,zouks,zoysa,zubayr,zulfiqar,zulu,zunaid,zyl,original_title,original_summary
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lamichhane land in west indie will play nepal ...,the icc be yet to release statement about whet...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kirsten bemoans pakistan poor decision make,player of the match bumrah lauds india bowler ...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rohit on bumrah he genius with the ball,whoever have the ball in hand want to make con...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,babar we be not up to the mark,pakistan captain say lack of rhythm upfront wi...
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bumrah spearhead india defence of 119 pakistan...,the trend of low score continued in new york i...


In [74]:
print(vectorized_df.shape)

(29943, 25086)


In [None]:
vectorized_df.to_csv('vectorized articles.csv')

#### Topic Modeling

In [63]:
def top_words_by_topic(text, n_topics = 10, n_top_words = 20, seed = 42):
    # performing topic modeling and returning the most frequent words in each topic
    
    tf_text, features = count_vectorizer(text)
    
    # performing LDA
    lda = LatentDirichletAllocation(n_components = n_topics, random_state = seed, learning_method = 'online')
    lda.fit(tf_text)
    topics = lda.components_
    
    # identifying n top words
    top_words = []
    for i in range(n_topics):
        # sorting elements of the topic vector in ascending order
        indices = topics[i].argsort()
        # extracting n_top_words 
        top_indices = indices[-n_top_words:]
        words = [features[j] for j in top_indices]
        top_words.append(words)
    
    return top_words

In [59]:
top_words_title = top_words_by_topic(tokenized_df['title'])
top_words_summary = top_words_by_topic(tokenized_df['summary'])

In [61]:
print(top_words_title)

[['warner', 'want', 'root', 'put', 'cricket', 'and', 'top', 'covid', 'joe', 'season', 'ipl', 'at', 'in', 'after', 'of', 'the', 'day', 'final', 'on', 'to'], ['claim', 'plan', 'babar', 'out', 'international', 'be', 'change', 'short', 'lose', 'can', 'join', '2021', 'could', 'essex', '2020', 'from', 'call', 'injury', 'by', 'cricket'], ['title', 'up', 'all', 'take', 'round', 'chase', 'seal', 'fifty', 'star', 'ton', 'over', 'and', 'into', 'australia', 'the', 'victory', 'in', 'lead', 'win', 'to'], ['mohammad', 'knight', 'bcci', 'two', 'team', 'of', 'name', 'mumbai', 'head', 'royal', 'and', 'england', 'player', 'but', 'no', 'woman', 'in', 'coach', 'to', 'for'], ['big', 'up', 'first', 'australia', 'on', 'series', 'it', 'england', 'squad', 'set', 'indie', 'west', 'india', 'africa', 'and', 'south', 'in', 'test', 'for', 'to'], ['pant', 'ahmed', 'zimbabwe', 'ranking', 'replaces', 'among', 'work', 'injured', 'taylor', 'tim', 'shakib', 'and', 'gloucestershire', 'hit', 'sussex', 'in', 'icc', 'wicket',

In [62]:
print(top_words_summary)

[['all', 'not', 'they', 'at', 'team', 'that', 'their', 'say', 'will', 'of', 'but', 'for', 'have', 'on', 'it', 'and', 'in', 'to', 'be', 'the'], ['team', 'series', 'player', 'india', 'his', 'cup', 'from', 'new', 'with', 'world', 'on', 'will', 'and', 'for', 'have', 'to', 'in', 'of', 'be', 'the'], ['host', 'out', 'an', 'at', 'by', 'inning', 'of', 'the', 'with', 'lead', 'first', 'back', 'after', 'wicket', 'day', 'run', 'to', 'on', 'in', 'for'], ['blast', 'and', 'fixture', 'summer', 'surrey', 'ecb', 'suffer', 'white', 'available', 'championship', 'ash', 'county', 'through', 'add', 'after', 'month', 'expect', 'for', 'england', 'tour'], ['before', 'lose', 'win', 'end', 'only', 'after', 'left', 'over', 'score', 'their', 'chase', 'the', 'and', 'with', 'up', 'wicket', 'ball', 'in', 'for', 'to'], ['home', 'ball', 'need', 'from', 'win', 'south', 'side', 'over', 'but', 'with', 'indie', 'west', 'go', 'and', 'of', 'batsman', 'three', 'to', 'in', 'the'], ['mohammad', 'fight', 'offspinner', 'fell', 'win