In [1]:
import os 
# Must be set before torch is imported further down in this notebook.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime
# library" abort that can occur when several packages each bundle their own
# copy of the runtime -- confirm it is still required in this environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
from textblob import TextBlob

import torch
from transformers import BertTokenizer, BertModel



In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_bbc_sport(url='https://www.bbc.com/sport', timeout=10):
    """Scrape article headlines and links from the BBC Sport front page.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the BBC Sport front page.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises,
        so a stalled connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per scraped article with the columns ``sport``, ``headline``
        and ``weblink``. Rows whose sport segment is ``'news'`` are dropped and
        ``'africa'`` is relabelled ``'african-sports'``. The three columns are
        always present, even when nothing could be scraped.

    Notes
    -----
    Articles are located via ``<div type="article">`` elements; this selector
    depends on the page markup and will silently match nothing if it changes.
    """
    # Local import keeps this block self-contained within the notebook cell.
    from urllib.parse import urljoin

    columns = ['sport', 'headline', 'weblink']

    # Send a GET request to the URL
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all div elements with type "article"
    article_elements = soup.find_all('div', {'type': 'article'})

    data = []
    for article_element in article_elements:
        # Only consider anchors that actually carry an href attribute.
        link_element = article_element.find('a', href=True)
        if link_element is None:
            continue

        link = link_element['href']
        # urljoin leaves absolute URLs intact and resolves relative ones,
        # whereas plain concatenation would corrupt an absolute href.
        full_link = urljoin('http://bbc.com', link)

        headline_element = article_element.find('p')
        headline = headline_element.get_text(strip=True) if headline_element else ""

        # The sport name is the second-to-last path segment of the href,
        # e.g. '/sport/football/123' -> 'football'.
        segments = link.split('/')
        if len(segments) < 2:
            # No path structure to derive a sport from; skip the entry.
            continue
        sport_name = segments[-2]

        data.append({'sport': sport_name, 'headline': headline, 'weblink': full_link})

    # Passing explicit columns guarantees the schema even when `data` is empty,
    # so the filtering below (and callers) never hit a KeyError.
    df = pd.DataFrame(data, columns=columns)

    # Filter out news articles; copy so the assignment below works on an
    # independent frame rather than a view of the unfiltered one.
    df = df[df['sport'] != 'news'].copy()

    # Categorize Africa-related articles as "African Sports"
    df['sport'] = df['sport'].replace('africa', 'african-sports')

    return df

In [5]:
def retrieve_articles_by_weblinks(df, sport, timeout=10):
    """Download the article text for every row of ``df`` in the given sport.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``sport``, ``headline`` and
        ``weblink`` (the shape produced by ``scrape_bbc_sport``).
    sport : str
        Value of the ``sport`` column to select.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``article``, one row per selected row of ``df``
        and in the same order. ``article`` is the newline-joined text of the
        matching ``<p>`` elements, or an empty string when none match.
    """
    sport_articles = df[df['sport'] == sport]

    titles = []
    articles = []

    # Walk the selected rows directly so each article keeps its own headline;
    # looking the headline up again by weblink costs a full scan per article
    # and returns the wrong headline when the same link appears twice.
    for headline, weblink in zip(sport_articles['headline'], sport_articles['weblink']):
        # Send a GET request to the weblink
        response = requests.get(weblink, timeout=timeout)

        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all p elements whose data-reactid attribute contains "$paragraph"
        paragraphs = soup.find_all('p', attrs={"data-reactid": lambda value: value and "$paragraph" in value})

        # One paragraph per line
        article_text = "\n".join(paragraph.get_text(strip=True) for paragraph in paragraphs)

        titles.append(headline)
        articles.append(article_text)

    return pd.DataFrame({'title': titles, 'article': articles})

In [6]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The value is whatever ``TextBlob(text).sentiment.polarity`` yields;
    callers in this notebook treat values above zero as positive and values
    below zero as negative.
    """
    return TextBlob(text).sentiment.polarity

In [7]:
def newsAnalysis(dataDiX):
    """Score and label every text stored in ``dataDiX``.

    Parameters
    ----------
    dataDiX : dict
        Mapping whose values are the texts to analyse. The keys are ignored.

    Returns
    -------
    tuple[list, list]
        ``(labels, scores)``: the label from ``find_Sentiment`` and the raw
        polarity from ``get_sentiment`` for each value, in the mapping's
        iteration order.
    """
    polarity_scores = [get_sentiment(story) for story in dataDiX.values()]
    polarity_labels = [find_Sentiment(score) for score in polarity_scores]
    return polarity_labels, polarity_scores

In [8]:
def find_Sentiment(polarity):
    """Map a polarity score to a label based on its sign.

    Returns ``"Positive"`` when ``polarity`` is greater than zero,
    ``"Negative"`` when it is less than zero, and ``"Neutral"`` otherwise.
    """
    return "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"

In [9]:
def calculate_overall_sentiment(article_text):
    """Average the sentiment of an article's non-blank lines.

    ``article_text`` is split on ``'\\n'``; every piece that is not purely
    whitespace is scored with ``get_sentiment`` and the arithmetic mean of
    those scores is returned. If no piece qualifies, ``0`` is returned.
    """
    scores = []
    for chunk in article_text.split('\n'):
        if chunk.strip():
            scores.append(get_sentiment(chunk))

    if not scores:
        # Nothing to average: no non-blank paragraph was found.
        return 0

    return sum(scores) / len(scores)

In [10]:
def initialize_bert_model():
    """Load the pre-trained ``bert-base-uncased`` encoder and its tokenizer.

    Returns
    -------
    tuple
        ``(model, tokenizer)``, with the model switched to evaluation mode.
    """
    checkpoint = "bert-base-uncased"

    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # eval() returns the module itself, so the call can be chained.
    bert_encoder = BertModel.from_pretrained(checkpoint).eval()

    return bert_encoder, bert_tokenizer

In [11]:
def sentence_summarization_bert(text, model, tokenizer, num_paragraphs=2):
    """Pick the most representative sentences of ``text`` (extractive summary).

    The text is split into sentences, each sentence is embedded with the
    given encoder (mean of its token embeddings, ignoring padding), and the
    sentences whose embedding is most similar to the average embedding of the
    whole text are returned.

    Parameters
    ----------
    text : str
        Article text; paragraphs may be separated by newlines.
    model : callable
        Encoder called as ``model(**inputs)`` whose result exposes
        ``last_hidden_state`` of shape ``(n_sentences, seq_len, hidden)``,
        e.g. a ``BertModel``.
    tokenizer : callable
        Tokenizer called on a list of sentences that returns a mapping with
        at least ``input_ids`` and ``attention_mask`` tensors.
    num_paragraphs : int, optional
        Maximum number of sentences to return.

    Returns
    -------
    list[str]
        The selected sentences, in the order they appear in ``text``. Empty
        when ``text`` contains no sentence or ``num_paragraphs`` is not
        positive.
    """
    # Local import keeps this block self-contained within the notebook cell.
    import re

    # Split after sentence-ending punctuation or at line breaks. This is a
    # heuristic: abbreviations such as "Mr." will also cause a split.
    pieces = re.split(r'(?<=[.!?])\s+|\n+', text or "")
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    if not sentences or num_paragraphs <= 0:
        return []

    # Nothing to rank: the whole text already fits in the summary.
    if len(sentences) <= num_paragraphs:
        return sentences

    # Encode every sentence as its own sequence (one batch row per sentence);
    # truncation to 512 tokens therefore applies per sentence, not per article.
    inputs = tokenizer(sentences, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state

    # Mean-pool token embeddings, weighting padding positions by zero so that
    # shorter sentences are not diluted by pad tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    sentence_embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    # A sentence is "important" when it is close to the centroid of the text.
    document_embedding = sentence_embeddings.mean(dim=0, keepdim=True)
    importance_scores = torch.nn.functional.cosine_similarity(
        sentence_embeddings, document_embedding, dim=1
    )

    top_indices = importance_scores.topk(num_paragraphs).indices.tolist()

    # Restore reading order so the summary stays coherent.
    return [sentences[idx] for idx in sorted(top_indices)]


In [None]:
def chat_bot():
    """Run the interactive sports-news chatbot on stdin/stdout.

    Flow: scrape the current headlines, list the sport categories, then loop:
    the user picks a category, every article in it is downloaded and shown
    with its sentiment label and score, and the user may pick one headline to
    get its summary sentences followed by the article text. Typing ``exit`` at
    the category prompt ends the loop; typing ``back`` at the headline prompt
    returns to the category prompt.
    """
    print("Welcome to the Sports News Chatbot!")
    print("Fetching the latest sports news...\n")

    # Get the latest sports news
    sports_news_df = scrape_bbc_sport()

    # Display available sports categories
    sports_categories = sports_news_df['sport'].unique()
    print("Available sports categories:")
    for idx, category in enumerate(sports_categories, start=1):
        print(f"{idx}. {category.capitalize()}")

    # The BERT model is expensive to load, so it is created on first use and
    # then reused for every later summary instead of being reloaded each time.
    bert_model = None
    bert_tokenizer = None

    # User interaction loop
    while True:
        user_input = input("\nEnter the number of the sport category you are interested in (type 'exit' to quit): ").strip()

        if user_input.lower() == 'exit':
            print("Goodbye! Have a great day.")
            break

        # Guard only the parse: a ValueError raised later by scraping or the
        # model must surface as-is rather than be reported as bad user input.
        try:
            selection_index = int(user_input)
        except ValueError:
            print("Invalid input. Please enter a number or 'exit'.")
            continue

        if not 1 <= selection_index <= len(sports_categories):
            print("Invalid index. Please select a valid sports category.")
            continue

        selected_category = sports_categories[selection_index - 1]
        selected_articles = retrieve_articles_by_weblinks(sports_news_df, selected_category)

        # Key the stories by row position rather than by title: two articles
        # sharing a title would otherwise collapse into one dictionary entry
        # and shift every following sentiment onto the wrong headline.
        stories_by_position = dict(enumerate(selected_articles['article']))

        # Perform sentiment analysis
        sentiment_labels, sentiment_values = newsAnalysis(stories_by_position)

        print("\nSelected Articles with Sentiment Analysis:")
        for idx, (title, sentiment_label, sentiment_value) in enumerate(zip(selected_articles['title'], sentiment_labels, sentiment_values), start=1):
            print(f"\n{idx}.'{title}' : {sentiment_label} | Sentiment Value: {sentiment_value:.2f}")

        headline_choice = input("\nEnter the number of the headline to read the article (type 'back' to go back): ").strip()

        if headline_choice.lower() == 'back':
            continue

        try:
            headline_index = int(headline_choice) - 1
        except ValueError:
            print("Invalid index. Please enter a valid headline number.")
            continue

        # Explicit bounds check: iloc accepts negative positions, so an input
        # of 0 or below would otherwise silently select from the end.
        if not 0 <= headline_index < len(selected_articles):
            print("Invalid index. Please enter a valid headline number.")
            continue

        selected_headline = selected_articles.iloc[headline_index]

        # Integrate BERT for sentence summarization
        if bert_model is None:
            bert_model, bert_tokenizer = initialize_bert_model()
        summary_sentences = sentence_summarization_bert(selected_headline['article'], bert_model, bert_tokenizer, num_paragraphs=2)

        print("\nSelected Article Summary Sentences:")
        for sentence in summary_sentences:
            print(sentence)

        # The prompt promises the article itself, so print it after the header.
        print(f"\nReading article for '{selected_headline['title']}':\n")
        print(selected_headline['article'])

# Start the interactive chatbot only when this code runs as the main module
# (which is the case when the cell is executed in a notebook kernel), not
# when the code is imported from elsewhere.
if __name__ == "__main__":
    chat_bot()

Welcome to the Sports News Chatbot!
Fetching the latest sports news...

Available sports categories:
1. Football
2. Sport
3. Tennis
4. African-sports
5. Snooker
6. Rugby-union
7. Cricket
8. Basketball
9. American-football
10. Horse-racing
11. Rugby-league
12. Boxing
13. Ice-hockey

Enter the number of the sport category you are interested in (type 'exit' to quit): 1

Selected Articles with Sentiment Analysis:

1.'Wales must settle for play-offs after Turkey draw' : Positive | Sentiment Value: 0.09

2.'Croatia secure final automatic Euro 2024 spot' : Positive | Sentiment Value: 0.07

3.'Newcastle cleared to sign players from PIF-owned Saudi clubs' : Positive | Sentiment Value: 0.02

4.'Germany lose in Austria as Nagelsmann woe continues' : Negative | Sentiment Value: -0.01

5.'France's perfect qualifying ended by Greece draw' : Positive | Sentiment Value: 0.08

6.'Thousands of fans watch Australia beat Palestine' : Positive | Sentiment Value: 0.13

7.'Mead returns to England squad after