In [38]:
!pip install feedparser
!apt-get update
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy Release
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 257 kB in 2s (137 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to prov

In [39]:
import feedparser
from datetime import datetime, timedelta
import logging
import pandas as pd
from urllib.parse import quote
import time
from bs4 import BeautifulSoup

In [40]:


# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the Google News Feed Scraper class
class GoogleNewsFeedScraper:
    def __init__(self, query, start_date, end_date, language):
        self.query = query
        self.start_date = start_date  # Already a datetime object
        self.end_date = end_date      # Already a datetime object
        self.language = language

    def scrape_google_news_feed(self):
        articles = []
        current_date = self.start_date

        while current_date <= self.end_date:
            encoded_query = quote(self.query)
            rss_url = f'https://news.google.com/rss/search?q={encoded_query}&hl={self.language}&gl=US&ceid=US:{self.language[:2]}'
            feed = feedparser.parse(rss_url)

            if feed.entries:
                for entry in feed.entries:
                    try:
                        # Try to parse the published date from the entry
                        pubdate = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %Z')
                    except (AttributeError, ValueError):
                        logging.warning(f"Failed to parse date for article: {entry.title}")
                        continue

                    # Check if the article's publication date falls within the specified range
                    if self.start_date <= pubdate <= self.end_date:
                        title = entry.title
                        link = entry.link
                        description = entry.summary if hasattr(entry, 'summary') else entry.description
                        source = entry.source.title if hasattr(entry, 'source') and hasattr(entry.source, 'title') else 'Unknown'
                        articles.append({
                            'Title': title,
                            'Link': link,
                            'Description': description,
                            'Published': pubdate,
                            'Source': source
                        })
            else:
                logging.info(f"No articles found for date: {current_date.strftime('%Y-%m-%d')}")

            current_date += timedelta(days=1)

        return articles

# Function to fetch articles for multiple queries
def fetch_articles(queries, start_date, end_date, language):
    all_articles = []

    for query in queries:
        logging.info(f"Fetching news for query: {query.strip()}")
        scraper = GoogleNewsFeedScraper(query.strip(), start_date, end_date, language)
        articles = scraper.scrape_google_news_feed()
        all_articles.extend(articles)
        logging.info(f"Fetched {len(articles)} articles for query: {query.strip()}")
        logging.info("="*80)

    return all_articles

# User inputs
keywords = input("Enter keywords (comma separated for multiple): ").split(',')
start_date_str = input("Enter start date (YYYY-MM-DD): ")
end_date_str = input("Enter end date (YYYY-MM-DD): ")
language = input("Enter language (e.g., en for English, ar for Arabic): ")

# Convert the date strings to datetime objects
try:
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
except ValueError as e:
    print(f"Error: {e}. Please ensure dates are in the format YYYY-MM-DD.")
    raise

# Fetch and display the news articles
all_articles = fetch_articles(keywords, start_date, end_date, language)

# Convert the list of articles to a DataFrame and remove duplicates
df = pd.DataFrame(all_articles).drop_duplicates(subset=['Title', 'Link'])

Enter keywords (comma separated for multiple): cricket
Enter start date (YYYY-MM-DD): 2024-10-19
Enter end date (YYYY-MM-DD): 2024-10-20
Enter language (e.g., en for English, ar for Arabic): en


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Title        7 non-null      object        
 1   Link         7 non-null      object        
 2   Description  7 non-null      object        
 3   Published    7 non-null      datetime64[ns]
 4   Source       7 non-null      object        
dtypes: datetime64[ns](1), object(4)
memory usage: 336.0+ bytes


In [44]:
df.head(5)

Unnamed: 0,Title,Link,Description,Published,Source
0,PCB Selector Reveals Chat With Gautam Gambhir ...,https://news.google.com/rss/articles/CBMiuwFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-19 03:13:05,NDTV Sports
1,Test cricket: Rishabh Pant has a history of ge...,https://news.google.com/rss/articles/CBMijwJBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-19 11:02:06,Mint
2,IND-A vs PAK-A Live Cricket Streaming: When an...,https://news.google.com/rss/articles/CBMi0AFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-19 03:05:52,The Indian Express
3,Northamptonshire county cricket needs 'voice' ...,https://news.google.com/rss/articles/CBMiWkFVX...,"<a href=""https://news.google.com/rss/articles/...",2024-10-19 14:06:33,BBC.com
4,India v New Zealand: Tourists need 107 to win ...,https://news.google.com/rss/articles/CBMiZkFVX...,"<a href=""https://news.google.com/rss/articles/...",2024-10-19 12:18:00,BBC.com


In [45]:
df.to_excel('data.xlsx', index=False)  # Save the dataframe as an Excel file named 'data.xlsx' and exclude the index column

In [46]:
!pip install selenium



In [47]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def get_final_redirected_url(google_news_url):
    # Setup Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Point to the location of Chromium in Colab
    chrome_options.binary_location = '/usr/bin/chromium-browser'

    # Initialize the Chrome webdriver
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Use Selenium to open the page
        driver.get(google_news_url)
        time.sleep(2)  # Wait for redirection to complete

        # Retrieve the redirected URL
        final_url = driver.current_url
        return final_url
    finally:
        # Close the browser
        driver.quit()

# Example usage
google_news_url = "https://news.google.com/rss/articles/CBMijwJBVV95cUxNS2UzQk5PZWd0d1BnbnY5VFNyOXdVYjRhS1JocnVmMTlDNHZseFZUM2JOZ2d6VnNmWVlhaFBSN1M3SHY0ckx2c2dqTy1DblpoSUdpVjN6dUJKV2xEc0xvVHp0MlhBR0dsdjlhNHFhNDgzVldoNjQxeU41alJtQVdDQUlSZTdqZWE5Wm5WSzh2QmxaVUtYTWtmdjlRX3IwMzBHVWdmUmFvc2dLci05NzhOazRvX1RHSkY3bDNRV2lqNUhfZzhNOEtwQXhqVG5DRFNLSGZwMmR0cFkzUlFOV21BTk42REVyRlNBTWNqZzZONmdSckR4XzFMd3FXS3pWalViU21wcnd0RUtrS2VOY3Y40gGUAkFVX3lxTE5ZMVlBbURFcWFJNFdJSHkyLWhjaFRheGhpMmJ5VU16dkM1cm5EUFRDN1Y2Sl9UWFFnNXJmVUNSYWtIRXhxdjAzWmw1X21TZlJIRnVFSG5JdGZwYlAzOGwzS1A1RTJwRmFnY3ZXZWE2eHAyblRVVmg3N3FsNUxBSElfa2pJSjRnVDd0UmpncGJJOERJTV9pWmZRRE1EMm1UaUZZT3RUU0Y4aml5QzFiV29rQVVxOGNwSzdIbDF4a0NBZlZ4b3FwV1VlR0ZwRFF2THNQZHZwaUlfNkdXTGFQZEYwQ2ZiY0o1T2NaaDByaTdPR3FKYXcyZU5MbmI4aVJlazVhU2tGa0UwTUZEOW5hdmtFV1JaUA?oc=5"
final_redirected_url = get_final_redirected_url(google_news_url)
print(f"Final redirected URL: {final_redirected_url}")





Final redirected URL: https://www.livemint.com/sports/cricket-news/test-cricket-rishabh-pant-has-a-history-of-getting-out-in-his-90s-joins-dhoni-ganguly-against-new-zealand-in-bengalur-11729335235497.html


In [48]:
import requests

def scrape_all_paragraphs(article_url):
    try:

        response = requests.get(article_url, timeout=60)
        response.raise_for_status()  # Check if the request was successful
        print(response)
        logging.info(f"Scraping content from: {article_url} - Status Code: {response.status_code}")
        logging.debug(response.text)  # Log the raw HTML response
        # Parse the HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all <p> elements from the page
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        logging.info(f"Extracted {len(paragraphs)} paragraphs from {article_url}")


        # Join all paragraphs into a single string with proper spacing
        full_text = ' '.join(paragraphs).strip()  # Remove leading/trailing whitespace

        # Output the results as a single paragraph
        return full_text

    except requests.exceptions.HTTPError as http_err:
        logging.warning(f"HTTP error occurred: {http_err}")
        return {"error": "Failed to retrieve content due to HTTP error"}
    except Exception as e:
        logging.warning(f"Failed to scrape article: {article_url}. Error: {e}")
        return {"error": "Failed to retrieve content"}

# Example usage
article_url = "https://www.livemint.com/sports/cricket-news/test-cricket-rishabh-pant-has-a-history-of-getting-out-in-his-90s-joins-dhoni-ganguly-against-new-zealand-in-bengalur-11729335235497.html"
full_paragraph = scrape_all_paragraphs(article_url)
print(full_paragraph)  # Print or log the full paragraph


<Response [200]>
Rishabh Pant, the explosive Indian wicketkeeper-batsman, has showcased remarkable performances in Test cricket, with several standout innings. However, one notable aspect of his career has been his unfortunate dismissals in the 90s on multiple occasions in Testcricket, just short of converting good innings into centuries. Pant's highest score in Test cricket is an unbeaten 159* against Australia at the Sydney Cricket Ground on January 3, 2019. His innings, spanning 189 balls, included 15 boundaries and 1 six, with a strike rate of 84.12, highlighting his ability to build long innings while maintaining a brisk pace. Another spectacular knock came on July 1, 2022, when he scored 146 off 111 balls against England in Birmingham, with 19 fours and 4 sixes, a powerful display of aggressive batting. Despite these high scores, Pant has often fallen agonisingly short of the three-figure mark. He has been dismissed in the 90s multiple times, highlighting a recurring theme in his

In [49]:

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the Google News Feed Scraper class
class GoogleNewsFeedScraper:
    def __init__(self, query, start_date, end_date, language):
        self.query = query
        self.start_date = start_date  # Already a datetime object
        self.end_date = end_date      # Already a datetime object
        self.language = language

    def scrape_google_news_feed(self):
        articles = []
        current_date = self.start_date

        while current_date <= self.end_date:
            encoded_query = quote(self.query)
            rss_url = f'https://news.google.com/rss/search?q={encoded_query}&hl={self.language}&gl=US&ceid=US:{self.language[:2]}'
            feed = feedparser.parse(rss_url)

            if feed.entries:
                for entry in feed.entries:
                    try:
                        # Try to parse the published date from the entry
                        pubdate = datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %Z')
                    except (AttributeError, ValueError):
                        logging.warning(f"Failed to parse date for article: {entry.title}")
                        continue

                    # Check if the article's publication date falls within the specified range
                    if self.start_date <= pubdate <= self.end_date:
                        title = entry.title
                        link = entry.link
                        link=get_final_redirected_url(link)
                        print(link)
                        description = entry.description if hasattr(entry, 'description')  else 'No description available'

                        content = scrape_all_paragraphs(link)

                        print(content)
                        source = entry.source.title if hasattr(entry, 'source') and hasattr(entry.source, 'title') else 'Unknown'
                        articles.append({
                            'Title': title,
                            'Link': link,
                            'Description': description,
                              'content':content,
                            'Published': pubdate,
                            'Source': source

                        })
            else:
                logging.info(f"No articles found for date: {current_date.strftime('%Y-%m-%d')}")

            current_date += timedelta(days=1)

        return articles

# Function to fetch articles for multiple queries
def fetch_articles(queries, start_date, end_date, language):
    all_articles = []

    for query in queries:
        logging.info(f"Fetching news for query: {query.strip()}")
        scraper = GoogleNewsFeedScraper(query.strip(), start_date, end_date, language)
        articles = scraper.scrape_google_news_feed()
        all_articles.extend(articles)
        logging.info(f"Fetched {len(articles)} articles for query: {query.strip()}")
        logging.info("="*80)

    return all_articles
# User inputs
keywords = input("Enter keywords (comma separated for multiple): ").split(',')
start_date_str = input("Enter start date (YYYY-MM-DD): ")
end_date_str = input("Enter end date (YYYY-MM-DD): ")
language = input("Enter language (e.g., en for English, ar for Arabic): ")

# Convert the date strings to datetime objects
try:
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
except ValueError as e:
    print(f"Error: {e}. Please ensure dates are in the format YYYY-MM-DD.")
    raise

# Fetch and display the news articles
all_articles = fetch_articles(keywords, start_date, end_date, language)

# Convert the list of articles to a DataFrame and remove duplicates also a which content in empty
df = pd.DataFrame(all_articles).drop_duplicates(subset=['Title', 'Link'])
#remove if content had http error
df = df[df['content'] != '{"error": "Failed to retrieve content due to HTTP error"}']

Enter keywords (comma separated for multiple): cricket
Enter start date (YYYY-MM-DD): 2024-10-19
Enter end date (YYYY-MM-DD): 2024-10-20
Enter language (e.g., en for English, ar for Arabic): en
https://sports.ndtv.com/cricket/pcb-selector-reveals-chat-with-gautam-gambhir-over-pakistan-cricket-feels-sorry-6820414
<Response [200]>
Pakistan won a Test on home soil after a gap of more three years on Friday, beating England by 152 runs in the second match in Multan to level the three-game series 1-1. Pakistan's new selection committee made some unexpected changes to the squad for the last two Tests, dropping the likes of star batterBabar Azamand pacerShaheen Afridi. The decision received mixed response from fans of Pakistan cricket. While some suggested that dropping out-of-form batter Babar was a mistake, others hailed the new selection committee for making the right call. Former Pakistan pacerAaqib Javed, who is part of the new selection committee, has now opened up on his chat with India



https://indianexpress.com/article/sports/cricket/india-vs-pakistan-t20-acc-emerging-teams-asia-cup-2024-live-cricket-streaming-9627912/
{'error': 'Failed to retrieve content due to HTTP error'}
https://www.bbc.com/news/articles/cn4z91nm79eo
<Response [200]>
A new county cricket club chairman said they must have a "voice" in the structure of cricket, including the sale of the Hundred. The England and Wales Cricket Board (ECB) has begun the process of selling stakes in the eight franchises. Gary Hoffman took over at Northamptonshire earlier this month, having previously been heavily involved in football. "We have an ownership interest in the Hundred, so we're acutely interested in what is happening in the sale process," he said. The ECB is sellinga 49% stake in each of the eight teams, some of which belongs to the counties, with the other 51% then given to the hosts of those teams, who can opt to keep the stake, sell all or part of it. Mr Hoffman said Northamptonshire needed "to have the



https://www.hindustantimes.com/entertainment/tamil-cinema/from-chennai-28-to-lubber-pandhu-why-tamil-films-on-cricket-always-hit-it-out-of-the-park-101729318017887.html
{'error': 'Failed to retrieve content due to HTTP error'}
https://www.livemint.com/sports/cricket-news/test-cricket-rishabh-pant-has-a-history-of-getting-out-in-his-90s-joins-dhoni-ganguly-against-new-zealand-in-bengalur-11729335235497.html
<Response [200]>
Rishabh Pant, the explosive Indian wicketkeeper-batsman, has showcased remarkable performances in Test cricket, with several standout innings. However, one notable aspect of his career has been his unfortunate dismissals in the 90s on multiple occasions in Testcricket, just short of converting good innings into centuries. Pant's highest score in Test cricket is an unbeaten 159* against Australia at the Sydney Cricket Ground on January 3, 2019. His innings, spanning 189 balls, included 15 boundaries and 1 six, with a strike rate of 84.12, highlighting his ability to b



https://indianexpress.com/article/sports/cricket/india-vs-pakistan-t20-acc-emerging-teams-asia-cup-2024-live-cricket-streaming-9627912/
{'error': 'Failed to retrieve content due to HTTP error'}




https://www.hindustantimes.com/entertainment/tamil-cinema/from-chennai-28-to-lubber-pandhu-why-tamil-films-on-cricket-always-hit-it-out-of-the-park-101729318017887.html
{'error': 'Failed to retrieve content due to HTTP error'}


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Title        7 non-null      object        
 1   Link         7 non-null      object        
 2   Description  7 non-null      object        
 3   content      7 non-null      object        
 4   Published    7 non-null      datetime64[ns]
 5   Source       7 non-null      object        
dtypes: datetime64[ns](1), object(5)
memory usage: 392.0+ bytes


In [51]:
df.head(5)

Unnamed: 0,Title,Link,Description,content,Published,Source
0,PCB Selector Reveals Chat With Gautam Gambhir ...,https://sports.ndtv.com/cricket/pcb-selector-r...,"<a href=""https://news.google.com/rss/articles/...",Pakistan won a Test on home soil after a gap o...,2024-10-19 03:13:05,NDTV Sports
1,Test cricket: Rishabh Pant has a history of ge...,https://www.livemint.com/sports/cricket-news/t...,"<a href=""https://news.google.com/rss/articles/...","Rishabh Pant, the explosive Indian wicketkeepe...",2024-10-19 11:02:06,Mint
2,IND-A vs PAK-A Live Cricket Streaming: When an...,https://indianexpress.com/article/sports/crick...,"<a href=""https://news.google.com/rss/articles/...",{'error': 'Failed to retrieve content due to H...,2024-10-19 03:05:52,The Indian Express
3,Northamptonshire county cricket needs 'voice' ...,https://www.bbc.com/news/articles/cn4z91nm79eo,"<a href=""https://news.google.com/rss/articles/...",A new county cricket club chairman said they m...,2024-10-19 14:06:33,BBC.com
4,India v New Zealand: Tourists need 107 to win ...,https://www.bbc.com/sport/cricket/articles/cvg...,"<a href=""https://news.google.com/rss/articles/...",New Zealand have lost 10 Tests and drawn nine ...,2024-10-19 12:18:00,BBC.com


summary of content

In [55]:
!pip install langchain



In [53]:
import google.generativeai as genai

from langchain.text_splitter import RecursiveCharacterTextSplitter
genai.configure(api_key="GEMINI_API_KEY")
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

In [54]:
apikey=input("Enter your api key")
genai.configure(api_key=apikey)

Enter your api keyAIzaSyBy0EOeiIQQpVOwWmberi28ecCLPBuERuM


In [56]:
def gemini_generate_response(prompt):
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=generation_config,
    )

    chat_session = model.start_chat(
        history=[
            {"role": "user", "parts": [prompt]},
        ]
    )

    response = chat_session.send_message(prompt)
    return response.text

In [57]:
# 3. content summary

def content_summary(content):
    prompt = f'''
    Provide a detailed summarization of the content below:

    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    {content}
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
    '''
    return gemini_generate_response(prompt)

In [58]:
#update the df content column with a new column as content summary
df['content_summary'] = df['content'].apply(content_summary)


In [59]:
df.head(5)

Unnamed: 0,Title,Link,Description,content,Published,Source,content_summary
0,PCB Selector Reveals Chat With Gautam Gambhir ...,https://sports.ndtv.com/cricket/pcb-selector-r...,"<a href=""https://news.google.com/rss/articles/...",Pakistan won a Test on home soil after a gap o...,2024-10-19 03:13:05,NDTV Sports,This news article reports on Pakistan's victor...
1,Test cricket: Rishabh Pant has a history of ge...,https://www.livemint.com/sports/cricket-news/t...,"<a href=""https://news.google.com/rss/articles/...","Rishabh Pant, the explosive Indian wicketkeepe...",2024-10-19 11:02:06,Mint,The text focuses on Indian cricketer Rishabh P...
2,IND-A vs PAK-A Live Cricket Streaming: When an...,https://indianexpress.com/article/sports/crick...,"<a href=""https://news.google.com/rss/articles/...",{'error': 'Failed to retrieve content due to H...,2024-10-19 03:05:52,The Indian Express,The provided content indicates an error occurr...
3,Northamptonshire county cricket needs 'voice' ...,https://www.bbc.com/news/articles/cn4z91nm79eo,"<a href=""https://news.google.com/rss/articles/...",A new county cricket club chairman said they m...,2024-10-19 14:06:33,BBC.com,The new chairman of Northamptonshire County Cr...
4,India v New Zealand: Tourists need 107 to win ...,https://www.bbc.com/sport/cricket/articles/cvg...,"<a href=""https://news.google.com/rss/articles/...",New Zealand have lost 10 Tests and drawn nine ...,2024-10-19 12:18:00,BBC.com,This text describes a thrilling cricket match ...


In [60]:
!pip install transformers torch





In [61]:
import pandas as pd
from transformers import pipeline


In [62]:
sentiment_analyzer = pipeline("sentiment-analysis")

# Function to analyze sentiment using the transformers model
def analyze_sentiment(text):
    result = sentiment_analyzer(text)[0]
    return result['label'], result['score']

# Merge title and content for sentiment analysis
df['Merged_Text'] = df['Title'] + " " + df['content_summary']

# Apply sentiment analysis and create separate columns for label and score
df[['Sentiment_Label', 'Sentiment_Score']] = df['Merged_Text'].apply(analyze_sentiment).apply(pd.Series)

# Display the DataFrame with the sentiment analysis result
print(df[['Title', 'content_summary', 'Sentiment_Label', 'Sentiment_Score']])

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



                                               Title  \
0  PCB Selector Reveals Chat With Gautam Gambhir ...   
1  Test cricket: Rishabh Pant has a history of ge...   
2  IND-A vs PAK-A Live Cricket Streaming: When an...   
3  Northamptonshire county cricket needs 'voice' ...   
4  India v New Zealand: Tourists need 107 to win ...   
5  Where to watch South Africa vs. New Zealand: W...   
6  From Chennai 28 to Lubber Pandhu: Why Tamil fi...   

                                     content_summary Sentiment_Label  \
0  This news article reports on Pakistan's victor...        POSITIVE   
1  The text focuses on Indian cricketer Rishabh P...        POSITIVE   
2  The provided content indicates an error occurr...        NEGATIVE   
3  The new chairman of Northamptonshire County Cr...        POSITIVE   
4  This text describes a thrilling cricket match ...        NEGATIVE   
5  The article is an announcement for the upcomin...        POSITIVE   
6  The provided content indicates an error enco

In [63]:
df.to_excel('data_with_ml.xlsx', index=False)