In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_news(api_key, selected_date,language):
    url = 'https://newsapi.org/v2/everything'
    
    # Format selected_date to match NewsAPI's date format
    selected_date_str = datetime.strptime(selected_date, '%Y-%m-%d').strftime('%Y-%m-%d')
    
    # Define the parameters for the API request
    params = {
        'q': 'India',  # Broad keyword to fetch news
        'from': selected_date_str,  # Start date
        'to': selected_date_str,    # End date
        'apiKey': api_key,
        'language': language 
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raises HTTPError for bad responses
    except requests.RequestException as e:
        logger.error(f"Error fetching news: {e}")
        return []

    data = response.json()

    # Function to clean the description
    def clean_description(description):
        clean_text = re.sub(r'<[^>]+>', '', description)
        clean_text = re.sub(r'\[\d+\s\w+\]', '', clean_text)
        clean_text = re.sub(r'\[.*?\]', '', clean_text)
        clean_text = re.sub(r'\s+', ' ', clean_text)
        clean_text = clean_text.strip()
        return clean_text

    # Function to scrape article content
    def scrape_article_content(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            article_response = requests.get(url, headers=headers)
            article_response.raise_for_status()
            soup = BeautifulSoup(article_response.content, 'html.parser')

            article_body = soup.find('main') or soup.find('article')
            if article_body:
                paragraphs = article_body.find_all('p')
                content = ' '.join(p.get_text() for p in paragraphs)
                
                # Clean up extra spaces and newlines
                content = re.sub(r'\s+', ' ', content).strip()
                
                # Ensure content length is between 0 and 5000 characters
                if len(content) > 5000:
                    content = content[:5000]
                elif len(content) == 0:
                    return ""
                
                # Ensure content ends with a full stop if not empty
                if content and not content.endswith('.'):
                    content += '.'
                
                return content
            return ""
        except requests.RequestException:
            # Return empty string for any request-related errors
            return ""

    # Preprocess and filter the news data
    def preprocess_news(news_list):
        valid_news = []
        urls = [article.get('url', '').strip() for article in news_list]
        
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_url = {executor.submit(scrape_article_content, url): url for url in urls}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    full_content = future.result()
                    if full_content:  # Only process if content is not empty
                        for article in news_list:
                            if article.get('url', '').strip() == url:
                                title = article.get('title', '').strip()
                                published = article.get('publishedAt', '').strip()
                                description = article.get('description', '').strip()
                                
                                cleaned_description = clean_description(description)
                                
                                if title and url and published != '1970-01-01T00:00:00Z':
                                    try:
                                        date_published = datetime.strptime(published, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                                        if date_published == selected_date:
                                            valid_news.append({
                                                'title': title,
                                                'link': url,
                                                'date': date_published,
                                                'short_description': cleaned_description,
                                                'content': full_content
                                            })
                                    except ValueError:
                                        continue
                except Exception:
                    # If there's an issue with the future result, simply continue
                    continue

        return valid_news

    news_list = data.get('articles', [])
    filtered_news = preprocess_news(news_list)
    
    return filtered_news

# Example usage
api_key = '95bb6bf55ec742bca7cfeb768f50245f'  # Replace with your actual NewsAPI key
selected_date = '2024-10-10'
language = 'en'
data = get_news(api_key, selected_date,language)
print(data)


[{'title': "Eight-hundred-and-twenty-three! The spirit behind England's record-breaking day", 'link': 'https://www.bbc.com/sport/cricket/articles/c93p72xk13do', 'date': '2024-10-10', 'short_description': "England's record-breaking runscoring is testament to their incredible spirit, says chief cricket writer Stephan Shemilt", 'content': 'Joe Root and Harry Brook became the first two England batters to record scores in excess of 250 in the same Test innings Two days ago, England had shipped 556, had an opener with a dislocated thumb and a stand-in captain out for a duck. They were feeling the heat, literally and figuratively, in the unbearable temperatures that have accompanied the first Test against Pakistan in Multan. Questions over preparation, Ben Stokes\' hamstring and the lack of a reserve opener hung in the molten air. Why was there no fast bowler to cover for the injured Josh Hull and soon-to-be-married Olly Stone? Bowling consultant James Anderson had only just arrived from play

In [2]:
import pickle
clf_loaded_model = pickle.load(open('D:\\sem 7\\Newspaper_Extraction_Summarization_Translation\\backend\\clf_model','rb'))

In [3]:
def predict_categories(docs):
    """
    Predict the categories for a list of news articles.

    Parameters:
    docs (list): List of news articles as strings.

    Returns:
    list: List of predicted categories.
    """
    predicted = clf_loaded_model.predict(docs)
    return predicted.tolist()

In [4]:
def filter_news_by_category(docs, categories, selected_category):
    """
    Filter and display news articles based on the selected category.

    Parameters:
    docs (list): List of news articles as strings.
    categories (list): List of predicted categories corresponding to the docs.
    selected_category (str): The category to filter news articles by.

    Returns:
    list: List of news articles belonging to the selected category.
    """
    filtered_news = [doc for doc, category in zip(docs, categories) if category == selected_category]
    return filtered_news


In [5]:
from flask import Flask, request, jsonify
from threading import Thread
import spacy
from heapq import nlargest
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import pymysql
import pandas as pd
from googletrans import Translator
from flask_cors import CORS


# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

translator = Translator()

app = Flask(__name__)
CORS(app)

@app.route('/')
def home():
    return "Welcome to the News API!!!!!!"

@app.route('/api/get-news', methods=['GET'])
def fetch_news():
    api_key = request.args.get('api_key')
    selected_date = request.args.get('date')
    language = request.args.get('language', 'en')  # Default to English if not provided
    
    if not api_key or not selected_date:
        return jsonify({"error": "API key and date are required"}), 400

    try:
        # Fetch and process news data
        news_data = get_news(api_key, selected_date, language)
        
        # Save the processed news data to the database
        store_news_to_db(news_data)
        
        return jsonify(news_data)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/predict-categories', methods=['GET'])
def predict_categories_endpoint():
    try:
        # Load news data from the database
        df = load_news_from_db()
        docs_new = df['content'].tolist()
        
        # Predict categories
        predicted_categories = predict_categories(docs_new)
        unique_categories = list(set(predicted_categories))
        
        return jsonify({"categories": unique_categories})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# @app.route('/api/filter_news', methods=['GET'])
# def filter_news_endpoint():
#     selected_category = request.args.get('category')
    
#     if not selected_category:
#         return jsonify({"error": "Category parameter is required"}), 400

#     try:
#         # Load news data from the database
#         df = load_news_from_db()
#         docs_new = df['content'].tolist()
#         predicted_categories = predict_categories(docs_new)
        
#         # Convert the selected category to uppercase to match your case-sensitive data
# #         selected_category = selected_category.upper()
        
#         # Filter news by the selected category (case-insensitive)
#         filtered_news = filter_news_by_category(docs_new, predicted_categories, selected_category)
        
#         return jsonify({"news": filtered_news})
#     except Exception as e:
#         return jsonify({"error": str(e)}), 500

@app.route('/api/filter_news', methods=['GET'])
def filter_news_endpoint():
    selected_category = request.args.get('category')
    
    if not selected_category:
        return jsonify({"error": "Category parameter is required"}), 400

    try:
        # Load news data from the database
        df = load_news_from_db()

        # Assuming the DataFrame has 'title', 'link', and 'content' columns
        docs_new = df['content'].tolist()
        titles = df['title'].tolist()
        links = df['link'].tolist()

        predicted_categories = predict_categories(docs_new)

        # Create a DataFrame for predicted categories
        df['predicted_category'] = predicted_categories
        
        # Filter news by the selected category (case-insensitive)
        filtered_news_df = df[df['predicted_category'].str.lower() == selected_category.lower()]

        # Construct the response with titles, links, and content
        filtered_news = []
        for _, row in filtered_news_df.iterrows():
            filtered_news.append({
                "title": row['title'],
                "link": row['link'],
                "content": row['content']  # Include content if needed
            })
        
        return jsonify({"news": filtered_news})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/summarize-news', methods=['GET'])
def summarize_news():
    title = request.args.get('title')
    
    if not title:
        return jsonify({"error": "Title parameter is required"}), 400

    try:
        # Load news data from the database
        df = load_news_from_db()
        selected_news = df[df['title'] == title]

        if selected_news.empty:
            return jsonify({"error": "News article not found"}), 404

        content = selected_news['content'].values[0]

        # Summarize the content
        doc = nlp(content)
        stopwords = list(STOP_WORDS)
        
        word_frequencies = {}
        for word in doc:
            if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

        max_frequency = max(word_frequencies.values())
        for word in word_frequencies.keys():
            word_frequencies[word] = word_frequencies[word] / max_frequency

        sentence_tokens = [sent for sent in doc.sents]
        sentence_scores = {}
        for sent in sentence_tokens:
            for word in sent:
                if word.text.lower() in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

        select_length = int(len(sentence_tokens) * 0.3)
        summary_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
        final_summary = [sent.text for sent in summary_sentences]
        summary = ' '.join(final_summary)
        
        insert_summary_into_db(title,summary)

        return jsonify({"summary": summary})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    
    
@app.route('/api/translate-news', methods=['GET'])
def translate_news():
    title = request.args.get('title')
    target_language = request.args.get('language', 'hi')  # Default to Hindi if no language is provided
    use_summarized = request.args.get('summarized', 'false').lower() == 'true'  # Check if summarized content should be used

    if not title:
        return jsonify({"error": "Title parameter is required"}), 400

    try:
        # Load news data from the database
        if use_summarized:
            # Load summary data from the summary table
            summary_df = load_summary_from_db()
            selected_summary = summary_df[summary_df['title'] == title]

            if selected_summary.empty:
                return jsonify({"error": "Summary not found for the specified title"}), 404

            # Fetch the summarized content
            content = selected_summary['summary'].values[0]  # Use summarized content
        else:
            # Load the original news data from the news table
            news_df = load_news_from_db()
            selected_news = news_df[news_df['title'] == title]

            if selected_news.empty:
                return jsonify({"error": "News article not found"}), 404

            # Fetch the original content
            content = selected_news['content'].values[0]  # Use original content

        # Ensure content is valid and not None or empty
        if not content:
            return jsonify({"error": "Content for the selected news article is empty"}), 404

        # Translate the content to the desired language
        try:
            translated = translator.translate(content, dest=target_language)
        except Exception as translation_error:
            return jsonify({"error": f"Translation failed: {str(translation_error)}"}), 500

        return jsonify({"translated_content": translated.text})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

def store_news_to_db(data):
    df = pd.DataFrame(data)
    if df.empty:
        print("No data to store.")
        return

    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )

    try:
        with connection.cursor() as cursor:
            cursor.execute("TRUNCATE TABLE news_details")
            for _, row in df.iterrows():
                sql = """
                    INSERT INTO news_details (title, link, date, short_description, content)
                    VALUES (%s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (row['title'], row['link'], row['date'], row['short_description'], row['content']))
            connection.commit()
            print("Data successfully stored in the database.")
    except Exception as e:
        print(f"Error storing data to database: {e}")
    finally:
        connection.close()

def load_news_from_db():
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        query = "SELECT * FROM news_details"
        df = pd.read_sql(query, connection)
        return df
    finally:
        connection.close()

        
def insert_summary_into_db(title, summarized_content):
    """Insert the title and summarized content into the 'news_summ' table."""
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        with connection.cursor() as cursor:
            sql = """
                INSERT INTO news_table (title, summary)
                VALUES (%s, %s)
            """
            cursor.execute(sql, (title, summarized_content))
            connection.commit()
            print(f"Summarized content stored for title: {title}")
    except Exception as e:
        print(f"Error storing summarized content: {e}")
    finally:
        connection.close()
        
def load_summary_from_db():
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        query = "SELECT title, summary FROM news_table"  # Assuming your summary table is named 'news_summ'
        df = pd.read_sql(query, connection)
        return df
    finally:
        connection.close()

def run_flask_app():
    app.run(debug=True, use_reloader=False)

# Run the app in a separate thread so Jupyter doesn't block
flask_thread = Thread(target=run_flask_app)
flask_thread.start()

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
  df = pd.read_sql(query, connection)
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:49:04] "[33mGET /api/translate-news?title=Can%20India’s%20version%20of%20The%20Onion%20beat%20hate%20with%20laughter?&language=hi&summarized=false HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:49:22] "[35m[1mGET /api/translate-news?title=India%20furious%20as%20diplomats%20declared%20‘persons%20of%20interest’%20in%20Canada%20probe&language=hi&summarized=false HTTP/1.1[0m" 500 -
ERROR:__main__:Error fetching news: 426 Client Error: Upgrade Required for url: https://newsapi.org/v2/everything?q=India&from=2024-09-12&to=2024-09-12&apiKey=41817dbc42cb43eba4fc5899666f1061&language=en
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:49:53] "GET /api/get-news?api_key=41817dbc42cb43eba4fc5899666f1061&date=2024-09-12&language=en HTTP/1.1" 200 -


No data to store.


INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:50:57] "GET /api/get-news?api_key=41817dbc42cb43eba4fc5899666f1061&date=2024-10-10&language=en HTTP/1.1" 200 -


Data successfully stored in the database.


  df = pd.read_sql(query, connection)
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:08] "GET /api/filter_news?category=ENVIRONMENT HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:15] "[31m[1mGET /api/filter_news HTTP/1.1[0m" 400 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:27] "GET /api/predict-categories HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:34] "GET /api/filter_news?category=ENVIRONMENT HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:40] "GET /api/filter_news?category=TRAVEL HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:51:59] "[33mGET /api/translate-news?title=Which%20teams%20can%20qualify%20for%20the%20Women\\u2019s%20T20%20World%20Cup%20semifinals%20and%20how?&language=hi&summarized=false HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [16/Oct/2024 08:52:31] "[35m[1mGET /api/translate-news?title=Why%20is%20pro-China%20Maldives%20leader%20Muizzu%20seeking%20to%20mend%20India%20ties?&language=hi&summariz

In [5]:
from flask import Flask, request, jsonify
from threading import Thread
import spacy
from heapq import nlargest
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import pymysql
import pandas as pd
import import_ipynb
from googletrans import Translator

from flask_cors import CORS

import nltk

nltk.download('punkt')

# Initialize spaCy
nlp = spacy.load("en_core_web_sm")
translator = Translator()

app = Flask(__name__)
CORS(app)

@app.route('/')
def home():
    return "Welcome to the News API........"

@app.route('/api/get-news', methods=['GET'])
def fetch_news():
    api_key = request.args.get('api_key')
    selected_date = request.args.get('date')
    language = request.args.get('language', 'en')  # Default to English if not provided
    
    if not api_key or not selected_date:
        return jsonify({"error": "API key and date are required"}), 400

    try:
        # Fetch and process news data
        news_data = get_news(api_key, selected_date, language)
        
        # Save the processed news data to the database
        store_news_to_db(news_data)
        
        return jsonify(news_data)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/predict-categories', methods=['GET'])
def predict_categories_endpoint():
    try:
        # Load news data from the database
        df = load_news_from_db()
        docs_new = df['content'].tolist()
        
        # Predict categories
        predicted_categories = predict_categories(docs_new)
        unique_categories = list(set(predicted_categories))
        
        return jsonify({"categories": unique_categories})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# @app.route('/api/filter_news', methods=['GET'])
# def filter_news_endpoint():
#     selected_category = request.args.get('category')
    
#     if not selected_category:
#         return jsonify({"error": "Category parameter is required"}), 400

#     try:
#         # Load news data from the database
#         df = load_news_from_db()
#         docs_new = df['content'].tolist()
#         predicted_categories = predict_categories(docs_new)
        
#         # Convert the selected category to uppercase to match your case-sensitive data
# #         selected_category = selected_category.upper()
        
#         # Filter news by the selected category (case-insensitive)
#         filtered_news = filter_news_by_category(docs_new, predicted_categories, selected_category)
        
#         return jsonify({"news": filtered_news})
#     except Exception as e:
#         return jsonify({"error": str(e)}), 500

@app.route('/api/filter_news', methods=['GET'])
def filter_news_endpoint():
    selected_category = request.args.get('category')
    
    if not selected_category:
        return jsonify({"error": "Category parameter is required"}), 400

    try:
        # Load news data from the database
        df = load_news_from_db()

        # Assuming the DataFrame has 'title', 'link', and 'content' columns
        docs_new = df['content'].tolist()
        titles = df['title'].tolist()
        links = df['link'].tolist()

        predicted_categories = predict_categories(docs_new)

        # Create a DataFrame for predicted categories
        df['predicted_category'] = predicted_categories
        
        # Filter news by the selected category (case-insensitive)
        filtered_news_df = df[df['predicted_category'].str.lower() == selected_category.lower()]

        # Construct the response with titles, links, and content
        filtered_news = []
        for _, row in filtered_news_df.iterrows():
            filtered_news.append({
                "title": row['title'],
                "link": row['link'],
                "content": row['content']  # Include content if needed
            })
        
        return jsonify({"news": filtered_news})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/summarize-news', methods=['GET'])
def summarize_news():
    title = request.args.get('title')
    
    if not title:
        return jsonify({"error": "Title parameter is required"}), 400

    try:
        # Load news data from the database
        df = load_news_from_db()
        selected_news = df[df['title'] == title]

        if selected_news.empty:
            return jsonify({"error": "News article not found"}), 404

        content = selected_news['content'].values[0]

        # Summarize the content
        doc = nlp(content)
        stopwords = list(STOP_WORDS)
        
        word_frequencies = {}
        for word in doc:
            if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

        max_frequency = max(word_frequencies.values())
        for word in word_frequencies.keys():
            word_frequencies[word] = word_frequencies[word] / max_frequency

        sentence_tokens = [sent for sent in doc.sents]
        sentence_scores = {}
        for sent in sentence_tokens:
            for word in sent:
                if word.text.lower() in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

        select_length = int(len(sentence_tokens) * 0.3)
        summary_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
        final_summary = [sent.text for sent in summary_sentences]
        summary = ' '.join(final_summary)
        
        insert_summary_into_db(title,summary)

        return jsonify({"summary": summary})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    
    
@app.route('/api/translate-news', methods=['GET'])
def translate_news():
    title = request.args.get('title')
    target_language = request.args.get('language', 'hi')  # Default to Hindi
    use_summarized = request.args.get('summarized', 'false').lower() == 'true'

    if not title:
        return jsonify({"error": "Title parameter is required"}), 400

    try:
        # Load news data from the database
        if use_summarized:
            summary_df = load_summary_from_db()
            selected_summary = summary_df[summary_df['title'] == title]
            if selected_summary.empty:
                return jsonify({"error": "Summary not found for the specified title"}), 404
            content = selected_summary['summary'].values[0]
        else:
            news_df = load_news_from_db()
            selected_news = news_df[news_df['title'] == title]
            if selected_news.empty:
                return jsonify({"error": "News article not found"}), 404
            content = selected_news['content'].values[0]

        if not isinstance(content, str) or not content.strip():
            return jsonify({"error": "Content for the selected news article is empty or invalid"}), 404

        # Check word count
        word_count = len(content.split())

        # If content has more than 400 words, tokenize and translate sentence by sentence
        if word_count > 400:
            sentences = nltk.sent_tokenize(content)
            translated_sentences = []

            for sentence in sentences:
                translated = translator.translate(sentence, dest=target_language)
                translated_sentences.append(translated.text)

            translated_content = ' '.join(translated_sentences)
        else:
            # Direct translation for shorter content
            translated_content = translator.translate(content, dest=target_language).text

        return jsonify({"translated_content": translated_content})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

def store_news_to_db(data):
    df = pd.DataFrame(data)
    if df.empty:
        print("No data to store.")
        return

    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )

    try:
        with connection.cursor() as cursor:
            cursor.execute("TRUNCATE TABLE news_details")
            for _, row in df.iterrows():
                sql = """
                    INSERT INTO news_details (title, link, date, short_description, content)
                    VALUES (%s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (row['title'], row['link'], row['date'], row['short_description'], row['content']))
            connection.commit()
            print("Data successfully stored in the database.")
    except Exception as e:
        print(f"Error storing data to database: {e}")
    finally:
        connection.close()

def load_news_from_db():
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        query = "SELECT * FROM news_details"
        df = pd.read_sql(query, connection)
        return df
    finally:
        connection.close()

        
def insert_summary_into_db(title, summarized_content):
    """Insert the title and summarized content into the 'news_summ' table."""
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        with connection.cursor() as cursor:
            sql = """
                INSERT INTO news_table (title, summary)
                VALUES (%s, %s)
            """
            cursor.execute(sql, (title, summarized_content))
            connection.commit()
            print(f"Summarized content stored for title: {title}")
    except Exception as e:
        print(f"Error storing summarized content: {e}")
    finally:
        connection.close()
        
def load_summary_from_db():
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='mysqlpassword',
        database='newsdb'
    )
    
    try:
        query = "SELECT title, summary FROM news_table"  # Assuming your summary table is named 'news_summ'
        df = pd.read_sql(query, connection)
        return df
    finally:
        connection.close()

def run_flask_app():
    app.run(debug=True, use_reloader=False)

# Run the app in a separate thread so Jupyter doesn't block
flask_thread = Thread(target=run_flask_app)
flask_thread.start()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hetvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
  df = pd.read_sql(query, connection)
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:20:22] "GET /api/predict-categories HTTP/1.1" 200 -
ERROR:__main__:Error fetching news: 426 Client Error: Upgrade Required for url: https://newsapi.org/v2/everything?q=India&from=2024-09-19&to=2024-09-19&apiKey=41817dbc42cb43eba4fc5899666f1061&language=en
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:20:22] "GET /api/get-news?api_key=41817dbc42cb43eba4fc5899666f1061&date=2024-09-19&language=en HTTP/1.1" 200 -


No data to store.


ERROR:__main__:Error fetching news: 426 Client Error: Upgrade Required for url: https://newsapi.org/v2/everything?q=India&from=2024-09-19&to=2024-09-19&apiKey=41817dbc42cb43eba4fc5899666f1061&language=en
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:20:23] "GET /api/get-news?api_key=41817dbc42cb43eba4fc5899666f1061&date=2024-09-19&language=en HTTP/1.1" 200 -


No data to store.


INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:14] "GET /api/get-news?api_key=95bb6bf55ec742bca7cfeb768f50245f&date=2024-10-22&language=en HTTP/1.1" 200 -
  df = pd.read_sql(query, connection)
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:14] "GET /api/predict-categories HTTP/1.1" 200 -


Data successfully stored in the database.


INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:22] "GET /api/filter_news?category=CRIME HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:23] "GET /api/filter_news?category=CRIME HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:28] "GET /api/filter_news?category=SPORTS HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:31] "GET /api/filter_news?category=SCIENCE HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:32] "GET /api/get-news?api_key=95bb6bf55ec742bca7cfeb768f50245f&date=2024-10-22&language=en HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [23/Oct/2024 10:21:32] "GET /api/predict-categories HTTP/1.1" 200 -


Data successfully stored in the database.
