<a href="https://colab.research.google.com/github/PavithraArjunan/Article_webscraping_NLP_model/blob/main/backend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyngrok

In [None]:
# SECURITY: never commit a real ngrok authtoken to a notebook. The token that was
# previously hard-coded on this line must be treated as leaked and revoked in the
# ngrok dashboard. Provide a fresh token through the NGROK_AUTHTOKEN environment
# variable (or a Python variable of the same name defined in an earlier cell).
!ngrok config add-authtoken "$NGROK_AUTHTOKEN"

In [None]:
pip install gtts

In [None]:
pip install deep_translator

In [None]:
pip install flask_cors

In [None]:
from flask import Flask, jsonify, request
from pyngrok import ngrok
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from collections import defaultdict
import json
import spacy
from gtts import gTTS
from deep_translator import GoogleTranslator
import os

# Load spaCy NLP model for topic extraction
# Loaded once at module level; extract_topics() runs every summary through this pipeline.
nlp = spacy.load("en_core_web_sm")

# Flask app initialization
app = Flask(__name__)

# List of URLs to scrape
# Every entry is passed to scrape_article() by the module-level scraping step further down.
urls = [
    "https://apnews.com/article/tesla-sales-2024-drop-electric-vehicles-69af17c4e606625694af8293db25b2f3",
    "https://www.theweek.in/news/biz-tech/2025/02/25/without-low-cost-model-teslas-potential-entry-to-india-unlikely-to-hurt-indian-car-makers-say-analysts.html",
    "https://apnews.com/article/cybertruck-recall-tesla-elon-musk-nhtsa-8c517e21aa1119d74b9db39f6aca01b7",
    "https://www.news18.com/business/andhra-pradesh-makes-pitch-again-to-attract-teslas-manufacturing-plant-report-9236626.html",
    "https://www.motorzest.com/2015/03/mahindra-two-wheeler-sales-down-537-in.html",
    "https://www.prnewswire.com/in/news-releases/mahindra-first-choice-wheels-mfcwl-raises-15-million-from-san-francisco-based-valiant-capital-497010601.html",
    "https://www.rediff.com/business/report/tata-mahindra-will-not-allow-tesla-to-dominate/20250228.htm",
    "https://www.etnownews.com/companies/tata-motors-board-meeting-result-big-announcement-ahead-of-demerger-2-lakh-ncds-of-rs-100000-face-value-article-119216849",
    "https://www.pbs.org/newshour/nation/nissan-and-honda-to-attempt-a-merger-that-would-create-the-worlds-no-3-automaker",
    "https://www.indiainfoline.com/news/markets/tata-motors-and-mahindra-bet-big-on-electric-suvs-aiming-to-challenge-global-players"
]

# HTTP headers sent with each scrape request; only a User-Agent is set.
headers = {"User-Agent": "Mozilla/5.0"}

# Company Keywords Mapping
# Maps a display name to lowercase substrings that identify_company() looks for in
# the lowercased "title + summary" text. Insertion order matters: identify_company()
# returns the first company with a matching keyword, so an article mentioning
# several companies is attributed to whichever appears earliest in this dict.
company_keywords = {
    "Tesla": ["tesla", "elon musk", "cybertruck"],
    "Maruti Suzuki": ["maruti", "suzuki"],
    "Mahindra": ["mahindra", "xuv700", "thar"],
    "Tata Motors": ["tata", "jaguar", "land rover"],
    "Nissan-Honda": ["nissan", "honda"]
}

def analyze_sentiment(text):
    """Map the TextBlob polarity of *text* onto a five-level sentiment label.

    Returns one of "Very Positive" (polarity above 0.5), "Positive" (above 0),
    "Neutral" (exactly 0), "Negative" (above -0.5) or "Very Negative"
    (everything else).
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity == 0:
        return "Neutral"
    if polarity > 0:
        return "Very Positive" if polarity > 0.5 else "Positive"
    # Only non-positive, non-zero polarities reach this point.
    return "Negative" if polarity > -0.5 else "Very Negative"

def identify_company(title, summary):
    """Return the first company whose keyword occurs in the title or summary.

    The title and summary are joined and lowercased, then each entry of
    ``company_keywords`` is checked in dict order using plain substring
    matching. "Other" is returned when no keyword matches.
    """
    haystack = f"{title} {summary}".lower()
    matching_companies = (
        name
        for name, needles in company_keywords.items()
        if any(needle in haystack for needle in needles)
    )
    # next() stops at the first hit, mirroring an early return from a loop.
    return next(matching_companies, "Other")

def extract_topics(summary):
    """Extract topic strings from *summary* using spaCy named entities.

    Only entities labelled ORG, PRODUCT or EVENT are kept.

    Returns:
        A list of unique entity texts in order of first appearance. The order
        is deterministic, so repeated runs over the same summary produce
        identical output (a plain ``set`` round-trip would shuffle the strings
        between interpreter runs).
    """
    wanted_labels = {"ORG", "PRODUCT", "EVENT"}
    doc = nlp(summary)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ in wanted_labels))

def scrape_article(url, timeout=10):
    """Fetch one article page and derive its title, summary, sentiment, topics and company.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the scrape forever.

    Returns:
        A dict with the keys "Title", "Summary", "Sentiment", "Topics" and
        "Company", or ``None`` when the page could not be fetched or parsed.
        Failures are reported with ``print`` rather than raised, so one bad
        URL does not abort the whole scrape.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"❌ Failed to fetch {url}")
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Look the <title> tag up once instead of searching the tree twice.
        title_tag = soup.find("title")
        title = title_tag.text.strip() if title_tag is not None else "No title found"

        # A <meta name="description"> tag may exist without a "content"
        # attribute; fall back to the placeholder instead of raising KeyError
        # and discarding the whole article.
        summary_tag = soup.find("meta", attrs={"name": "description"})
        content = summary_tag.get("content") if summary_tag is not None else None
        summary = content.strip() if content else "No summary found"

        sentiment = analyze_sentiment(summary)
        company = identify_company(title, summary)
        topics = extract_topics(summary)

        return {"Title": title, "Summary": summary, "Sentiment": sentiment, "Topics": topics, "Company": company}
    except Exception as e:
        # Deliberately broad: scraping is best-effort and any failure
        # (network, parsing, NLP) should skip this URL, not crash the run.
        print(f"⚠️ Error scraping {url}: {e}")
        return None

# Scrape all URLs (failed scrapes come back as None and are dropped)
scraped_articles = [scrape_article(url) for url in urls]
articles = [article for article in scraped_articles if article]

# Organize articles by company
company_data = defaultdict(lambda: {"Articles": [], "Sentiment Distribution": defaultdict(int)})

for article in articles:
    entry = company_data[article["Company"]]
    entry["Articles"].append(article)
    entry["Sentiment Distribution"][article["Sentiment"]] += 1

# Generate Comparative Sentiment Analysis
for company, data in company_data.items():
    # Use a dedicated name so the module-level `articles` list is not rebound.
    company_articles = data["Articles"]
    comparisons = []
    common_topics = {}  # dict used as an ordered set
    unique_topics = []

    # Compare each article with the one that follows it (1-based numbering).
    for i, (article1, article2) in enumerate(zip(company_articles, company_articles[1:]), start=1):
        topics1, topics2 = article1["Topics"], article2["Topics"]
        comparisons.append({
            "Comparison": f"Article {i} discusses {topics1} while Article {i + 1} covers {topics2}",
            "Impact": "Shows difference in focus across articles."
        })
        # Accumulate the topics shared by every adjacent pair; assigning here
        # would keep only the last pair's overlap.
        common_topics.update(dict.fromkeys(topic for topic in topics1 if topic in topics2))
        unique_topics.append({
            f"Unique Topics in Article {i}": [topic for topic in topics1 if topic not in topics2],
            f"Unique Topics in Article {i + 1}": [topic for topic in topics2 if topic not in topics1]
        })

    distribution = data["Sentiment Distribution"]
    data["Comparative Sentiment Score"] = {
        "Sentiment Distribution": dict(distribution),
        "Coverage Differences": comparisons,
        "Topic Overlap": {
            "Common Topics": list(common_topics),
            "Unique Topics in Articles": unique_topics
        }
    }

    # .get() avoids inserting phantom zero counts into the defaultdict, and the
    # "Very ..." labels are counted together with their milder counterparts.
    positive_count = distribution.get("Positive", 0) + distribution.get("Very Positive", 0)
    negative_count = distribution.get("Negative", 0) + distribution.get("Very Negative", 0)
    if positive_count > negative_count:
        sentiment_summary = "positive"
    elif negative_count > positive_count:
        sentiment_summary = "negative"
    else:
        # Ties (including all-neutral coverage) are not reported as negative.
        sentiment_summary = "neutral"
    data["Final Sentiment Analysis"] = f"Company’s latest news coverage leans towards {sentiment_summary}."

# Snapshot of the per-company results served by the API. Every topic collection
# is already a list, so a shallow copy is enough; the copy keeps keys that are
# added to company_data later (e.g. "audio_file") out of the published data.
output_data = {company: dict(data) for company, data in company_data.items()}

# Function to convert text to Hindi speech

# GoogleTranslator is understood to reject payloads of 5000 characters or more
# (confirm against the deep-translator docs); stay below that with some headroom.
_MAX_TRANSLATE_CHARS = 4500


def _chunk_text(text, limit):
    """Split *text* into pieces shorter than *limit*, breaking at line ends where possible."""
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        # A single over-long line is hard-split so no piece reaches the limit.
        while len(line) >= limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:limit - 1])
            line = line[limit - 1:]
        if len(current) + len(line) >= limit:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def text_to_speech(text, filename="output.mp3"):
    """Translate *text* to Hindi and save it as spoken audio.

    Args:
        text: Source text in any language (the translator auto-detects it).
        filename: Name of the MP3 file to create inside ``static/audio``.

    Returns:
        The path of the saved MP3 file (``static/audio/<filename>``).
    """
    translator = GoogleTranslator(source="auto", target="hi")

    # Translate in bounded chunks so long news digests do not exceed the
    # translator's per-request size limit. Short texts form a single chunk and
    # are translated in one request.
    translated_parts = []
    for chunk in _chunk_text(text, _MAX_TRANSLATE_CHARS):
        translated = translator.translate(chunk)
        # Keep the source text if the translator returns nothing for a chunk.
        translated_parts.append(translated if translated else chunk)
    translated_text = "\n".join(translated_parts)

    tts = gTTS(translated_text, lang="hi")

    # Ensure static/audio directory exists
    audio_folder = "static/audio"
    os.makedirs(audio_folder, exist_ok=True)

    file_path = os.path.join(audio_folder, filename)
    tts.save(file_path)

    print(f"✅ Audio saved at {file_path}")
    return file_path  # Return the correct file path


# Extract text and generate audio for a company
def generate_audio_for_company(company_data):
    """Create one Hindi news-digest MP3 per company.

    Args:
        company_data: Mapping of company name to its data dict. Each dict must
            provide "Articles" (items with "Title" and "Summary") and
            "Comparative Sentiment Score" -> "Coverage Differences" (items
            with "Comparison" and "Impact").

    Returns:
        Dict mapping each company name to the absolute path of the MP3 that
        was actually written for it.
    """
    audio_files = {}
    for company, data in company_data.items():
        # Collect the pieces and join once instead of repeated string +=.
        parts = [f"{company} की ताज़ा खबरें:\n"]
        parts.extend(
            f"\nशीर्षक: {article['Title']}\nसारांश: {article['Summary']}\n"
            for article in data["Articles"]
        )
        parts.extend(
            f"\nतुलना: {comparison['Comparison']}\nप्रभाव: {comparison['Impact']}\n"
            for comparison in data["Comparative Sentiment Score"]["Coverage Differences"]
        )

        audio_filename = f"{company}_news.mp3"
        # text_to_speech() saves under static/audio and returns that path.
        # Resolving the bare filename instead would point at a file in the
        # working directory that does not exist.
        saved_path = text_to_speech("".join(parts), audio_filename)
        audio_files[company] = os.path.abspath(saved_path)  # Store file path
    return audio_files


# Define route to fetch company data
@app.route('/get_company_data', methods=['GET'])
def get_company_data():
    """Return the analysis for the company named in ``?company_name=...``.

    The name is matched case-insensitively against the scraped companies. On
    the first request for a company its Hindi audio digest is generated and
    the resulting URL is cached in ``company_data``.

    Returns:
        JSON with "company_data" and "audio_file" (URL, or null when no audio
        is available), or JSON with an "error" message when the company is
        unknown.
    """
    # Local import keeps this handler self-contained.
    from urllib.parse import quote

    user_input = request.args.get('company_name', '').strip().lower()
    company_found = next((name for name in company_data if name.lower() == user_input), None)

    if company_found is None:
        return jsonify({"error": "No data found for the entered company. Please check the spelling."})

    record = company_data[company_found]

    # Generate the audio only if it does not already exist
    if "audio_file" not in record:
        try:
            audio_files = generate_audio_for_company({company_found: record})
        except Exception:
            # Translation and TTS depend on external services; a failure there
            # should not prevent the textual analysis from being returned.
            app.logger.exception("Audio generation failed for %s", company_found)
            audio_files = {}

        saved_path = audio_files.get(company_found)
        if saved_path:
            # Company names may contain spaces ("Tata Motors"), so the file
            # name must be percent-encoded to form a valid URL.
            audio_name = os.path.basename(saved_path)
            record["audio_file"] = f"{request.host_url}static/audio/{quote(audio_name)}"

    return jsonify({
        "company_data": output_data[company_found],
        "audio_file": record.get("audio_file")  # Send stored audio URL
    })


# Launch the Flask server behind an ngrok tunnel when run as the main module
if __name__ == '__main__':
    listen_port = 5000

    # Open the tunnel first so the public address can be reported before serving
    public_url = ngrok.connect(listen_port)
    print(f" * Flask app is running on {public_url}")

    # Serve requests on the same local port the tunnel forwards to
    app.run(port=listen_port)
