In [12]:
!pip install pandas scikit-learn openai
!pip install openai==1.3.7
!pip install pandas scikit-learn
!pip install selenium==4.15.2
!pip install groq



In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import os

# Setup Chrome options (visible browser)
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

# ChromeDriver location: the CHROMEDRIVER_PATH environment variable overrides the
# original machine-specific default. When the resolved path does not exist on disk,
# no path is passed so Selenium can locate a driver on its own.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\UseR\Intern Task\Task\chromedriver-win64\chromedriver.exe",
)
service = Service(chromedriver_path) if os.path.exists(chromedriver_path) else Service()
driver = webdriver.Chrome(service=service, options=options)

# Output column names, in the order the first nine <td> cells of a row are read.
# NOTE(review): this mapping is purely positional and is not checked against the
# table's header row — confirm the live table has no extra columns (e.g. daily
# "new" figures) between these before trusting the labels.
column_names = [
    "Rank",
    "Country",
    "Total Cases",
    "Total Deaths",
    "Total Recovered",
    "Tot Cases/1M pop",
    "Deaths/1M pop",
    "Population",
    "Continent",
]

try:
    # Open the Worldometer page
    driver.get("https://www.worldometers.info/coronavirus/")
    print("🌐 Opened website, waiting for table to load...")

    # Wait for table to fully load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "main_table_countries_today"))
    )

    time.sleep(5)  # Extra wait for JS-rendered content

    # Scrape the table
    table = driver.find_element(By.ID, "main_table_countries_today")
    rows = table.find_elements(By.TAG_NAME, "tr")

    data = []
    for row in rows:
        # Skip rows with style="display: none" (continent totals).
        # get_attribute may return None when the attribute is absent, so normalise first.
        style = row.get_attribute("style") or ""
        if "display: none" in style:
            continue
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) < len(column_names):  # Ensure row has enough columns
            continue
        # zip stops after the nine named columns; any further cells are ignored.
        data.append({name: cell.text.strip() for name, cell in zip(column_names, cols)})

    # An empty scrape would otherwise replace a good CSV with a header-less file.
    if not data:
        raise RuntimeError("No table rows were scraped; the existing CSV was left untouched.")

    # Save data to CSV safely
    filename = "worldometers_covid1.csv"
    if os.path.exists(filename):
        try:
            os.remove(filename)
        except OSError as exc:
            # Typically the file is locked by another program. Raise instead of
            # calling exit(), which would end the notebook session; the browser
            # is still closed by the finally clause below.
            print(f"⚠️ File '{filename}' is open. Please close it and rerun.")
            raise RuntimeError(f"Could not replace '{filename}'.") from exc

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"✅ Data saved to {filename}")

finally:
    driver.quit()
    print("🚪 Browser closed.")

🌐 Opened website, waiting for table to load...
✅ Data saved to worldometers_covid1.csv
🚪 Browser closed.


In [None]:
import os
import re
from typing import List, Dict

import pandas as pd
from groq import Groq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize Groq client
# Prefer the GROQ_API_KEY environment variable so a real key never has to be saved
# in the notebook; the original placeholder string is kept as the fallback value.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "api_key"))  # Set GROQ_API_KEY or replace the placeholder

# Load and preprocess CSV data
def load_and_preprocess_data(file_path: str = "worldometers_covid1.csv") -> pd.DataFrame:
    """Read the scraped CSV and prepare it for retrieval.

    Steps:
      1. Read ``file_path`` with pandas.
      2. Check that all nine expected columns are present.
      3. Convert the six metric columns to ``float``; thousands separators are
         removed and anything that still cannot be parsed becomes NaN.
      4. Add ``search_country`` (lower-cased country name with a few aliases
         normalised) and ``combined_text`` (the text that gets vectorised, with
         the country name repeated three times to give it more weight).

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The DataFrame with cleaned metric columns plus the two helper columns.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If any expected column is missing from the CSV.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"CSV file '{file_path}' not found. Please run the scraper first.") from exc

    # Verify required columns
    expected_columns = ["Rank", "Country", "Total Cases", "Total Deaths", "Total Recovered", "Tot Cases/1M pop", "Deaths/1M pop", "Population", "Continent"]
    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns in CSV: {missing_columns}")

    # Clean numerical columns.
    # Coerce cell by cell: a single unparsable value (blank, "N/A", stray text)
    # becomes NaN instead of silently leaving the whole column as strings.
    numerical_columns = ["Total Cases", "Total Deaths", "Total Recovered", "Tot Cases/1M pop", "Deaths/1M pop", "Population"]
    for col in numerical_columns:
        as_text = df[col].astype(str).str.replace(",", "", regex=False).str.strip()
        df[col] = pd.to_numeric(as_text, errors="coerce").astype(float)

    # Create search-friendly country names (whole-value alias replacement)
    df["search_country"] = df["Country"].str.lower().replace({
        "us": "usa", "united states": "usa", "uk": "united kingdom",
        "south korea": "s. korea", "north korea": "dprk"
    })

    # Create combined_text for vectorization, weighting Country higher
    df["combined_text"] = df.apply(
        lambda row: f"{row['Country']} {row['Country']} {row['Country']} | {row['Continent']} | " +
        " | ".join(str(v) for v in row[["Rank", "Total Cases", "Total Deaths", "Total Recovered", "Tot Cases/1M pop", "Deaths/1M pop", "Population"]]),
        axis=1
    )
    return df

# Load the dataset once at module level; the functions below read the global `df`.
try:
    df = load_and_preprocess_data()
except Exception as e:
    print(f"Error loading data: {str(e)}")
    # Re-raise rather than calling exit(): inside a notebook exit() ends the whole
    # IPython session, whereas raising only stops this cell and keeps the traceback.
    raise

# Enhanced retrieval function
def retrieve_top_k(query: str, k: int = 5) -> List[Dict]:
    """Return up to ``k`` rows of the global ``df`` that best match ``query``.

    The query is first inspected for intent (the word "world", a continent name,
    one or more country names, comparison keywords) and ``df`` is narrowed
    accordingly. The remaining rows are ranked by TF-IDF cosine similarity
    between their ``combined_text`` and the query. For comparison queries that
    mention a metric, the selected rows are then ordered by that metric,
    largest first.

    Args:
        query: Free-text user question.
        k: Maximum number of rows to return; values <= 0 yield an empty list.

    Returns:
        A list of row dicts holding the nine data columns, or ``[]`` when the
        filters leave no rows.
    """
    if k <= 0:
        return []

    query_lower = query.lower()

    # Query analysis. "oceania" is listed so the alias handled below is reachable.
    continents = ["europe", "north america", "asia", "south america", "africa", "australia/oceania", "oceania"]
    continent_name = next((c for c in continents if c in query_lower), None)
    is_continent_query = continent_name is not None
    is_world_query = "world" in query_lower
    is_comparison = any(word in query_lower for word in ["highest", "lowest", "most", "least", "top", "compare"])

    # Match country names as whole terms so that e.g. "niger" does not match a
    # query about "nigeria"; missing/empty names are skipped.
    country_matches = [
        c for c in df["search_country"].dropna().unique()
        if isinstance(c, str) and c
        and re.search(rf"(?<!\w){re.escape(c)}(?!\w)", query_lower)
    ]

    # Filter dataframe based on query intent
    if is_world_query:
        df_subset = df[df["Country"].str.lower() == "world"]
    elif is_continent_query:
        # Map query continent to CSV continent
        if continent_name in ("australia/oceania", "oceania"):
            continent_name = "Australia/Oceania"
        else:
            continent_name = continent_name.title()
        df_subset = df[df["Continent"] == continent_name]
        if country_matches:
            df_subset = df_subset[df_subset["search_country"].isin(country_matches)]
    elif country_matches:
        df_subset = df[df["search_country"].isin(country_matches)]
    else:
        df_subset = df[df["Country"] != "World"]  # Exclude World for general queries

    if df_subset.empty:
        return []

    # Vectorize the candidate rows together with the query (query is the last row)
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(df_subset["combined_text"].tolist() + [query])
    similarity = cosine_similarity(matrix[-1], matrix[:-1]).flatten()
    top_positions = similarity.argsort()[-k:][::-1]
    top_rows = df_subset.iloc[top_positions]

    # Sort for comparison queries. The per-million test must come before the plain
    # "deaths" test, otherwise it can never be reached.
    sort_column = None
    if is_comparison:
        if "cases" in query_lower:
            sort_column = "Total Cases"
        elif "deaths/1m" in query_lower or "deaths per million" in query_lower:
            sort_column = "Deaths/1M pop"
        elif "deaths" in query_lower:
            sort_column = "Total Deaths"
    if sort_column is not None:
        # Sort the selected rows directly; going back through .iloc with the
        # sorted frame's index labels would pick the wrong rows on a filtered subset.
        top_rows = top_rows.sort_values(sort_column, ascending=False)

    # Return top-k records
    return top_rows[["Rank", "Country", "Total Cases", "Total Deaths", "Total Recovered", "Tot Cases/1M pop", "Deaths/1M pop", "Population", "Continent"]].to_dict("records")

# RAG query function
def _format_metric(value) -> str:
    """Render one metric for the prompt without ever raising.

    Missing values (None / NaN) become the text "None", which is the marker the
    prompt instructs the model to look for. Whole-number floats are shown without
    a trailing ".0". Values that do not support the "," format spec are passed
    through as plain text.
    """
    if value is None:
        return "None"
    if isinstance(value, float):
        if value != value:  # NaN is the only float that is not equal to itself
            return "None"
        if value.is_integer():
            return f"{int(value):,}"
    try:
        return f"{value:,}"
    except (TypeError, ValueError):
        return str(value)


# RAG query function
def rag_query(question: str, context_list: List[Dict], model: str = "llama3-70b-8192") -> str:
    """Answer ``question`` with the chat model, grounded on the retrieved rows.

    Args:
        question: The user's question; an empty value returns a fixed message.
        context_list: Row dicts to expose to the model. Each must contain the
            keys 'Country', 'Continent', 'Total Cases', 'Total Deaths',
            'Total Recovered', 'Tot Cases/1M pop', 'Deaths/1M pop' and
            'Population'. An empty list returns a fixed "no data" message.
        model: Model identifier passed to the chat-completions call.

    Returns:
        The model's stripped reply, one of the fixed messages above, or
        "Error generating response: ..." if the API call raises.
    """
    if not question:
        return "Please enter a valid question."

    if not context_list:
        return "Sorry, I couldn't find relevant data to answer your question. Try specifying a country or continent."

    # Define context string: one header line, then one pipe-separated line per row
    metric_keys = ["Total Cases", "Total Deaths", "Total Recovered", "Tot Cases/1M pop", "Deaths/1M pop", "Population"]
    context = "Data (Country | Continent | Total Cases | Total Deaths | Total Recovered | Total Cases/1M Pop | Deaths/1M Pop | Population):\n"
    for item in context_list:
        fields = [str(item['Country']), str(item['Continent'])]
        fields.extend(_format_metric(item[key]) for key in metric_keys)
        context += " | ".join(fields) + "\n"

    # Optimized prompt for RAG
    prompt = f"""You are an expert assistant answering questions about COVID-19 data using only the provided dataset. Follow these rules:
- Answer concisely and precisely, using exact numbers formatted with commas (e.g., 1,234,567).
- For comparisons (e.g., 'highest', 'top'), rank the data and provide the top result(s) with values.
- For 'compare' queries, list metrics side by side.
- If a value is None, state 'Data unavailable'.
- For continent queries, note that only country-level data is available. Aggregate data for countries in the specified continent if needed, or state if aggregation is not possible.
- If the question is unanswerable, state: 'This question cannot be answered with the provided data.'
- Do not speculate or use external information.

Examples:
Q: Total cases in USA?
A: The USA has 111,820,082 total COVID-19 cases.

Q: Which country has the highest cases?
A: The USA has the highest total cases with 111,820,082.

Q: Compare deaths in USA and India.
A: USA: 1,219,487 deaths | India: 533,570 deaths.

Q: Total cases in Europe?
A: The total COVID-19 cases in Europe cannot be directly calculated from country-level data alone, as continent totals are not provided. Please specify a country or another metric.

Q: Cases in Narnia?
A: This question cannot be answered with the provided data.

Data:
{context}

Question: {question}
Answer:"""

    try:
        response = client.chat.completions.create(
            model=model,  # More powerful model for precision
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.3  # Lower temperature for factual responses
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Any failure in the call is reported as text so the chat loop keeps going.
        return f"Error generating response: {str(e)}"

# RAG chatbot
def rag_chatbot(verbose: bool = False):
    """Run an interactive question/answer loop on stdin/stdout.

    Each non-empty question is passed to ``retrieve_top_k`` and, when rows come
    back, to ``rag_query``; the answer is printed. The loop ends when the user
    types 'exit' (any casing), when stdin is closed, or when input is interrupted.

    Args:
        verbose: When True, print the retrieved rows before each answer.

    Returns:
        None.
    """
    print("RAG Chatbot for COVID-19 Data (type 'exit' to quit)")
    while True:
        try:
            user_question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session instead of surfacing a traceback.
            print()
            break
        if user_question.lower() == "exit":
            break
        if not user_question:
            print("Bot: Please enter a valid question.")
            continue

        # Retrieve context
        relevant_context = retrieve_top_k(user_question)
        if not relevant_context:
            print("Bot: No relevant data found. Try specifying a country, continent, or metric (e.g., cases, deaths).")
            continue

        # Verbose output: floats are shown with thousands separators
        if verbose:
            print("Debug: Retrieved context:")
            for item in relevant_context:
                print({k: f"{v:,}" if isinstance(v, float) else v for k, v in item.items()})

        # Generate response
        response = rag_query(user_question, relevant_context)
        print("Answer:", response)

# Run the chatbot
# Start the interactive session (with retrieved-context debug output) only when
# this code is executed as the entry point rather than imported.
if __name__ == "__main__":
    rag_chatbot(verbose=True)

RAG Chatbot for COVID-19 Data (type 'exit' to quit)
