<a href="https://colab.research.google.com/github/VenuPallela/React-Shopping-UI/blob/main/KnowledgeAlignmentPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

def fetch_wikipedia_knowledge(target):
    """
    Fetch a summary of the target from Wikipedia using its API.

    Args:
        target: Page title to look up, e.g. "Climate Change".

    Returns:
        The ``extract`` field of the summary response, 'No summary available'
        when the response has no such field, or 'Wikipedia fetch failed' when
        the request errors out, times out, returns a non-200 status, or
        returns a body that is not valid JSON.
    """
    # Function-scope import so this block does not depend on other cells.
    from urllib.parse import quote

    # The title is a single path segment: percent-encode everything
    # (including '/', '?' and '#') so it cannot change the request path.
    title = quote(str(target), safe="")
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"

    try:
        # A timeout keeps a stalled connection from hanging the pipeline.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return 'Wikipedia fetch failed'

    if response.status_code != 200:
        return 'Wikipedia fetch failed'

    try:
        data = response.json()
    except ValueError:
        # A 200 response with a non-JSON body is treated as a failed fetch.
        return 'Wikipedia fetch failed'
    return data.get('extract', 'No summary available')

from bs4 import BeautifulSoup
import requests

def fetch_google_results(target):
    """
    Fetch Google search results snippets.

    Args:
        target: Search query text.

    Returns:
        A list with the text of every ``<span class="aCOpRe">`` element in the
        result page (empty when none match), or
        ``["Google search fetch failed"]`` when the request errors out, times
        out, or returns a non-200 status.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Passing the query through ``params`` lets requests URL-encode it, so
        # characters such as '&', '#' or '+' in the target stay part of the query.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": target},
            headers=headers,
            timeout=10,
        )
    except requests.RequestException:
        return ["Google search fetch failed"]

    if response.status_code != 200:
        return ["Google search fetch failed"]

    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): "aCOpRe" is a class name taken from Google's result markup;
    # confirm it still matches the current page, otherwise this returns [].
    return [tag.text for tag in soup.find_all("span", class_="aCOpRe")]

from sklearn.feature_extraction.text import TfidfVectorizer

def rank_and_filter_snippets(target, snippets, top_k=5):
    """
    Use TF-IDF to rank Google snippets for relevance to the target.

    Each snippet is scored by the cosine similarity between its TF-IDF vector
    and the target's TF-IDF vector; the highest-scoring snippets come first.
    Snippets with equal scores keep their original relative order.

    Args:
        target: Query text the snippets are compared against.
        snippets: Iterable of snippet strings.
        top_k: Maximum number of snippets to return (default 5).

    Returns:
        A list of at most ``top_k`` snippets, most relevant first. Returns an
        empty list when there are no snippets, and the first ``top_k``
        snippets unranked when no usable vocabulary can be built (for example
        when every document contains only stop words).
    """
    snippets = list(snippets)
    if not snippets:
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([target] + snippets)
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary; there is
        # nothing to rank by, so fall back to the incoming order.
        return snippets[:top_k]

    # Row 0 is the target, rows 1.. are the snippets. TfidfVectorizer
    # L2-normalises rows by default, so the dot product is the cosine similarity.
    target_vector = tfidf_matrix[:1]
    snippet_vectors = tfidf_matrix[1:]
    scores = snippet_vectors.dot(target_vector.T).toarray().ravel()

    # sorted() is stable, so ties preserve the original snippet order.
    order = sorted(range(len(snippets)), key=lambda i: scores[i], reverse=True)
    return [snippets[i] for i in order[:top_k]]

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-embedding model used by semantic_similarity_verification below.
# It is instantiated once at module level so every call reuses the same object.
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_similarity_verification(target, wiki_summary, snippets):
    """
    Pick the text source whose embedding is closest to the target's.

    The Wikipedia summary and every snippet are embedded together with the
    target; the candidate with the highest cosine similarity to the target
    wins.

    Returns:
        A ``(best_source, best_text)`` tuple. ``best_source`` is "Wikipedia"
        when the summary wins, otherwise "Google Snippet N" where N is the
        1-based position of the winning snippet.
    """
    # Candidate 0 is always the Wikipedia summary; snippets follow in order.
    candidates = [wiki_summary] + snippets
    vectors = model.encode([target] + candidates)

    # vectors[0] belongs to the target, the rest line up with `candidates`.
    scores = cosine_similarity([vectors[0]], vectors[1:])[0]
    winner = scores.argmax()

    if winner == 0:
        label = "Wikipedia"
    else:
        label = f"Google Snippet {winner}"

    return label, candidates[winner]

def knowledge_alignment_pipeline(target):
    """
    Run every pipeline stage for one target and collect the outcome.

    Stages, in order: Wikipedia summary fetch, Google snippet fetch, TF-IDF
    ranking of the snippets, and embedding-based selection of the best source.

    Returns:
        A dict with the keys "Target", "Best Source", "Knowledge",
        "Wikipedia Summary" and "Google Snippets".
    """
    print(f"Processing: {target}")

    # Gather raw material from both sources (Wikipedia first, then Google).
    summary = fetch_wikipedia_knowledge(target)
    raw_snippets = fetch_google_results(target)

    # Narrow the snippets down, then choose the closest source overall.
    top_snippets = rank_and_filter_snippets(target, raw_snippets)
    source_label, knowledge = semantic_similarity_verification(
        target, summary, top_snippets
    )

    report = {}
    report["Target"] = target
    report["Best Source"] = source_label
    report["Knowledge"] = knowledge
    report["Wikipedia Summary"] = summary
    report["Google Snippets"] = top_snippets
    return report

# Demo entry point: run the pipeline for one sample target and print the
# resulting dict.
if __name__ == "__main__":
    target = "Climate Change"
    result = knowledge_alignment_pipeline(target)
    print(result)

import csv

def process_targets_from_csv(file_path):
    """
    Process multiple targets from a CSV file.

    The first row is treated as a header and skipped. For every following
    row, the first column is used as the target; rows that are blank or whose
    first column is empty are skipped.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list with one ``knowledge_alignment_pipeline`` result per target, in
        file order. Empty when the file has no data rows.

    Raises:
        OSError: If the file cannot be opened.
    """
    results = []
    # newline="" is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Default of None avoids StopIteration on a completely empty file.
        next(reader, None)  # Skip header
        for row in reader:
            # csv.reader yields [] for blank lines; skip those and empty cells.
            if not row or not row[0].strip():
                continue
            results.append(knowledge_alignment_pipeline(row[0].strip()))
    return results

Processing: Climate Change
{'Target': 'Climate Change', 'Best Source': 'Wikipedia', 'Knowledge': "Present-day climate change includes both global warming—the ongoing increase in global average temperature—and its wider effects on Earth's climate. Climate change in a broader sense also includes previous long-term changes to Earth's climate. The current rise in global temperatures is driven by human activities, especially fossil fuel burning since the Industrial Revolution. Fossil fuel use, deforestation, and some agricultural and industrial practices release greenhouse gases. These gases absorb some of the heat that the Earth radiates after it warms from sunlight, warming the lower atmosphere. Carbon dioxide, the primary greenhouse gas driving global warming, has grown by about 50% and is at levels not seen for millions of years.", 'Wikipedia Summary': "Present-day climate change includes both global warming—the ongoing increase in global average temperature—and its wider effects on Ear