<a href="https://colab.research.google.com/github/OmriAbdalla/computingCloudCourse/blob/main/cloud_tut6_firstWorkWithSearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries: 'requests' for HTTP requests, 'beautifulsoup4' for parsing HTML
!pip install requests beautifulsoup4

# Import the necessary modules
import requests
from bs4 import BeautifulSoup

# Function to fetch and parse the content of a web page
def fetch_page(url, timeout=10):
    """
    Fetch a web page and parse it into a BeautifulSoup object.

    Parameters:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup: The parsed HTML document, or None when the request
        fails or the server answers with a status code other than 200.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts and malformed URLs are reported the same
        # way as a non-200 answer, so callers only have to check for None.
        return None

    if response.status_code != 200:
        return None

    # Parse the HTML content of the page using BeautifulSoup
    return BeautifulSoup(response.text, 'html.parser')




In [None]:
import re

# Function to index and count all words in the text extracted from a BeautifulSoup object
def index_words(soup):
    """
    Count how often each word occurs in the text of a parsed page.

    Parameters:
        soup: An object exposing get_text() (e.g. a BeautifulSoup document).

    Returns:
        dict: Lowercased word -> number of occurrences, in first-seen order.
    """
    frequencies = {}

    # r'\w+' picks up runs of letters, digits and underscores; each token is
    # lowercased individually so differently-cased spellings share one entry.
    for token in map(str.lower, re.findall(r'\w+', soup.get_text())):
        frequencies[token] = frequencies.get(token, 0) + 1

    return frequencies


In [None]:
def remove_stop_words(index):
    """
    Drop a fixed set of common English stop words from a frequency index.

    The dictionary is modified in place and the very same object is returned.

    Parameters:
        index (dict): Maps words (str) to their frequency counts (int).

    Returns:
        dict: The input dictionary, without the stop-word keys.
    """
    for term in ('a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'):
        # pop with a default is a no-op when the word is not indexed
        index.pop(term, None)
    return index


In [None]:
from nltk.stem import PorterStemmer

def apply_stemming(index):
    """
    Collapse the words of a frequency index onto their Porter stems.

    Parameters:
        index (dict): Maps words (str) to their frequency counts (int).

    Returns:
        dict: A new dictionary keyed by stem; words that share a stem have
              their counts added together. The input is left untouched.
    """
    porter = PorterStemmer()
    merged = {}
    for term, frequency in index.items():
        stem = porter.stem(term)
        merged[stem] = merged.get(stem, 0) + frequency
    return merged


In [None]:
def search(query, index):
    """
    Look up the words of a query in a word-frequency index.

    Query words are lowercased but not otherwise normalized, so a word is
    only found when the index holds exactly that lowercased form as a key.

    Parameters:
    - query (str): The search query string containing one or more words.
    - index (dict): A dictionary where keys are words and values are their frequencies.

    Returns:
    - dict: The query words present in the index, mapped to their frequencies.
    """
    return {
        term: index[term]
        for term in re.findall(r'\w+', query.lower())
        if term in index
    }

In [None]:
def search_engine(url, query):
    """
    Fetch one web page, index its words, and look a query up in that index.

    Pipeline:
    1. Download and parse the page at `url`.
    2. Count the words of the page.
    3. Remove common stop words.
    4. Reduce the remaining words to their stems.
    5. Search the resulting index for the query terms.

    Parameters:
    - url (str): The URL of the web page to fetch and analyze.
    - query (str): The search query string.

    Returns:
    - dict: The query terms found in the page with their frequencies,
            or None if the page could not be fetched.
    """
    page = fetch_page(url)
    if page is None:
        return None

    # Each stage consumes the dictionary produced by the previous one.
    word_index = apply_stemming(remove_stop_words(index_words(page)))
    return search(query, word_index)

In [None]:
# URL of the web page to fetch and index
url = 'https://en.wikipedia.org/wiki/Bird'

# Search query; its words are looked up in the index built from the page
query = 'birds wings'

# Run the single-page search engine on the given URL and query
results = search_engine(url, query)

# Print the matching words and their frequencies
# (search_engine returns None when the page could not be fetched)
print(results)


{}


In [None]:
# Same page and query as the previous cell, run again through search_engine.
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'birds wings'
results = search_engine(url, query)
# Prints the dictionary of matched words, or None if the fetch failed.
print(results)

{}


In [None]:
def search(query, index):
    """
    Look up the stems of the query words in a stemmed frequency index.

    Parameters:
    query (str): The search query string entered by the user.
    index (dict): A dictionary mapping stemmed words to their frequency counts.

    Returns:
    dict: The stemmed query words that occur in the index, mapped to
          their corresponding counts.
    """
    porter = PorterStemmer()
    # Stem every lowercased query token so it is comparable with the index keys.
    stems = (porter.stem(token) for token in re.findall(r'\w+', query.lower()))
    return {stem: index[stem] for stem in stems if stem in index}


In [None]:
# Re-run the same page and query; search_engine now picks up the most
# recently defined search() function.
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'birds wings'
results = search_engine(url, query)
# Prints the dictionary of matched words, or None if the fetch failed.
print(results)

{'bird': 574, 'wing': 25}


In [None]:
# Score the query against the page: 1 minus the product of the reciprocals
# of the matched word counts. With no matched words the score is 0.
rank = 1  # Neutral element for the product below

# search_engine returns None when the page could not be fetched; `results or {}`
# treats that case as "no matches" instead of failing on None.items().
for word, count in (results or {}).items():
    rank = rank * (1 / count)  # Multiply rank by the reciprocal of the count

rank = 1 - rank  # Subtract the product from 1 to get the final rank value


In [None]:
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'birds wings'
results = search_engine(url, query)
print(results)

# Score = 1 - product of (1 / count) over the matched words; 0 when nothing matched.
# search_engine returns None when the page could not be fetched; `results or {}`
# treats that case as "no matches" instead of failing on None.items().
rank = 1
for word, count in (results or {}).items():
    rank = rank * 1 / count
rank = 1 - rank
print(rank)

In [None]:
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'collage students'
results = search_engine(url, query)
print(results)

# Score = 1 - product of (1 / count) over the matched words; 0 when nothing matched.
# search_engine returns None when the page could not be fetched; `results or {}`
# treats that case as "no matches" instead of failing on None.items().
rank = 1
for word, count in (results or {}).items():
    rank = rank * 1 / count
rank = 1 - rank
print(rank)


In [None]:
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'owls'
results = search_engine(url, query)
print(results)

# Score = 1 - product of (1 / count) over the matched words; 0 when nothing matched.
# search_engine returns None when the page could not be fetched; `results or {}`
# treats that case as "no matches" instead of failing on None.items().
rank = 1
for word, count in (results or {}).items():
    rank = rank * 1 / count
rank = 1 - rank
print(rank)

In [None]:
url = 'https://en.wikipedia.org/wiki/Bird'
query = 'Industry'
results = search_engine(url, query)
print(results)

# Score = 1 - product of (1 / count) over the matched words; 0 when nothing matched.
# search_engine returns None when the page could not be fetched; `results or {}`
# treats that case as "no matches" instead of failing on None.items().
rank = 1
for word, count in (results or {}).items():
    rank = rank * 1 / count
rank = 1 - rank
print(rank)



```
# This is formatted as code
```

### **מנוע המיועד למספר דפים (A search engine for multiple pages)**

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
class WikiSearchEngine:
    """Small in-memory search engine over a handful of Wikipedia pages.

    Pages are downloaded through the MediaWiki API, indexed by word
    frequency, and ranked by how many query words they contain.
    """

    def __init__(self):
        """Initialize the search engine with an empty page list and index."""
        self.base_url = "https://en.wikipedia.org/w/api.php"
        self.pages = []
        self.word_locations = defaultdict(list)  # word -> [(page_id, frequency), ...]
        self.stop_words = {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at', 'to', 'for', 'of', 'with'}
        # NOTE: __init__ must not return a value; returning anything other
        # than None makes instantiation raise TypeError.

    def fetch_wiki_pages(self, topic, num_pages=5, timeout=10):
        """Fetch Wikipedia pages for the given topic and store them in self.pages.

        Parameters:
        - topic (str): Search phrase passed to the Wikipedia search API.
        - num_pages (int): Maximum number of search hits to download.
        - timeout (float): Seconds to wait for each HTTP request.

        Returns:
        - bool: True when every request succeeded, False when any step failed
                (pages retrieved before the failure stay in self.pages).
        """
        search_params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": topic,
            "srlimit": num_pages,
        }
        # Pages fetched by an earlier call are skipped so that the index does
        # not count the same page twice.
        known_ids = {page['id'] for page in self.pages}
        try:
            response = requests.get(self.base_url, params=search_params, timeout=timeout)
            response.raise_for_status()
            search_results = response.json()['query']['search']

            for result in search_results:
                page_id = result['pageid']
                if page_id in known_ids:
                    continue
                content_params = {
                    "action": "query",
                    "format": "json",
                    "prop": "extracts|info",
                    "pageids": page_id,
                    "inprop": "url",
                    "explaintext": True,
                }
                content_response = requests.get(self.base_url, params=content_params, timeout=timeout)
                content_response.raise_for_status()
                page_data = content_response.json()['query']['pages'][str(page_id)]
                self.pages.append({
                    'id': page_id,
                    'title': page_data['title'],
                    'url': page_data.get('fullurl', f"https://en.wikipedia.org/?curid={page_id}"),
                    # A page without an extract is stored with empty content
                    'content': page_data.get('extract', ''),
                })
                known_ids.add(page_id)
                # Report every page as it is stored, not only the last one
                print(f"Retrieved: {page_data['title']}")
            return True

        except Exception as e:
            # Best-effort boundary: report the problem and signal failure
            print(f"Error fetching pages: {str(e)}")
            return False

    def build_index(self):
        """Rebuild self.word_locations from the pages currently stored.

        For every page, each non-stop word is mapped to a
        (page_id, frequency) entry.
        """
        self.word_locations.clear()

        for page in self.pages:
            # Tokenize the lowercased content into words
            words = re.findall(r'\w+', page['content'].lower())

            # Count word frequencies, ignoring stop words
            word_counts = defaultdict(int)
            for word in words:
                if word not in self.stop_words:
                    word_counts[word] += 1

            # Add to index with page information
            for word, count in word_counts.items():
                self.word_locations[word].append((page['id'], count))

    def get_context(self, content, query_words, width=50):
        """Return a short snippet of `content` around the first query word found.

        Parameters:
        - content (str): Full text of a page.
        - query_words (list[str]): Lowercased words to look for.
        - width (int): Number of characters kept on each side of the match.

        Returns:
        - str: The snippet with whitespace collapsed and '...' marking
               truncated ends; the start of the content when nothing matches.
        """
        match = None
        if query_words:
            pattern = r'\b(?:' + '|'.join(re.escape(word) for word in query_words) + r')\b'
            match = re.search(pattern, content, flags=re.IGNORECASE)

        if match is None:
            start, end = 0, min(len(content), 2 * width)
        else:
            start = max(0, match.start() - width)
            end = min(len(content), match.end() + width)

        snippet = ' '.join(content[start:end].split())
        prefix = '...' if start > 0 else ''
        suffix = '...' if end < len(content) else ''
        return f"{prefix}{snippet}{suffix}"

    def search(self, query, num_results=5):
        """Search the indexed pages using simple word-frequency ranking.

        Pages are ranked by:
        1. Number of distinct query words found in the page
        2. Total frequency of those query words

        Parameters:
        - query (str): Free-text query.
        - num_results (int): Maximum number of results to return.

        Returns:
        - list[dict]: One entry per page with the keys 'title', 'url',
                      'matching_words', 'total_frequency' and 'context';
                      empty when the query has no usable words or no page matches.
        """
        # dict.fromkeys drops repeated query words (keeping first-seen order)
        # so that a repeated word is not counted as several matches.
        query_words = list(dict.fromkeys(
            word.lower() for word in re.findall(r'\w+', query)
            if word.lower() not in self.stop_words
        ))
        if not query_words:
            return []

        # Accumulate per-page scores
        page_scores = defaultdict(lambda: {'matches': 0, 'total_freq': 0})
        for word in query_words:
            for page_id, freq in self.word_locations.get(word, []):
                page_scores[page_id]['matches'] += 1
                page_scores[page_id]['total_freq'] += freq

        # Sort by number of matching words first, then by total frequency
        ranked_results = sorted(
            ((page_id, scores['matches'], scores['total_freq'])
             for page_id, scores in page_scores.items()),
            key=lambda item: (item[1], item[2]),
            reverse=True,
        )

        # One lookup table instead of scanning self.pages for every result;
        # setdefault keeps the first page stored under a given id.
        pages_by_id = {}
        for page in self.pages:
            pages_by_id.setdefault(page['id'], page)

        results = []
        for page_id, matches, total_freq in ranked_results[:num_results]:
            page = pages_by_id[page_id]
            results.append({
                'title': page['title'],
                'url': page['url'],
                'matching_words': matches,
                'total_frequency': total_freq,
                'context': self.get_context(page['content'], query_words),
            })
        return results


In [None]:
def fetch_wiki_pages(self, topic, num_pages=5, timeout=10):
    """Fetch Wikipedia pages for the given topic and store them in self.pages.

    Parameters:
    - topic (str): Search phrase passed to the Wikipedia search API.
    - num_pages (int): Maximum number of search hits to download.
    - timeout (float): Seconds to wait for each HTTP request.

    Returns:
    - bool: True when every request succeeded, False when any step failed
            (pages retrieved before the failure stay in self.pages).
    """
    search_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": topic,
        "srlimit": num_pages,
    }
    # Pages fetched by an earlier call are skipped so they are not stored twice.
    known_ids = {page['id'] for page in self.pages}
    try:
        response = requests.get(self.base_url, params=search_params, timeout=timeout)
        response.raise_for_status()
        search_results = response.json()['query']['search']

        for result in search_results:
            page_id = result['pageid']
            if page_id in known_ids:
                continue
            content_params = {
                "action": "query",
                "format": "json",
                "prop": "extracts|info",
                "pageids": page_id,
                "inprop": "url",
                "explaintext": True,
            }
            content_response = requests.get(self.base_url, params=content_params, timeout=timeout)
            content_response.raise_for_status()
            page_data = content_response.json()['query']['pages'][str(page_id)]
            self.pages.append({
                'id': page_id,
                'title': page_data['title'],
                'url': page_data.get('fullurl', f"https://en.wikipedia.org/?curid={page_id}"),
                # A page without an extract is stored with empty content
                'content': page_data.get('extract', ''),
            })
            known_ids.add(page_id)
            print(f"Retrieved: {page_data['title']}")
        return True

    except Exception as e:
        # Best-effort boundary: report the problem and signal failure explicitly
        print(f"Error fetching pages: {str(e)}")
        return False


In [None]:
    def build_index(self):
        """Rebuild the word -> [(page_id, frequency), ...] index from self.pages.

        Any previous index content is discarded. Words listed in
        self.stop_words are not indexed.
        """
        self.word_locations.clear()

        for page in self.pages:
            # Tally every non-stop word of the lowercased page content
            frequencies = defaultdict(int)
            for token in re.findall(r'\w+', page['content'].lower()):
                if token in self.stop_words:
                    continue
                frequencies[token] += 1

            # Record, for each word, on which page it occurs and how often
            for token, frequency in frequencies.items():
                self.word_locations[token].append((page['id'], frequency))

    def search(self, query, num_results=5):
        """Search the indexed pages using simple word-frequency ranking.

        Pages are ranked by:
        1. Number of distinct query words found in the page
        2. Total frequency of those query words

        Parameters:
        - query (str): Free-text query.
        - num_results (int): Maximum number of results to return.

        Returns:
        - list[dict]: One entry per page with the keys 'title', 'url',
                      'matching_words', 'total_frequency' and 'context';
                      empty when the query has no usable words or no page matches.
        """
        # dict.fromkeys drops repeated query words (keeping first-seen order)
        # so that a repeated word is not counted as several matches.
        query_words = list(dict.fromkeys(
            word.lower() for word in re.findall(r'\w+', query)
            if word.lower() not in self.stop_words
        ))
        if not query_words:
            return []

        # Accumulate per-page scores
        page_scores = defaultdict(lambda: {'matches': 0, 'total_freq': 0})
        for word in query_words:
            for page_id, freq in self.word_locations.get(word, []):
                page_scores[page_id]['matches'] += 1
                page_scores[page_id]['total_freq'] += freq

        # Sort by number of matching words first, then by total frequency
        ranked_results = sorted(
            ((page_id, scores['matches'], scores['total_freq'])
             for page_id, scores in page_scores.items()),
            key=lambda item: (item[1], item[2]),
            reverse=True,
        )

        # One lookup table instead of scanning self.pages for every result;
        # setdefault keeps the first page stored under a given id.
        pages_by_id = {}
        for page in self.pages:
            pages_by_id.setdefault(page['id'], page)

        results = []
        for page_id, matches, total_freq in ranked_results[:num_results]:
            page = pages_by_id[page_id]
            # get_context is expected to be provided by the enclosing class
            context = self.get_context(page['content'], query_words)
            results.append({
                'title': page['title'],
                'url': page['url'],
                'matching_words': matches,
                'total_frequency': total_freq,
                'context': context,
            })
        return results