# ***Data Scraping***

In [2]:
!pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
    """Download ``url`` and parse the body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole notebook.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend when
        the server answers with HTTP 200, otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on connection errors
            or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Any status other than 200 is reported to the caller as "no page".
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')





# ***Data Indexing***

In [3]:
import re
def index_words(soup):
  """Count how often each lower-cased word occurs in the page text.

  Args:
    soup: Object exposing ``get_text()`` that returns the page text.

  Returns:
    A dict mapping each lower-cased ``\\w+`` token to its occurrence count.
  """
  counts = {}
  for token in re.findall(r'\w+', soup.get_text()):
    key = token.lower()
    # dict.get supplies 0 the first time a word is seen.
    counts[key] = counts.get(key, 0) + 1
  return counts


# ***Text Processing***

In [4]:
def remove_stop_words(index):
  """Return a copy of ``index`` without common English stop words.

  The input dict is left untouched: a new dict is built instead of deleting
  keys in place, so re-running a cell (or reusing the raw index elsewhere)
  never sees a silently modified index.

  Args:
    index: Dict mapping words to their counts.

  Returns:
    A new dict with the same word -> count pairs, minus the stop words.
  """
  stop_words = {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at'}
  return {word: count for word, count in index.items() if word not in stop_words}


In [5]:
from nltk.stem import PorterStemmer

def apply_stemming(index):
  """Collapse words that share a Porter stem into a single index entry.

  Args:
    index: Dict mapping words to their counts.

  Returns:
    A new dict mapping each stem to the summed counts of all words that
    reduce to it.
  """
  porter = PorterStemmer()
  merged = {}
  for token, freq in index.items():
    root = porter.stem(token)
    # Accumulate so that several words with the same stem add up.
    merged[root] = merged.get(root, 0) + freq
  return merged


# ***Search Query***

In [None]:
def search(query, index):
  """Look up each word of ``query`` in ``index`` without stemming.

  The tokenised query is printed before the lookup.

  Args:
    query: Free-text query string.
    index: Dict mapping words to their counts.

  Returns:
    A dict containing only the query words present in ``index``, each mapped
    to its count.
  """
  query_words = re.findall(r'\w+', query.lower())
  print(f"Query: {query_words}")
  return {term: index[term] for term in query_words if term in index}


In [None]:
def search2(query, index):
  """Look up the Porter stem of each query word in ``index``.

  Args:
    query: Free-text query string.
    index: Dict mapping stems to their counts.

  Returns:
    A dict containing the stems of the query words that are present in
    ``index``, each mapped to its count.
  """
  porter = PorterStemmer()
  # Lower-case and tokenise first, then reduce every token to its stem.
  stems = (porter.stem(token) for token in re.findall(r'\w+', query.lower()))
  return {stem: index[stem] for stem in stems if stem in index}


# ***Full search engine***

In [None]:
def search_engine(url, query):
  """Fetch a page, build its stemmed word index and run ``search`` on it.

  NOTE(review): the index is stemmed by ``apply_stemming`` but ``search``
  looks up the raw query words, so a query word whose stem differs from the
  word itself will not match. ``search_engine2`` stems the query as well.

  Args:
    url: Address of the page to index.
    query: Free-text query string.

  Returns:
    The dict produced by ``search``, or ``None`` when ``fetch_page`` returned
    ``None``.
  """
  page = fetch_page(url)
  if page is None:
    return None
  # Pipeline: raw counts -> stop words removed -> stems merged.
  word_counts = index_words(page)
  stemmed = apply_stemming(remove_stop_words(word_counts))
  return search(query, stemmed)


In [None]:
def search_engine2(url, query):
  """Fetch a page, build its stemmed word index and run ``search2`` on it.

  Both the index and the query go through stemming, so inflected query words
  can match.

  Args:
    url: Address of the page to index.
    query: Free-text query string.

  Returns:
    The dict produced by ``search2``, or ``None`` when ``fetch_page``
    returned ``None``.
  """
  page = fetch_page(url)
  if page is None:
    return None
  # Pipeline: raw counts -> stop words removed -> stems merged.
  word_counts = index_words(page)
  stemmed = apply_stemming(remove_stop_words(word_counts))
  return search2(query, stemmed)


# ***Braude***

In [None]:
url = 'https://w3.braude.ac.il/?lang=en'
query = 'Industry'
results = search_engine2(url, query)
print(results)
# search_engine2 returns None when the page could not be fetched; iterating
# over None would raise AttributeError, so the rank is only computed for a
# real result dict.
if results is None:
  print("Page could not be fetched; no rank computed.")
else:
  # rank = 1 - product(1 / count): approaches 1 as the matched words become
  # more frequent, and is 0 when no query word matched.
  rank = 1
  for count in results.values():
    rank = rank / count
  rank = 1 - rank
  print(rank)

{'industri': 8}
0.875


In [None]:
url = 'https://w3.braude.ac.il/?lang=en'
query = 'Braude college'
results = search_engine2(url, query)
print(results)
# search_engine2 returns None when the page could not be fetched; iterating
# over None would raise AttributeError, so the rank is only computed for a
# real result dict.
if results is None:
  print("Page could not be fetched; no rank computed.")
else:
  # rank = 1 - product(1 / count): approaches 1 as the matched words become
  # more frequent, and is 0 when no query word matched.
  rank = 1
  for count in results.values():
    rank = rank / count
  rank = 1 - rank
  print(rank)

{'braud': 13, 'colleg': 8}
0.9903846153846154


In [None]:
url = 'https://w3.braude.ac.il/?lang=en'
query = 'Galilee center'
results = search_engine2(url, query)
print(results)
# search_engine2 returns None when the page could not be fetched; iterating
# over None would raise AttributeError, so the rank is only computed for a
# real result dict.
if results is None:
  print("Page could not be fetched; no rank computed.")
else:
  # rank = 1 - product(1 / count): approaches 1 as the matched words become
  # more frequent, and is 0 when no query word matched.
  rank = 1
  for count in results.values():
    rank = rank / count
  rank = 1 - rank
  print(rank)

{'galile': 15, 'center': 4}
0.9833333333333333


# ***Search engine for more than one page***

In [20]:
import requests
from collections import defaultdict

class WikiSearchEngine:
    """Tiny search engine over Wikipedia pages.

    Workflow: ``fetch_wiki_pages`` downloads plain-text extracts,
    ``build_index`` counts words per page, and ``search`` ranks the stored
    pages by how many query words they contain and how often.
    """

    def __init__(self, timeout=10):
        """Initialize the search engine.

        Args:
            timeout: Seconds to wait for each Wikipedia API request.
        """
        self.base_url = "https://en.wikipedia.org/w/api.php"
        self.timeout = timeout
        self.pages = []
        self.word_locations = defaultdict(list)  # word -> [(page_id, frequency), ...]
        self.stop_words = {'a', 'an', 'the', 'and', 'or', 'in', 'on', 'at', 'to', 'for', 'of', 'with'}

    def fetch_wiki_pages(self, topic, num_pages=5):
        """Fetch Wikipedia pages for the given topic and store them in ``self.pages``.

        Pages whose id is already stored are skipped, so calling this method
        repeatedly does not create duplicates (which would be counted twice
        by ``build_index``).

        Args:
            topic: Search term sent to the Wikipedia search API.
            num_pages: Maximum number of search results to retrieve.

        Returns:
            ``self.pages`` (all pages stored so far) on success, or ``None``
            if any request or response lookup failed. Pages appended before
            the failure remain stored.
        """
        # Parameters of the search request sent to Wikipedia.
        search_params = {
            "action": "query",      # Perform a query action.
            "format": "json",       # The response will be in JSON format.
            "list": "search",       # Run a full-text search.
            "srsearch": topic,      # The search term.
            "srlimit": num_pages    # Limit to the first num_pages results.
        }

        try:
            # Send the search request; fail fast on HTTP errors or a stalled server.
            response = requests.get(self.base_url, params=search_params, timeout=self.timeout)
            response.raise_for_status()
            # Each search result carries basic information such as pageid and title.
            search_results = response.json()['query']['search']

            known_ids = {page['id'] for page in self.pages}

            # For each search result, fetch the content of the page.
            for result in search_results:
                page_id = result['pageid']
                if page_id in known_ids:
                    continue

                content_params = {
                    "action": "query",
                    "format": "json",
                    "prop": "extracts|info",  # Plain-text extract plus page metadata.
                    "pageids": page_id,       # The page to retrieve.
                    "inprop": "url",          # Include the page URL in the metadata.
                    "explaintext": True       # Get a plain-text extract.
                }
                content_response = requests.get(self.base_url, params=content_params, timeout=self.timeout)
                content_response.raise_for_status()
                page_data = content_response.json()['query']['pages'][str(page_id)]

                # Store the page; a page without an extract is kept with empty
                # content instead of aborting the whole fetch.
                self.pages.append({
                    'id': page_id,
                    'title': page_data['title'],
                    'url': page_data.get('fullurl', f"https://en.wikipedia.org/?curid={page_id}"),
                    'content': page_data.get('extract', '')
                })
                known_ids.add(page_id)
                print(f"Retrieved: {page_data['title']}")

            # Return the stored pages after all are retrieved.
            return self.pages

        except Exception as e:
            # Deliberate best effort: report the problem and signal failure with None.
            print(f"Error fetching pages: {str(e)}")
            return None

    def build_index(self):
        """Rebuild ``self.word_locations`` from the pages in ``self.pages``.

        Each non-stop word maps to a list of ``(page_id, count)`` tuples, one
        per page that contains the word.
        """
        self.word_locations.clear()  # Clear any existing index.

        for page in self.pages:
            # Extract lower-cased words from the page content.
            words = re.findall(r'\w+', page['content'].lower())

            # Count word frequencies (excluding stop words); missing keys start at 0.
            word_counts = defaultdict(int)
            for word in words:
                if word not in self.stop_words:
                    word_counts[word] += 1

            # Record the per-page count under every word.
            for word, count in word_counts.items():
                self.word_locations[word].append((page['id'], count))

    def display_index(self):
        """Print every indexed word with its per-page counts (for testing)."""
        for word, locations in self.word_locations.items():
            print(f"Word: {word}")
            for page_id, count in locations:
                print(f"  Page ID: {page_id}, Count: {count}")

    def get_context(self, content, query_words, window=50):
        """Return a short snippet of ``content`` around the first query-word hit.

        Args:
            content: Full text of a page.
            query_words: Lower-cased words to look for (matched as whole
                words, case-insensitively).
            window: Number of characters kept on each side of the match.

        Returns:
            The text surrounding the first match with runs of whitespace
            collapsed to single spaces, prefixed/suffixed with ``...`` where
            the page text was cut. An empty string if nothing matches.
        """
        if not content or not query_words:
            return ""
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in query_words) + r')\b'
        match = re.search(pattern, content, flags=re.IGNORECASE)
        if match is None:
            return ""
        start = max(match.start() - window, 0)
        end = min(match.end() + window, len(content))
        snippet = ' '.join(content[start:end].split())
        prefix = "..." if start > 0 else ""
        suffix = "..." if end < len(content) else ""
        return f"{prefix}{snippet}{suffix}"

    def search(self, query, num_results=5):
        """Search the indexed pages using simple word-frequency ranking.

        Pages are ranked by:
        1. Number of distinct query words found in the page.
        2. Total frequency of those query words.

        Args:
            query: Free-text query string.
            num_results: Maximum number of results to return.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``matching_words``, ``total_frequency`` and ``context``, best
            match first. Empty when the query has no usable words or nothing
            matches.
        """
        # Tokenize exactly like build_index, drop stop words, and de-duplicate
        # (order preserved) so a repeated query word is not counted twice.
        query_words = list(dict.fromkeys(
            word for word in re.findall(r'\w+', query.lower())
            if word not in self.stop_words
        ))
        if not query_words:
            return []

        # Accumulate scores for each page containing at least one query word.
        page_scores = defaultdict(lambda: {'matches': 0, 'total_freq': 0})
        for word in query_words:
            for page_id, freq in self.word_locations.get(word, []):
                page_scores[page_id]['matches'] += 1
                page_scores[page_id]['total_freq'] += freq

        # Sort by number of matching words first, then by total frequency.
        ranked_results = [
            (page_id, scores['matches'], scores['total_freq'])
            for page_id, scores in page_scores.items()
        ]
        ranked_results.sort(key=lambda x: (x[1], x[2]), reverse=True)

        # One lookup table instead of scanning self.pages for every result;
        # setdefault keeps the first page stored for an id.
        pages_by_id = {}
        for page in self.pages:
            pages_by_id.setdefault(page['id'], page)

        results = []
        for page_id, matches, total_freq in ranked_results[:num_results]:
            page = pages_by_id.get(page_id)
            if page is None:
                # The index refers to a page that is no longer stored (stale index).
                continue
            results.append({
                'title': page['title'],
                'url': page['url'],
                'matching_words': matches,
                'total_frequency': total_freq,
                'context': self.get_context(page['content'], query_words)
            })

        return results



In [8]:
# Example usage: fetch a few Wikipedia pages about "Bird" and preview them.
engine = WikiSearchEngine()
pages = engine.fetch_wiki_pages("Bird", num_pages=3)

# fetch_wiki_pages returns None on failure; `or []` then skips the loop entirely.
for page in (pages or []):
    print(f"Title: {page['title']}")
    print(f"URL: {page['url']}")
    print(f"Content: {page['content'][:300]}...")  # Only the first 300 characters
    print("\n")


Retrieved: Bird
Retrieved: Bird (disambiguation)
Retrieved: Bird & Bird
Title: Bird
URL: https://en.wikipedia.org/wiki/Bird
Content: Birds are a group of warm-blooded vertebrates constituting the class Aves (Latin: [ˈaveːs]), characterised by feathers, toothless beaked jaws, the laying of hard-shelled eggs, a high metabolic rate, a four-chambered heart, and a strong yet lightweight skeleton. Birds live worldwide and range in size...


Title: Bird (disambiguation)
URL: https://en.wikipedia.org/wiki/Bird_(disambiguation)
Content: A bird is a feathered, winged, bipedal, warm-blooded, egg-laying, vertebrate.
Bird, BIRD, or the bird may also refer to:


== Arts and entertainment ==


=== Fictional characters ===
Tracy "Bird" Van Adams, in the Soul Food film and TV series
Bird, of the Barksdale Organization in TV series The Wire...


Title: Bird & Bird
URL: https://en.wikipedia.org/wiki/Bird_%26_Bird
Content: Bird & Bird is an international law firm that was founded in London in 1846. The fi