In [29]:
# Install libraries if not already installed
#!pip install requests beautifulsoup4 nltk

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import json
import nltk
import re
import math
from collections import defaultdict

# Download NLTK stopwords and punkt tokenizer
#nltk.download("stopwords")
#nltk.download("punkt")


In [31]:
# Initialize stop words and stemmer
stop_words = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.PorterStemmer()


In [33]:
# Prompt the user for Wikipedia page titles
print("Enter Wikipedia page titles separated by commas (e.g., 'Artificial Intelligence, Machine Learning, Data Science'):")
page_titles_input = input().split(',')

# Clean up and prepare page titles list
page_titles = [title.strip() for title in page_titles_input]

# Function to Scrape Wikipedia Pages
def get_wikipedia_page_content(page_title):
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    paragraphs = soup.find_all("p")
    
    # Extract text content from paragraphs
    content = " ".join([para.text for para in paragraphs if para.text])
    return {"title": page_title, "content": content}

# Fetch documents for each title provided by the user
documents = [get_wikipedia_page_content(title) for title in page_titles]

# Save raw documents to JSON file
with open("wikipedia_data.json", "w") as f:
    json.dump(documents, f)

# Display fetched documents to verify content
documents[:1]  # Show first document as an example



Enter Wikipedia page titles separated by commas (e.g., 'Artificial Intelligence, Machine Learning, Data Science'):


 Greece , Europe , Albania


[{'title': 'Greece',
  'content': '\n –\xa0in Europe\xa0(light green &\xa0dark grey)–\xa0in the European Union\xa0(light green) Greece,[a] officially the Hellenic Republic,[b] is a country in Southeast Europe. Located on the southern tip of the Balkan peninsula, Greece shares land borders with Albania to the northwest, North Macedonia and Bulgaria to the north, and Turkey to the east. The Aegean Sea lies to the east of the mainland, the Ionian Sea to the west, and the Sea of Crete and the Mediterranean Sea to the south. Greece has the longest coastline on the Mediterranean Basin, featuring thousands of islands. The country comprises nine traditional geographic regions, and has a population of over 10.4 million. Athens is the nation\'s capital and largest city, followed by Thessaloniki and Patras.\n Greece is considered the cradle of Western civilization, being the birthplace of democracy, Western philosophy, Western literature, historiography, political science, major scientific and ma

In [9]:
# Preprocessing function
def preprocess_text(text):
    # Lowercase and remove special characters
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    
    # Tokenize, remove stop words, and apply stemming
    tokens = nltk.word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to each document and save to processed_data.json
processed_documents = [{"title": doc["title"], "content": preprocess_text(doc["content"])} for doc in documents]

with open("processed_data.json", "w") as f:
    json.dump(processed_documents, f)

# Display processed documents to verify
processed_documents[:1]  # Show first processed document as an example


[{'title': 'Somalia',
  'content': ['somaliaa',
   'offici',
   'feder',
   'republ',
   'somaliab',
   'easternmost',
   'countri',
   'continent',
   'africa',
   'countri',
   'locat',
   'horn',
   'africa',
   'border',
   'ethiopia',
   'west',
   'djibouti14',
   'northwest',
   'kenya',
   'southwest',
   'gulf',
   'aden',
   'north',
   'indian',
   'ocean',
   'east',
   'somalia',
   'longest',
   'coastlin',
   'africa',
   'mainland15',
   'somalia',
   'estim',
   'popul',
   '181',
   'million161718',
   '27',
   'million',
   'live',
   'capit',
   'largest',
   'citi',
   'mogadishu',
   'around',
   '85',
   'resid',
   'ethnic',
   'somali',
   'offici',
   'languag',
   'countri',
   'somali',
   'arab',
   'though',
   'former',
   'primari',
   'languag',
   'somalia',
   'histor',
   'religi',
   'tie',
   'arab',
   'world19',
   'peopl',
   'somalia',
   'muslims20',
   'major',
   'sunni21',
   'antiqu',
   'somalia',
   'import',
   'commerci',
   'center222

In [11]:
# Initialize an inverted index
inverted_index = defaultdict(list)

# Populate the inverted index
for doc_id, doc in enumerate(processed_documents):
    for term in doc["content"]:
        if doc_id not in inverted_index[term]:
            inverted_index[term].append(doc_id)

# Save the inverted index to a JSON file
with open("inverted_index.json", "w") as f:
    json.dump(inverted_index, f)

# Display inverted index for verification
dict(list(inverted_index.items())[:10])  # Show first 10 terms in the index as an example

{'somaliaa': [0],
 'offici': [0],
 'feder': [0],
 'republ': [0],
 'somaliab': [0],
 'easternmost': [0],
 'countri': [0],
 'continent': [0],
 'africa': [0],
 'locat': [0]}

In [13]:
# Query Parsing with Boolean Logic (AND, OR, NOT)
def parse_query(query):
    terms = query.lower().split()
    tokens = []
    operators = {"and", "or", "not"}
    
    for term in terms:
        if term in operators:
            tokens.append(term)
        else:
            processed_term = stemmer.stem(term) if term not in stop_words else ""
            if processed_term:
                tokens.append(processed_term)
    
    return tokens


In [15]:
# Boolean αναζήτηση στα έγγραφα με βάση το επεξεργασμένο ερώτημα
def boolean_search(query_tokens):
    # Χρησιμοποιούμε ένα σύνολο για τα αποτελέσματα
    result_set = set(range(len(processed_documents)))  # ξεκινάμε με όλα τα έγγραφα
    
    current_operation = "and"
    
    for token in query_tokens:
        if token in {"and", "or", "not"}:
            current_operation = token
        else:
            # Τα έγγραφα που περιέχουν το τρέχον token
            matching_docs = set(inverted_index.get(token, []))
            
            # Boolean λογική
            if current_operation == "and":
                result_set &= matching_docs
            elif current_operation == "or":
                result_set |= matching_docs
            elif current_operation == "not":
                result_set -= matching_docs
    
    return list(result_set)

# Παράδειγμα εκτέλεσης Boolean αναζήτησης
query = "Artificial AND Intelligence OR Data NOT Science"
query_tokens = parse_query(query)
boolean_results = boolean_search(query_tokens)
print("Boolean Search Results Document IDs:", boolean_results)


Boolean Search Results Document IDs: [0]


In [17]:
# Συνάρτηση για υπολογισμό TF-IDF βαθμολογίας
def compute_tf_idf(term, doc_id):
    term_frequency = processed_documents[doc_id]["content"].count(term) / len(processed_documents[doc_id]["content"])
    document_frequency = len(inverted_index.get(term, []))
    inverse_document_frequency = math.log(len(processed_documents) / (1 + document_frequency))
    return term_frequency * inverse_document_frequency

# Κατάταξη αποτελεσμάτων με TF-IDF
def rank_results_tf_idf(query_tokens, result_docs):
    doc_scores = {}
    
    for doc_id in result_docs:
        score = 0
        for term in query_tokens:
            if term not in {"and", "or", "not"}:
                score += compute_tf_idf(term, doc_id)
        doc_scores[doc_id] = score

    ranked_results = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_results

# Παράδειγμα εκτέλεσης TF-IDF κατάταξης
ranked_tf_idf_results = rank_results_tf_idf(query_tokens, boolean_results)
print("TF-IDF Ranked Results (Document ID and Score):", ranked_tf_idf_results)


TF-IDF Ranked Results (Document ID and Score): [(0, -0.0003074846093201487)]


In [19]:
# Σύνθεση τελικής συνάρτησης αναζήτησης με επιλογή αλγορίθμου
def search_documents(query, algorithm="tf-idf"):
    query_tokens = parse_query(query)
    result_docs = boolean_search(query_tokens)
    
    if algorithm == "boolean":
        return result_docs
    elif algorithm == "tf-idf":
        return rank_results_tf_idf(query_tokens, result_docs)
    else:
        print("Unsupported algorithm selected.")
        return []

# Παράδειγμα αναζήτησης με επιλογή αλγορίθμου
query = "Artificial AND Intelligence OR Data NOT Science"
algorithm = "tf-idf"  # Επιλογές: "boolean" ή "tf-idf"
results = search_documents(query, algorithm)

print("Search Results using algorithm:", algorithm)
for doc_id, score in results:
    print(f"Document: {documents[doc_id]['title']} - Score: {score:.4f}")


Search Results using algorithm: tf-idf
Document: Somalia - Score: -0.0003


In [27]:
# Τελική εκτέλεση με είσοδο χρήστη για ερώτημα και αλγόριθμο
user_query = input("Enter your search query: ")
user_algorithm = input("Choose retrieval algorithm ('boolean' or 'tf-idf'): ").strip().lower()

# Εκτέλεση αναζήτησης με βάση το ερώτημα και τον επιλεγμένο αλγόριθμο
search_results = search_documents(user_query, user_algorithm)

# Εμφάνιση αποτελεσμάτων
if user_algorithm == "boolean":
    print("Boolean Search Results (Document IDs):", search_results)
else:
    print("Ranked Search Results (Document ID and Score):")
    for doc_id, score in search_results:
        print(f"Document: {documents[doc_id]['title']} - Score: {score:.4f}")


Enter your search query:  Italy and War
Choose retrieval algorithm ('boolean' or 'tf-idf'):  boolean


Boolean Search Results (Document IDs): [0]
