# Information Retrieval System Using NLP
This notebook demonstrates the creation of a search engine, including text preprocessing, indexing, and document retrieval with ranking methods.

## Step 1: Data Collection
We collect Wikipedia articles using either a web crawler or a pre-existing dataset. Below is the code for collecting and saving the dataset.

In [4]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_wikipedia(url, max_articles=10):
    articles = {}
    visited_urls = set()
    to_visit = [url]

    while to_visit and len(articles) < max_articles:
        current_url = to_visit.pop(0)
        if current_url in visited_urls:
            continue

        response = requests.get(current_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract title and content
        title = soup.find('h1').text
        paragraphs = soup.find_all('p')
        content = " ".join([p.text for p in paragraphs])

        # Save the article
        articles[title] = content
        visited_urls.add(current_url)

        # Find additional links
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('/wiki/') and ':' not in href:
                full_url = f"https://en.wikipedia.org{href}"
                if full_url not in visited_urls:
                    to_visit.append(full_url)

    # Save articles to JSON
    with open('wikipedia_articles.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)

scrape_wikipedia("https://en.wikipedia.org/wiki/Natural_language_processing")



## Step 2: Text Preprocessing
Text preprocessing involves cleaning and tokenizing the text, removing stopwords, and applying lemmatization.

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import json

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Lowercasing and removing special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

def preprocess_articles(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    preprocessed_articles = {}
    for title, content in articles.items():
        preprocessed_articles[title] = preprocess_text(content)

    with open('preprocessed_articles.json', 'w', encoding='utf-8') as f:
        json.dump(preprocessed_articles, f, ensure_ascii=False, indent=2)

preprocess_articles('wikipedia_articles.json')


[nltk_data] Downloading package punkt to /home/teo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/teo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 3: Indexing
We create an inverted index for efficient term-document mapping.

In [9]:
import json

def build_inverted_index(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    inverted_index = {}
    for doc_id, tokens in articles.items():
        for token in tokens:
            if token not in inverted_index:
                inverted_index[token] = []
            inverted_index[token].append(doc_id)

    with open('inverted_index.json', 'w', encoding='utf-8') as f:
        json.dump(inverted_index, f, ensure_ascii=False, indent=2)

build_inverted_index('preprocessed_articles.json')


## Step 4: Query Processing
Users can input queries to retrieve documents using Boolean Retrieval, TF-IDF, or BM25.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from rank_bm25 import BM25Okapi
import json

def search_engine():
    with open('preprocessed_articles.json', 'r', encoding='utf-8') as f:
        articles = json.load(f)

    corpus = [" ".join(tokens) for tokens in articles.values()]
    titles = list(articles.keys())

    # TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # BM25 Model
    bm25 = BM25Okapi([tokens for tokens in articles.values()])

    print("Enter your query (or 'exit' to quit):")
    while True:
        query = input("Query: ").lower()
        if query == 'exit':
            break

        print("Select ranking method:")
        print("1. Boolean Search")
        print("2. TF-IDF Search")
        print("3. BM25 Search")
        method = int(input("Choice: "))

        if method == 1:
            query_tokens = query.split()
            results = [doc for doc in titles if all(token in articles[doc] for token in query_tokens)]
            print(f"Boolean Search Results: {results}")

        elif method == 2:
            query_vector = vectorizer.transform([query])
            scores = (tfidf_matrix @ query_vector.T).toarray().flatten()
            ranked_indices = np.argsort(-scores)
            print("TF-IDF Ranked Results:")
            for idx in ranked_indices[:5]:
                print(f"{titles[idx]} (Score: {scores[idx]:.4f})")

        elif method == 3:
            query_tokens = query.split()
            scores = bm25.get_scores(query_tokens)
            ranked_indices = np.argsort(-scores)
            print("BM25 Ranked Results:")
            for idx in ranked_indices[:5]:
                print(f"{titles[idx]} (Score: {scores[idx]:.4f})")

search_engine()


Enter your query (or 'exit' to quit):


KeyboardInterrupt: Interrupted by user

## Step 5: Evaluation
We evaluate the system using metrics like Precision, Recall, and NDCG.

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_system(true_relevant_docs, retrieved_docs):
    true_positive = len(set(true_relevant_docs) & set(retrieved_docs))
    false_positive = len(set(retrieved_docs) - set(true_relevant_docs))
    false_negative = len(set(true_relevant_docs) - set(retrieved_docs))

    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# Example evaluation
true_relevant_docs = ['Article 1', 'Article 2']
retrieved_docs = ['Article 2', 'Article 3']
evaluate_system(true_relevant_docs, retrieved_docs)


from sklearn.metrics import ndcg_score

# Example: Evaluating NDCG
true_relevance = [[1, 0, 0, 0, 1]]  # Binary relevance
predicted_scores = [[0.9, 0.7, 0.3, 0.2, 0.8]]  # Model scores
ndcg = ndcg_score(true_relevance, predicted_scores)
print(f"NDCG Score: {ndcg:.4f}")


Precision: 0.5000
Recall: 0.5000
F1-Score: 0.5000
NDCG Score: 1.0000
