In [1]:
# Install required packages
!pip install pandas numpy nltk

import pandas as pd
import numpy as np
import re
import math
import zipfile
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# ========== USER INPUT ==========
upload_path = '/content/NLP.zip'
# =================================

# Step 1: Unzip the file
extract_path = '/content/extracted_books'

try:
    with zipfile.ZipFile(upload_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Successfully extracted to {extract_path}")
except FileNotFoundError:
    print(f"Error: File not found at {upload_path}")
    exit()
except Exception as e:
    print(f"Extraction error: {str(e)}")
    exit()

Successfully extracted to /content/extracted_books


In [3]:
# Step 2: Find and read the CSV
csv_file = None
for root, _, files in os.walk(extract_path):
    for file in files:
        if file.endswith('.csv'):
            csv_file = os.path.join(root, file)
            break
    if csv_file: break

if not csv_file:
    print("CSV file not found in the archive. Available files:")
    for root, _, files in os.walk(extract_path):
        for file in files:
            print(f"- {file}")
    exit()

# Load dataset with error handling
try:
    df = pd.read_csv(csv_file)
    print("\nDataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print("Columns:", df.columns.tolist())
except Exception as e:
    print(f"Error loading CSV: {str(e)}")
    exit()


Dataset loaded successfully!
Shape: (103082, 7)
Columns: ['Title', 'Authors', 'Description', 'Category', 'Publisher', 'Publish Date', 'Price']


In [4]:
# Step 3: Configure book-specific columns
text_columns = ['Title', 'Description', 'Authors', 'Publisher']  # Common book dataset columns
available_cols = [col for col in text_columns if col in df.columns]

if len(available_cols) < 2:
    print("Required text columns not found. Available columns:")
    print(df.columns.tolist())
    exit()

title_col = available_cols[0]
desc_col = available_cols[1]
print(f"\nUsing columns: '{title_col}' (title) and '{desc_col}' (description)")


Using columns: 'Title' (title) and 'Description' (description)


In [5]:
# Step 4: Enhanced preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess(text):
    if pd.isna(text):
        return []
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

In [6]:
# Step 5: Prepare documents
documents = df[[title_col, desc_col]].dropna()
documents['text'] = documents[title_col] + ' ' + documents[desc_col]
documents['tokens'] = documents['text'].apply(preprocess)

print(f"\nProcessing {len(documents)} valid book entries")


Processing 70213 valid book entries


In [7]:
# Step 6: Build search index
inverted_index = defaultdict(set)
for doc_id, tokens in enumerate(documents['tokens']):
    for token in set(tokens):
        inverted_index[token].add(doc_id)

In [8]:
# Step 7: TF-IDF calculations
doc_freq = {term: len(docs) for term, docs in inverted_index.items()}
total_docs = len(documents)

def compute_tf_idf(tokens):
    tf = Counter(tokens)
    return {
        term: (freq * math.log(total_docs / (1 + doc_freq.get(term, 1))))
        for term, freq in tf.items()
    }

def normalize(vector):
    norm = math.sqrt(sum(v**2 for v in vector.values()))
    return {k: v/norm for k, v in vector.items()} if norm > 0 else vector

print("\nBuilding TF-IDF vectors...")
tf_idf_vectors = [normalize(compute_tf_idf(tokens)) for tokens in documents['tokens']]


Building TF-IDF vectors...


In [9]:
# Step 8: Search function
def book_search(query, top_n=5):
    query_tokens = preprocess(query)
    if not query_tokens:
        print("No valid search terms")
        return

    query_vec = normalize(compute_tf_idf(query_tokens))

    scores = []
    for doc_id, doc_vec in enumerate(tf_idf_vectors):
        common_terms = set(query_vec) & set(doc_vec)
        score = sum(query_vec[term] * doc_vec[term] for term in common_terms)
        scores.append((doc_id, score))

    top_results = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]

    print(f"\nðŸ“š Top {len(top_results)} results for '{query}':")
    for idx, (doc_id, score) in enumerate(top_results, 1):
        book = documents.iloc[doc_id]
        print(f"\n{idx}. {book[title_col]}")
        print(f"   {book[desc_col][:150]}...")
        print(f"   Score: {score:.4f}")
        print("-" * 100)

In [10]:
# Step 9: Interactive interface
print("\n Book Search Engine Ready!")
print("Type your search query (e.g., 'mystery novel' or 'science fiction')")
print("Type 'exit' to quit\n")

while True:
    try:
        query = input("Search for books: ")
        if query.lower() in ['exit', 'quit']:
            print("Happy reading! ")
            break
        book_search(query)
    except KeyboardInterrupt:
        print("\nSearch closed")
        break
    except Exception as e:
        print(f"Error: {str(e)}")


 Book Search Engine Ready!
Type your search query (e.g., 'mystery novel' or 'science fiction')
Type 'exit' to quit

Search for books: python

ðŸ“š Top 5 results for 'python':

1. Python in a Nutshell
   Ask any Python aficionado and you'll hear that Python programmers have it all: an elegant language that offers object-oriented programming support, a ...
   Score: 0.7946
----------------------------------------------------------------------------------------------------

2. Monty Python's Tunisian Holiday: My Life with Brian
   â€œOne of the finest and most accurate records of the making of the film that I have ever read. I just wished I could remember what actually went on the...
   Score: 0.6025
----------------------------------------------------------------------------------------------------

3. Monty Python Live!
   A Private Word to the Reader You are currently holding in your hand . . . . . . well, actually, God knows what you are holding in your hand, you are a...
   Score: 0