In [6]:
!pip install --user nltk

# !pip install lxml

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
from lxml import etree

def parse_xml(xml_file):
    tree = etree.parse(xml_file)
    root = tree.getroot()
    
    documents = []
    for doc in root.findall('.//doc'):  # Look for all <doc> elements
        docno = doc.find('.//docno').text
        title = doc.find('.//title').text
        author = doc.find('.//author').text
        bib = doc.find('.//bib').text
        text = doc.find('.//text').text
        documents.append((docno, title, author, bib, text))
    return documents


# Example usage
documents = parse_xml("cran.all.1400.xml")
print("len(documents):", len(documents))
for row in documents:
    if row[4] is None:
        print(row)
    
documents = [row for row in documents if row[4] is not None]

print("len(documents):", len(documents))
print(documents[:5])  # Display first 5 documents for inspection


len(documents): 1400
('471', None, None, None, None)
('995', None, None, None, None)
len(documents): 1398
[('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .', 'brenckman,m.', 'j. ae. scs. 25, 1958, 324.', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was f

In [10]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # Download WordNet data


nltk.download('stopwords')  # Download stopwords list

import re
from nltk.tokenize import word_tokenize


def preprocess_text(text):
    # Remove newline characters and extra spaces
    text = re.sub(r'\s+', ' ', text)  # Replace all kinds of whitespace with a single space
    text = text.strip()  # Remove leading/trailing whitespace
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and tokenize
    text = re.sub(r'[^\w\s]', '', text)  # Keep only alphanumeric and whitespace characters
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming
    stemmer = PorterStemmer()
    filtered_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # lemmatizer = WordNetLemmatizer()
    # filtered_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return filtered_tokens


# Example usage
example_text = "The quick brown fox is running fast but it looks like he is flying for fuck sake!"
processed_text = preprocess_text(example_text)
print(processed_text)


['quick', 'brown', 'fox', 'run', 'fast', 'look', 'like', 'fli', 'fuck', 'sake']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
from collections import defaultdict

def build_inverted_index(documents):
    inverted_index = defaultdict(list)  # {term: [(doc_id, freq), ...]}
    
    for doc_id, title, author, bib, text in documents:
        if not text:  # Skip documents without text
            continue
        else:
            processed_text = preprocess_text(text)
            term_freq = defaultdict(int)
            
            # Count term frequency in the document
            for term in processed_text:
                term_freq[term] += 1
            
            # Add term and frequency to the inverted index
            for term, freq in term_freq.items():
                inverted_index[term].append((doc_id, freq))
    
    return inverted_index

inverted_index = build_inverted_index(documents)
print(dict(list(inverted_index.items())[:5]))  # Display first 5 terms in the index


{'experiment': [('1', 2), ('11', 1), ('12', 1), ('16', 1), ('17', 1), ('19', 1), ('25', 1), ('29', 1), ('30', 2), ('35', 1), ('37', 1), ('41', 1), ('43', 1), ('47', 1), ('52', 1), ('53', 1), ('58', 1), ('69', 1), ('70', 1), ('74', 1), ('78', 2), ('84', 2), ('99', 2), ('101', 1), ('103', 1), ('112', 1), ('115', 1), ('121', 1), ('123', 3), ('131', 1), ('137', 1), ('140', 1), ('142', 1), ('154', 1), ('156', 1), ('167', 1), ('168', 1), ('170', 1), ('171', 2), ('173', 2), ('176', 1), ('179', 2), ('183', 1), ('184', 1), ('186', 3), ('187', 1), ('188', 1), ('189', 1), ('191', 1), ('195', 3), ('197', 2), ('202', 1), ('203', 1), ('206', 2), ('207', 2), ('212', 1), ('216', 1), ('220', 1), ('222', 1), ('225', 2), ('227', 1), ('230', 1), ('234', 4), ('245', 1), ('251', 1), ('256', 2), ('257', 1), ('262', 1), ('271', 2), ('273', 1), ('277', 1), ('282', 1), ('283', 1), ('286', 1), ('287', 1), ('289', 1), ('294', 1), ('295', 1), ('304', 1), ('307', 1), ('329', 2), ('330', 2), ('334', 2), ('338', 1), 

In [43]:
# Convert list of tuples to dictionary
def transform_index(inverted_index):
    new_index = {}
    for term, postings in inverted_index.items():
        new_index[term] = {doc_id: freq for doc_id, freq in postings}
    return new_index

# Example usage
inverted_index = transform_index(inverted_index)
print(dict(list(inverted_index.items())[:5]))

{'experiment': {'1': 2, '11': 1, '12': 1, '16': 1, '17': 1, '19': 1, '25': 1, '29': 1, '30': 2, '35': 1, '37': 1, '41': 1, '43': 1, '47': 1, '52': 1, '53': 1, '58': 1, '69': 1, '70': 1, '74': 1, '78': 2, '84': 2, '99': 2, '101': 1, '103': 1, '112': 1, '115': 1, '121': 1, '123': 3, '131': 1, '137': 1, '140': 1, '142': 1, '154': 1, '156': 1, '167': 1, '168': 1, '170': 1, '171': 2, '173': 2, '176': 1, '179': 2, '183': 1, '184': 1, '186': 3, '187': 1, '188': 1, '189': 1, '191': 1, '195': 3, '197': 2, '202': 1, '203': 1, '206': 2, '207': 2, '212': 1, '216': 1, '220': 1, '222': 1, '225': 2, '227': 1, '230': 1, '234': 4, '245': 1, '251': 1, '256': 2, '257': 1, '262': 1, '271': 2, '273': 1, '277': 1, '282': 1, '283': 1, '286': 1, '287': 1, '289': 1, '294': 1, '295': 1, '304': 1, '307': 1, '329': 2, '330': 2, '334': 2, '338': 1, '339': 1, '344': 2, '345': 1, '346': 3, '347': 1, '354': 1, '360': 1, '369': 1, '370': 1, '372': 2, '377': 1, '397': 1, '409': 1, '411': 2, '413': 2, '418': 1, '420': 1

In [None]:
import math

def compute_tf_idf(inverted_index, documents):
    """Compute the TF-IDF matrix from an inverted index."""
    N = len(documents)  # Total number of documents
    tf_idf = {}  # Store TF-IDF scores
    
    # Create a dictionary for document lengths (total terms in each document)
    doc_lengths = {doc[0]: len(doc[4].split()) for doc in documents}
    print("doc lengths:", doc_lengths)
    
    for term, doc_freqs in inverted_index.items():
        df = len(doc_freqs)  # Number of documents containing the term
        if df == 0:
            print("df:", df)
        
        idf = math.log(N / df) if df > 0 else 0  # Compute IDF
        
        for doc_id, freq in doc_freqs.items():
            # Retrieve the document length for normalization
            doc_length = doc_lengths[doc_id]
            tf = freq / doc_length  # Normalize TF (term frequency)
            tf_idf_score = tf * idf  # Compute TF-IDF
            
            if doc_id not in tf_idf:
                tf_idf[doc_id] = {}
            tf_idf[doc_id][term] = tf_idf_score
    
    return tf_idf

tf_idf = compute_tf_idf(inverted_index, documents)
print(dict(list(tf_idf.items())[:5]))  # Display first 5 documents in the TF-IDF matrix

doc lengths: {'1': 143, '2': 199, '3': 26, '4': 78, '5': 55, '6': 106, '7': 220, '8': 164, '9': 338, '10': 54, '11': 109, '12': 129, '13': 144, '14': 375, '15': 144, '16': 143, '17': 144, '18': 132, '19': 68, '20': 164, '21': 61, '22': 90, '23': 141, '24': 272, '25': 380, '26': 61, '27': 140, '28': 164, '29': 253, '30': 116, '31': 37, '32': 183, '33': 271, '34': 176, '35': 157, '36': 143, '37': 164, '38': 84, '39': 161, '40': 159, '41': 73, '42': 276, '43': 150, '44': 290, '45': 164, '46': 91, '47': 203, '48': 122, '49': 403, '50': 153, '51': 208, '52': 187, '53': 213, '54': 217, '55': 141, '56': 222, '57': 179, '58': 183, '59': 236, '60': 140, '61': 136, '62': 292, '63': 138, '64': 151, '65': 85, '66': 166, '67': 90, '68': 114, '69': 135, '70': 206, '71': 81, '72': 256, '73': 347, '74': 87, '75': 115, '76': 194, '77': 356, '78': 199, '79': 169, '80': 292, '81': 117, '82': 317, '83': 316, '84': 179, '85': 287, '86': 116, '87': 117, '88': 140, '89': 440, '90': 108, '91': 157, '92': 200,

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def preprocess_query(query):
    """Tokenizes and stems the query."""
    tokens = word_tokenize(query.lower())  # Lowercasing + tokenization
    stemmed_tokens = [stemmer.stem(token) for token in tokens]  # Apply stemming
    return stemmed_tokens

In [65]:
import numpy as np

def compute_cosine_similarity(tf_idf, query, inverted_index, N):
    """Compute cosine similarity between query and documents."""
    query_tf_idf = {}

    # Compute TF-IDF for query using the same IDF values as the document matrix
    for term in query:
        if term in inverted_index:
            df = len(inverted_index[term])  # Document frequency
            idf = math.log(N / df) if df > 0 else 0  # Compute IDF
            tf = query.count(term) / len(query)  # Query term frequency
            query_tf_idf[term] = tf * idf  # TF-IDF for query

    # Compute cosine similarity for each document
    scores = {}
    for doc_id, doc_vector in tf_idf.items():
        doc_norm = np.linalg.norm(list(doc_vector.values()))  # Document vector norm
        query_norm = np.linalg.norm(list(query_tf_idf.values()))  # Query vector norm

        # Compute dot product
        dot_product = sum(doc_vector.get(term, 0) * query_tf_idf.get(term, 0) for term in query_tf_idf)

        # Compute cosine similarity
        if doc_norm > 0 and query_norm > 0:
            scores[doc_id] = dot_product / (doc_norm * query_norm)
        else:
            scores[doc_id] = 0

    # Rank documents by score
    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_docs

# Example usage:
query = "flying"

# preprocess query
query = preprocess_text(query)

ranked_results = compute_cosine_similarity(tf_idf, query, inverted_index, len(documents))
print(ranked_results[:10])  # Show top 10 ranked documents

[('700', 0.33355946436282313), ('253', 0.22493878213042237), ('468', 0.1413552352404559), ('884', 0.13800794167915462), ('1323', 0.11416879992775769), ('1297', 0.11231553782254357), ('1147', 0.10872622426098287), ('1000', 0.1084048079348431), ('76', 0.1008002380149461), ('933', 0.08227598328977229)]


In [68]:
# Create a lookup dictionary for document titles and full texts
doc_lookup = {doc[0]: (doc[1], doc[4]) for doc in documents}  # { "1": ("Title 1", "Full text..."), ... }

def display_results(ranked_results, doc_lookup, top_n=10):
    """Display the top N ranked results with their titles and allow the user to read one."""
    print("\nTop Search Results:\n")
    for rank, (doc_id, score) in enumerate(ranked_results[:top_n], start=1):
        title = doc_lookup.get(doc_id, ("Unknown Title", ""))[0]
        print(f"{rank}. [{doc_id}] {title} (Score: {score:.4f})")

    # Ask user if they want to read a document
    doc_id_to_read = input("\nEnter a document ID to read (or press Enter to skip): ").strip()
    if doc_id_to_read in doc_lookup:
        title, text = doc_lookup[doc_id_to_read]
        print(f"\n=== {title} ===\n{text[:500]}...")  # Show only first 500 characters
    else:
        print("No document selected or invalid ID.")

display_results(ranked_results, doc_lookup)


Top Search Results:

1. [700] two and three-dimensional unsteady lift problems in high speed flight . (Score: 0.3336)
2. [253] on the ground level disturbance from large aircraft
flying at supersonic speeds . (Score: 0.2249)
3. [468] a refinement of the linearised transonic flow theory . (Score: 0.1414)
4. [884] the estimation of fatigue damage on structural elements . (Score: 0.1380)
5. [1323] an investigation of the use of an auxiliary slot to
re-establish laminar flow on low drag aerofoils . (Score: 0.1142)
6. [1297] ionization nonequilibrium in expanding flows . (Score: 0.1123)
7. [1147] heat transfer to bodies traveling at high speed in the upper
atmosphere . (Score: 0.1087)
8. [1000] free-flight measurements of the static and dynamic
stability and drag of a 10 blunted cone at mach numbers
3 .5 and 8 .5 . (Score: 0.1084)
9. [76] flight measurement of wall pressure fluctuations and
boundary-layer turbulence . (Score: 0.1008)
10. [933] the characteristics of roughness from insects 

In [74]:
import xml.etree.ElementTree as ET

def parse_queries(filename):
    """Parse queries from the Cranfield XML file."""
    queries = {}
    tree = ET.parse(filename)
    root = tree.getroot()
    
    for top in root.findall("top"):
        num = top.find("num").text.strip()  # Extract query number
        title = top.find("title").text.strip()  # Extract query text
        
        if num and title:
            queries[num] = title

    return queries
    



In [75]:
vsm_results = {}

queries = parse_queries("cran.qry.xml")  # Load queries from XML file
print("len(queries):", len(queries))

for query_id, query_text in queries.items():
    query_text = preprocess_text(query_text)  # Apply the same preprocessing as indexing
    ranked_docs = compute_cosine_similarity(tf_idf, query_text, inverted_index, len(documents))
    vsm_results[query_id] = ranked_docs


len(queries): 225


In [76]:
def save_results_trec_format(ranked_results, model_name, output_file):
    """
    Save ranked results in TREC format: query_id Q0 doc_id rank score model_name
    """
    with open(output_file, 'w') as f:
        for query_id, results in ranked_results.items():
            for rank, (doc_id, score) in enumerate(results, start=1):
                f.write(f"{query_id} Q0 {doc_id} {rank} {score:.4f} {model_name}\n")

# Save VSM results
save_results_trec_format(vsm_results, "VSM", "vsm_results.txt")
