In [25]:
!pip install --user nltk

# !pip install lxml



## Retrieve documents

In [1]:
from lxml import etree

def parse_xml(xml_file):
    """Read the corpus XML file and return one tuple per ``<doc>`` element.

    Each tuple is ``(docno, title, author, bib, text)`` and holds the ``.text``
    of the first matching descendant element for each field, so a field whose
    element carries no text comes back as ``None``.
    """
    field_paths = ('.//docno', './/title', './/author', './/bib', './/text')
    root = etree.parse(xml_file).getroot()
    return [
        tuple(doc.find(path).text for path in field_paths)
        for doc in root.findall('.//doc')  # every <doc> element below the root
    ]


documents = parse_xml("cran.all.1400.xml")
print("len(documents):", len(documents))

# Single pass: report every entry whose body text (field 4) is missing and
# keep only the entries that do have text.
documents_with_text = []
for row in documents:
    if row[4] is None:
        print(row)
    else:
        documents_with_text.append(row)
documents = documents_with_text

print("len(documents):", len(documents))
print(documents[:5])  # Display first 5 documents for inspection


len(documents): 1400
('471', None, None, None, None)
('995', None, None, None, None)
len(documents): 1398
[('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .', 'brenckman,m.', 'j. ae. scs. 25, 1958, 324.', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was f

In [2]:
import nltk
# Fetch the Punkt tokenizer models used by nltk's word_tokenize
# (reported as already up-to-date when they are installed).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### process text

In [54]:

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet data
nltk.download('stopwords')  # Download stopwords list
from nltk.tokenize import word_tokenize


def preprocess_text(text):
    """Normalize ``text`` and return its stemmed, stopword-free tokens.

    Pipeline: collapse every whitespace run to one space, trim, lowercase,
    delete each character that is neither a word character nor whitespace,
    tokenize with ``word_tokenize``, drop English stopwords, then apply the
    Porter stemmer to what is left.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    cleaned = re.sub(r'[^\w\s]', '', collapsed.strip().lower())
    tokens = word_tokenize(cleaned)

    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()
    return [porter.stem(token) for token in tokens if token not in english_stopwords]

import re

def preprocess_text_light(text):
    """Lowercase ``text``, delete punctuation and split on whitespace.

    Unlike ``preprocess_text`` this applies no stopword removal or stemming.
    Deleting punctuation fuses the pieces it separated (``"a,b"`` -> ``"ab"``).
    """
    return re.sub(r'[^\w\s]', '', text.lower()).split()

# Sanity check: run the full preprocessing pipeline on one sample sentence.
example_text = "The quick brown fox is running fast but it looks like he is flying for fuck sake!"
processed_text = preprocess_text(example_text)
print(processed_text)


['quick', 'brown', 'fox', 'run', 'fast', 'look', 'like', 'fli', 'fuck', 'sake']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Create Inverted Index Matrix

In [59]:
from collections import defaultdict

def build_inverted_index(documents):
    """Build a term -> postings-list index over the corpus.

    Args:
        documents: Iterable of ``(doc_id, title, author, bib, text)`` tuples.
            Entries with empty/``None`` text are skipped. ``bib`` is not
            indexed.

    Returns:
        ``defaultdict(list)`` mapping each term to ``[(doc_id, freq), ...]``,
        where ``freq`` is the number of occurrences of the term in the
        document's preprocessed title + text plus its lightly-processed author.
    """
    inverted_index = defaultdict(list)  # {term: [(doc_id, freq), ...]}

    for doc_id, title, author, bib, text in documents:
        if not text:  # Skip documents without text
            continue

        # title/author can be None when the source element was empty; treat
        # them as empty strings so the concatenation cannot raise TypeError.
        body_terms = preprocess_text((title or "") + " " + text)  # Combine title & text
        author_terms = preprocess_text_light(author or "")

        # Count term frequency in the document
        term_freq = defaultdict(int)
        for term in body_terms + author_terms:
            term_freq[term] += 1

        # Add term and frequency to the inverted index
        for term, freq in term_freq.items():
            inverted_index[term].append((doc_id, freq))
    return inverted_index

# Index the corpus (entries without text were filtered out when loading).
inverted_index = build_inverted_index(documents)
print(dict(list(inverted_index.items())[:5]))  # Display first 5 terms in the index


{'experiment': [('1', 3), ('11', 1), ('12', 1), ('16', 1), ('17', 1), ('19', 1), ('25', 1), ('29', 1), ('30', 2), ('35', 1), ('37', 1), ('41', 1), ('43', 1), ('47', 1), ('52', 2), ('53', 1), ('58', 1), ('69', 1), ('70', 1), ('74', 2), ('78', 2), ('84', 3), ('99', 2), ('101', 1), ('103', 1), ('112', 1), ('115', 1), ('121', 1), ('123', 3), ('131', 1), ('137', 1), ('140', 1), ('142', 1), ('154', 1), ('156', 1), ('167', 1), ('168', 1), ('170', 1), ('171', 2), ('173', 2), ('176', 1), ('179', 2), ('183', 1), ('184', 1), ('186', 3), ('187', 1), ('188', 1), ('189', 2), ('191', 1), ('195', 3), ('197', 2), ('202', 1), ('203', 1), ('206', 2), ('207', 2), ('212', 1), ('216', 1), ('220', 1), ('222', 1), ('225', 2), ('227', 1), ('230', 1), ('234', 4), ('245', 1), ('251', 1), ('256', 3), ('257', 1), ('262', 1), ('271', 3), ('273', 1), ('277', 1), ('282', 1), ('283', 1), ('286', 1), ('287', 1), ('289', 1), ('294', 1), ('295', 1), ('304', 1), ('307', 1), ('329', 2), ('330', 2), ('334', 2), ('338', 1), 

In [58]:
import math

def compute_tf(freq, doc_length, max_freq = 0, method="augmented"):
    """Return the term-frequency weight of a term under the chosen scheme.

    Args:
        freq: Raw count of the term in the document.
        doc_length: Total number of terms in the document (used by ``"raw"``).
        max_freq: Count of the most frequent term in the document (used by
            ``"augmented"``).
        method: ``"raw"`` -> ``freq / doc_length``; ``"log"`` ->
            ``1 + ln(freq)``; ``"augmented"`` -> ``0.5 + 0.5 * freq / max_freq``.

    Returns:
        The weight. 0 is returned for an unknown ``method``, for a non-positive
        ``freq`` under ``"log"``, and when the scheme's denominator
        (``doc_length`` or ``max_freq``) is zero.
    """
    if method == "raw":
        return freq / doc_length if doc_length else 0
    if method == "log":
        return 1 + math.log(freq) if freq > 0 else 0
    if method == "augmented":
        # max_freq defaults to 0, so the default call must not divide by it.
        return 0.5 + (0.5 * freq / max_freq) if max_freq else 0
    return 0


In [6]:
# Peek at the first two parsed documents.
print(documents[:2])

[('1', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .', 'brenckman,m.', 'j. ae. scs. 25, 1958, 324.', 'experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling effects wa

## Pure VSM (not working)

# Compute TF-IDF matrix for VSM

In [60]:
def calculate_tf(inverted_index, documents):
    """Compute length-normalized term frequencies from the inverted index.

    Args:
        inverted_index: Mapping ``term -> [(doc_id, freq), ...]``.
        documents: Not read; kept in the signature so existing calls keep
            working.

    Returns:
        ``{term: {doc_id: freq / doc_length}}`` where ``doc_length`` is the sum
        of the frequencies of every term indexed for that document.
    """
    # A document's length is exactly the sum of its term counts in the index,
    # so there is no need to tokenize and stem the whole corpus a second time.
    total_terms_per_doc = {}
    for doc_freqs in inverted_index.values():
        for doc_id, freq in doc_freqs:
            total_terms_per_doc[doc_id] = total_terms_per_doc.get(doc_id, 0) + freq

    # Calculate TF for each term in each document
    tf = {}
    for term, doc_freqs in inverted_index.items():
        tf[term] = {
            doc_id: (freq / total_terms_per_doc[doc_id]) if total_terms_per_doc[doc_id] else 0.0
            for doc_id, freq in doc_freqs
        }
    return tf

tf = calculate_tf(inverted_index, documents)
print("Term Frequency (TF):")
# tf is keyed by term, so '1' here is the indexed token "1", not a document id.
print(tf['1'])

Term Frequency (TF):
{'7': 0.006622516556291391, '18': 0.014492753623188406, '21': 0.025, '28': 0.010638297872340425, '40': 0.008849557522123894, '42': 0.0058823529411764705, '49': 0.004310344827586207, '59': 0.013245033112582781, '64': 0.01020408163265306, '72': 0.006211180124223602, '73': 0.005319148936170213, '77': 0.009852216748768473, '80': 0.005494505494505495, '85': 0.00625, '94': 0.004, '95': 0.04285714285714286, '100': 0.007142857142857143, '110': 0.009950248756218905, '129': 0.006535947712418301, '131': 0.005376344086021506, '138': 0.0078125, '139': 0.023809523809523808, '162': 0.008333333333333333, '163': 0.004166666666666667, '165': 0.008695652173913044, '171': 0.009174311926605505, '178': 0.021739130434782608, '190': 0.010416666666666666, '193': 0.008547008547008548, '198': 0.011428571428571429, '202': 0.011764705882352941, '213': 0.018518518518518517, '218': 0.0064516129032258064, '225': 0.008968609865470852, '234': 0.005128205128205128, '239': 0.009345794392523364, '247'

In [61]:
import math

def calculate_idf(inverted_index, documents):
    """Return ``{term: ln(N / df)}`` for every term in the index.

    ``N`` is ``len(documents)`` and ``df`` is the length of the term's postings
    list, i.e. the number of documents containing the term.
    """
    corpus_size = len(documents)
    return {
        term: math.log(corpus_size / len(postings))
        for term, postings in inverted_index.items()
    }

idf = calculate_idf(inverted_index, documents)
print("Inverse Document Frequency (IDF):")
# idf is keyed by term, so '1' here is the indexed token "1", not a document id.
print(idf['1'])

Inverse Document Frequency (IDF):
2.136852448893175


In [62]:
def calculate_tfidf(tf, idf):
    """Multiply every TF entry by its term's IDF.

    Takes ``tf`` as ``{term: {doc_id: tf_value}}`` and ``idf`` as
    ``{term: idf_value}``; returns ``{term: {doc_id: tf_value * idf_value}}``.
    """
    return {
        term: {doc_id: weight * idf[term] for doc_id, weight in per_doc.items()}
        for term, per_doc in tf.items()
    }

tfidf = calculate_tfidf(tf, idf)
print("TF-IDF Scores:")
# Per-document TF-IDF weights of the indexed term 'experiment'.
print(tfidf['experiment'])

TF-IDF Scores:
{'1': 0.051423433461737125, '11': 0.020619057861469475, '12': 0.018476818083654464, '16': 0.017350182834651146, '17': 0.01580794436045993, '19': 0.029034999845742728, '25': 0.006437624400187303, '29': 0.009178806402847703, '30': 0.03695363616730893, '35': 0.014975947288856777, '37': 0.013296401798517698, '41': 0.030928586792204213, '43': 0.015298010671412838, '47': 0.011566788556434096, '52': 0.024529568835196445, '53': 0.010235359657851753, '58': 0.011855958270344948, '69': 0.01598556171282465, '70': 0.011028798391018555, '74': 0.04822762686242013, '78': 0.023133577112868193, '84': 0.0384517565524701, '99': 0.016737823440486986, '101': 0.007295974320212275, '103': 0.018009050537232833, '112': 0.015298010671412838, '115': 0.013173286967049941, '121': 0.016737823440486986, '123': 0.02963989567586237, '131': 0.007649005335706419, '137': 0.05081124973004977, '140': 0.007008448238627556, '142': 0.020038239330160475, '154': 0.0225827776577999, '156': 0.010090177251357402, '16

In [63]:
def construct_tfidf_matrix(tfidf, documents):
    """Lay the sparse ``tfidf`` mapping out as a dense document-by-term matrix.

    Returns ``(tfidf_matrix, terms, doc_ids)``: ``terms`` are the keys of
    ``tfidf`` in iteration order (the columns), ``doc_ids`` are the first field
    of each entry in ``documents`` (the rows), and ``tfidf_matrix[r][c]`` is the
    weight of ``terms[c]`` in ``doc_ids[r]``, or ``0.0`` when the term does not
    occur in that document.
    """
    terms = list(tfidf)
    doc_ids = [entry[0] for entry in documents]
    tfidf_matrix = [
        [tfidf[term].get(doc_id, 0.0) for term in terms]
        for doc_id in doc_ids
    ]
    return tfidf_matrix, terms, doc_ids

tfidf_matrix, terms, doc_ids = construct_tfidf_matrix(tfidf, documents)
print("TF-IDF Matrix:")
# NOTE: this prints one complete row, i.e. one value per vocabulary term.
print(tfidf_matrix[10])
print("Terms:")
print(terms[10])
print("Document IDs:")
print(doc_ids[10])

TF-IDF Matrix:
[0.020619057861469475, 0.0, 0.03019916845766995, 0.0, 0.0, 0.025659815204670738, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05769636216658335, 0.0, 0.0, 0.07503877022134006, 0.06412637685221217, 0.021829356888616865, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025843658096370965, 0.0, 0.06370198259004779, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013839700907709728, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.060170369122252744, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04192506515773526, 0.07369036397105332, 0.0, 0.0, 0.0, 0.03576767354641045, 0.052184933689042375, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [64]:
# Print the matrix dimensions: the number of rows (documents), then the width
# (number of terms) of three sample rows, which should all be equal.
print(len(tfidf_matrix))
print(len(tfidf_matrix[0]))
print(len(tfidf_matrix[1]))
print(len(tfidf_matrix[5]))

1398
7913
7913
7913


In [65]:
import xml.etree.ElementTree as ET

def parse_queries(filename):
    """Parse queries from the Cranfield XML file.

    Args:
        filename: Path (or file object) of an XML document whose root has
            ``<top>`` children, each with a ``<num>`` and a ``<title>`` child.

    Returns:
        ``{num: title}`` with both values whitespace-stripped. A ``<top>``
        whose ``<num>`` or ``<title>`` is missing or empty is skipped.
    """
    queries = {}
    root = ET.parse(filename).getroot()

    for top in root.findall("top"):
        # findtext gives None/"" for an absent/empty element instead of the
        # AttributeError that `.find(...).text.strip()` raises, so incomplete
        # entries reach the skip check below.
        num = (top.findtext("num") or "").strip()  # Extract query number
        title = (top.findtext("title") or "").strip()  # Extract query text

        if num and title:
            queries[num] = title

    return queries

# Load every query as {query number: query text}.
queries = parse_queries("cran.qry.xml")
print(queries)

{'1': 'what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .', '2': 'what are the structural and aeroelastic problems associated with flight\nof high speed aircraft .', '4': 'what problems of heat conduction in composite slabs have been solved so\nfar .', '8': 'can a criterion be developed to show empirically the validity of flow\nsolutions for chemically reacting gas mixtures based on the simplifying\nassumption of instantaneous local chemical equilibrium .', '9': 'what chemical kinetic system is applicable to hypersonic aerodynamic\nproblems .', '10': 'what theoretical and experimental guides do we have as to turbulent\ncouette flow behaviour .', '12': 'is it possible to relate the available pressure distributions for an\nogive forebody at zero angle of attack to the lower surface pressures of\nan equivalent ogive forebody at angle of attack .', '13': 'what methods -dash exact or approximate -dash are presently available\nfor predic

In [66]:

# Step 1: Preprocess a custom query with the same pipeline used for the documents
custom_query = "experiment and computer flying"
query_terms = preprocess_text(custom_query)
print("Preprocessed Query Terms:")
print(query_terms)

# Step 2: Calculate TF-IDF Scores for the Query
def calculate_query_tfidf_scores(query_terms, tfidf_matrix, terms, doc_ids):
    """Score every document as the sum of its TF-IDF weights for the query terms.

    Args:
        query_terms: Preprocessed query tokens; a token repeated in the query
            is counted once per occurrence, tokens not in ``terms`` are ignored.
        tfidf_matrix: Row ``i`` holds the weights of document ``doc_ids[i]``,
            one column per entry of ``terms``.
        terms: Column labels of ``tfidf_matrix``.
        doc_ids: Row labels of ``tfidf_matrix``.

    Returns:
        ``{doc_id: score}`` for every id in ``doc_ids``.
    """
    # Resolve each query term to its column once, instead of repeating the
    # linear `terms.index` search for every document.
    columns = [terms.index(term) for term in query_terms if term in terms]

    return {
        doc_id: sum(tfidf_matrix[row][column] for column in columns)
        for row, doc_id in enumerate(doc_ids)
    }

# Score every document against the custom query.
# NOTE: this prints one score per document in the corpus.
query_scores = calculate_query_tfidf_scores(query_terms, tfidf_matrix, terms, doc_ids)
print("Query Scores:")
print(query_scores)

# Step 3: Rank Documents Based on Scores
def rank_documents(query_scores):
    """Return the ``(doc_id, score)`` pairs of ``query_scores``, best score first.

    Pairs with equal scores keep the relative order they have in the mapping.
    """
    def score_of(pair):
        return pair[1]

    ranking = list(query_scores.items())
    ranking.sort(key=score_of, reverse=True)
    return ranking

# Order the scored documents from best to worst.
# NOTE: this prints the full ranking, one pair per document.
ranked_documents = rank_documents(query_scores)
print("Ranked Documents:")
print(ranked_documents)

# Step 4: Display the Top Results
def display_top_results(ranked_documents, documents, top_k=5):
    """Print the ``top_k`` best-ranked documents with their score and title.

    ``ranked_documents`` is a sequence of ``(doc_id, score)`` pairs, best
    first; ``documents`` holds tuples whose field 0 is the id and field 1 the
    title. An id with no matching entry in ``documents`` raises
    ``StopIteration``.
    """
    print(f"Top {top_k} Results:")
    shortlist = ranked_documents[:top_k]
    for doc_id, score in shortlist:
        candidates = (entry for entry in documents if entry[0] == doc_id)
        doc_info = next(candidates)  # first corpus entry carrying this id
        print(f"Document ID: {doc_id}, Score: {score:.4f}, Title: {doc_info[1]}")

# Example usage: show the five best matches for the custom query
display_top_results(ranked_documents, documents, top_k=5)

Preprocessed Query Terms:
['experi', 'comput', 'fli']
Query Scores:
{'1': 0.026733944601776857, '2': 0.0, '3': 0.0, '4': 0.0, '5': 0.0, '6': 0.0, '7': 0.014694817231440258, '8': 0.0, '9': 0.0, '10': 0.0, '11': 0.0, '12': 0.0, '13': 0.0, '14': 0.021701906851976233, '15': 0.02881710911620103, '16': 0.029641628870991928, '17': 0.024654637799416436, '18': 0.0, '19': 0.0, '20': 0.0, '21': 0.0, '22': 0.0, '23': 0.0, '24': 0.015580856201418834, '25': 0.010040350235056467, '26': 0.0, '27': 0.0, '28': 0.0, '29': 0.0, '30': 0.05763421823240206, '31': 0.0, '32': 0.023371284302128252, '33': 0.0, '34': 0.0, '35': 0.0, '36': 0.0, '37': 0.0, '38': 0.0, '39': 0.02493165620165707, '40': 0.039272874370751845, '41': 0.1447120044748356, '42': 0.0, '43': 0.0, '44': 0.0, '45': 0.0, '46': 0.0, '47': 0.0, '48': 0.03329607626604573, '49': 0.010476782618195422, '50': 0.0, '51': 0.0, '52': 0.01912859829265068, '53': 0.0, '54': 0.0, '55': 0.0, '56': 0.0, '57': 0.0, '58': 0.0, '59': 0.016096778592194293, '60': 0.0

In [67]:
def generate_trec_eval_output(queries, tfidf_matrix, terms, doc_ids, run_id="my_run", top_k=100):
    """Produce result lines, one per retrieved document, for every query.

    Each query text is preprocessed, scored against all documents with
    ``calculate_query_tfidf_scores`` and its ``top_k`` best documents are
    emitted as ``"<query_id> 0 <doc_id> <rank> <score> <run_id>"`` with ranks
    starting at 1 and the score formatted to six decimals.

    Returns the lines as a list of strings, grouped by query in the iteration
    order of ``queries``.
    """
    def score_of(pair):
        return pair[1]

    output_lines = []
    for query_id, query_text in queries.items():
        scores = calculate_query_tfidf_scores(
            preprocess_text(query_text), tfidf_matrix, terms, doc_ids
        )
        # Best score first, truncated to the requested depth.
        best = sorted(scores.items(), key=score_of, reverse=True)[:top_k]
        output_lines.extend(
            f"{query_id} 0 {doc_id} {rank} {similarity:.6f} {run_id}"
            for rank, (doc_id, similarity) in enumerate(best, start=1)
        )
    return output_lines

# Example usage
run_id = "tfidf_run"
top_k = 100  # Number of documents to retrieve per query
output_lines = generate_trec_eval_output(queries, tfidf_matrix, terms, doc_ids, run_id, top_k)

# Save the output to a file, one result line per row
with open("trec_output.txt", "w") as f:
    f.writelines(line + "\n" for line in output_lines)

print("Result file generated: trec_output.txt")

Result file generated: trec_output.txt


# Cosine similarity


In [44]:
import numpy as np

def compute_query_vector(query_terms, terms, idf):
    """Build the TF-IDF vector of a query in the column order of ``terms``.

    Args:
        query_terms: Preprocessed query tokens; tokens not in ``terms`` are
            ignored.
        terms: Vocabulary list defining the vector's columns.
        idf: Mapping ``term -> idf``. It is looked up by term, so its
            iteration order does not have to match ``terms``; a term with no
            entry gets weight 0.

    Returns:
        1-D float ``numpy`` array of length ``len(terms)`` holding
        ``count_in_query * idf`` per term.
    """
    # Column of each vocabulary term (first occurrence, as terms.index would
    # give), built once instead of a linear search per query token.
    column_of = {}
    for column, term in enumerate(terms):
        column_of.setdefault(term, column)

    query_vector = np.zeros(len(terms))
    for term in query_terms:
        column = column_of.get(term)
        if column is not None:
            query_vector[column] += 1  # Term frequency in the query

    # Multiply by IDF, aligned explicitly with the columns rather than relying
    # on idf happening to iterate in the same order as terms.
    query_vector *= np.array([idf.get(term, 0.0) for term in terms])
    return query_vector

In [45]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    When either vector has zero length the similarity is defined as ``0``
    here, which also avoids dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    length1, length2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    if not (length1 != 0 and length2 != 0):
        return 0
    return numerator / (length1 * length2)

In [46]:
def generate_trec_eval_output_with_cosine(queries, tfidf_matrix, terms, doc_ids, idf, run_id="my_run", top_k=100):
    """Produce result lines for every query, ranking by cosine similarity.

    Each query text is preprocessed and turned into a vector with
    ``compute_query_vector``; every row of ``tfidf_matrix`` is then compared to
    it with ``cosine_similarity``. The ``top_k`` most similar documents are
    emitted as ``"<query_id> 0 <doc_id> <rank> <similarity> <run_id>"`` with
    ranks starting at 1 and the similarity formatted to six decimals.

    Returns the lines as a list of strings, grouped by query in the iteration
    order of ``queries``.
    """
    def similarity_of(pair):
        return pair[1]

    output_lines = []
    for query_id, query_text in queries.items():
        query_vector = compute_query_vector(preprocess_text(query_text), terms, idf)

        # Row i of the matrix is the TF-IDF vector of doc_ids[i].
        query_scores = {
            doc_id: cosine_similarity(query_vector, tfidf_matrix[row])
            for row, doc_id in enumerate(doc_ids)
        }

        best = sorted(query_scores.items(), key=similarity_of, reverse=True)[:top_k]
        output_lines.extend(
            f"{query_id} 0 {doc_id} {rank} {similarity:.6f} {run_id}"
            for rank, (doc_id, similarity) in enumerate(best, start=1)
        )
    return output_lines

In [47]:
# Example usage
run_id = "tfidf_cosine_run"
top_k = 100  # Number of documents to retrieve per query
output_lines = generate_trec_eval_output_with_cosine(queries, tfidf_matrix, terms, doc_ids, idf, run_id, top_k)

# Save the output to a file, one result line per row
with open("trec_output_cosine.txt", "w") as f:
    f.writelines(line + "\n" for line in output_lines)

print("Result file generated: trec_output_cosine.txt")

Result file generated: trec_output_cosine.txt
