# Generative vs. Geometric: A Comparative Analysis

This notebook implements and compares two information retrieval models on the **Cranfield Dataset**:
1.  **Vector Space Model (VSM):** Using TF-IDF and Cosine Similarity.
2.  **Language Model (LM):** Using Dirichlet Smoothing.

Dataset: **Cranfield** (Aerodynamics)

In [1]:
import os
import re
import numpy as np
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Configuration and Setup

In [2]:
# --- Configuration ---
DATA_DIR = os.path.join(os.getcwd(), 'cran_data')
FILES = {
    'docs': os.path.join(DATA_DIR, 'cran.all.1400'),
    'queries': os.path.join(DATA_DIR, 'cran.qry'),
    'rels': os.path.join(DATA_DIR, 'cranqrel')
}
RESULT_FILE = 'cran_result/cran_result.txt'
SAMPLE_RESULT_FILE = 'cran_result/cran_sample_result.txt'

## Phase A: Preprocessing (The Parser)
We handle the specific format of Cranfield files (.I, .T, .W, .A).

In [3]:
class CranfieldParser:
    def parse_docs(self, file_path):
        docs = {}
        doc_id = None
        content = []
        capture = False
        
        with open(file_path, 'r') as f:
            lines = f.readlines()
            
        for line in lines:
            line = line.strip()
            if line.startswith('.I'):
                if doc_id:
                    docs[doc_id] = " ".join(content)
                doc_id = int(line.split()[1])
                content = []
                capture = False
            elif line.startswith('.T'):
                capture = True
                if len(line) > 3: content.append(line[3:].strip())
            elif line.startswith('.W'):
                capture = True
            elif line.startswith('.'):
                capture = False
            elif capture and doc_id:
                content.append(line)
        
        if doc_id:
            docs[doc_id] = " ".join(content)
        return docs

    def parse_titles(self, file_path):
        titles = {}
        doc_id = None
        title_content = []
        capture_title = False
        
        with open(file_path, 'r') as f:
            lines = f.readlines()
            
        for line in lines:
            line = line.strip()
            if line.startswith('.I'):
                if doc_id and title_content:
                    titles[doc_id] = " ".join(title_content)
                doc_id = int(line.split()[1])
                title_content = []
                capture_title = False
            elif line.startswith('.T'):
                capture_title = True
                if len(line) > 3: title_content.append(line[3:].strip())
            elif line.startswith('.W') or (line.startswith('.') and not line.startswith('.T')):
                capture_title = False
            elif capture_title and doc_id:
                title_content.append(line)
        
        if doc_id and title_content:
            titles[doc_id] = " ".join(title_content)
        return titles

    def parse_queries(self, file_path):
        queries = {}
        qid = None
        content = []
        capture = False
        
        with open(file_path, 'r') as f:
            lines = f.readlines()
            
        for line in lines:
            line = line.strip()
            if line.startswith('.I'):
                if qid:
                    queries[qid] = " ".join(content)
                qid = int(line.split()[1])
                content = []
                capture = False
            elif line.startswith('.W'):
                capture = True
            elif line.startswith('.'):
                capture = False
            elif capture and qid:
                content.append(line)
        
        if qid:
             queries[qid] = " ".join(content)
        return queries

    def parse_rels(self, file_path):
        rels = defaultdict(set)
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if not parts: continue
                try:
                    qid = int(parts[0])
                    doc_id = int(parts[1])
                    rel = int(parts[2])
                    if rel in [1, 2, 3, 4]:
                        rels[qid].add(doc_id)
                except (ValueError, IndexError):
                    continue
        return rels

## Phase B: Retrieval Models - Vector Space

In [4]:
class VectorSpaceModel:
    def __init__(self, docs):
        self.doc_ids = list(docs.keys())
        self.corpus = list(docs.values())
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(self.corpus)

    def retrieve(self, query):
        query_vec = self.vectorizer.transform([query])
        scores = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        ranked_indices = scores.argsort()[::-1]
        return [(self.doc_ids[i], scores[i]) for i in ranked_indices[:100]]

## Phase B: Retrieval Models - Language Model

In [5]:
class LanguageModel:
    def __init__(self, docs):
        self.doc_ids = list(docs.keys())
        self.docs = docs
        self.vocab = set()
        self.doc_term_counts = {}
        self.coll_term_counts = Counter()
        self.coll_length = 0
        
        print(f"Building LM Index for {len(docs)} docs...")
        for doc_id, text in docs.items():
            terms = self._tokenize(text)
            term_counts = Counter(terms)
            self.doc_term_counts[doc_id] = term_counts
            self.coll_term_counts.update(term_counts)
            self.coll_length += len(terms)
            self.vocab.update(terms)

    def _tokenize(self, text):
        return re.findall(r'\w+', text.lower())

    def retrieve(self, query, mu=500):
        query_terms = self._tokenize(query)
        scores = []
        
        coll_probs = {}
        for term in query_terms:
            if term in self.vocab:
                coll_probs[term] = self.coll_term_counts[term] / self.coll_length
            else:
                coll_probs[term] = 0.0

        for doc_id in self.doc_ids:
            score = 0
            doc_len = sum(self.doc_term_counts[doc_id].values())
            if doc_len == 0: doc_len = 0.0001 

            for term in query_terms:
                if term not in self.vocab: continue
                
                tf = self.doc_term_counts[doc_id][term]
                cf_prob = coll_probs[term]
                
                numerator = tf + mu * cf_prob
                denominator = doc_len + mu
                
                if numerator > 0:
                    score += np.log(numerator / denominator)
                else:
                    score += -100

            scores.append((doc_id, score))
            
        return sorted(scores, key=lambda x: x[1], reverse=True)[:100]

## Phase C: Evaluation Metrics

In [6]:
def calculate_map(retrieved, relevant):
    if not relevant: return 0.0
    score = 0.0
    hits = 0.0
    for i, (doc_id, _) in enumerate(retrieved):
        if doc_id in relevant:
            hits += 1.0
            score += hits / (i + 1)
    return score / len(relevant)

def calculate_p10(retrieved, relevant):
    if not relevant: return 0.0
    hits = sum(1 for doc_id, _ in retrieved[:10] if doc_id in relevant)
    return hits / 10.0

## Phase D: Main Execution & Analysis

In [7]:
# Execution
if __name__ == "__main__":
    parser = CranfieldParser()
    print("Parsing Cranfield Dataset...")
    docs = parser.parse_docs(FILES['docs'])
    titles = parser.parse_titles(FILES['docs'])
    queries = parser.parse_queries(FILES['queries'])
    rels = parser.parse_rels(FILES['rels'])
    
    print(f"Loaded {len(docs)} documents, {len(queries)} queries, {len(rels)} relevance sets.")
    
    print("\nInitializing Models...")
    vsm = VectorSpaceModel(docs)
    lm_model = LanguageModel(docs)

    print("\nRunning Retrieval...")
    vsm_map_scores = []
    vsm_p10_scores = []
    lm_map_scores = []
    lm_p10_scores = []
    sorted_qids = sorted(queries.keys())
    
    results_header = f"{'Model':<10} | {'Query ID':<10} | {'Doc ID':<10} | {'Title':<60} | {'Query':<50}\n"
    separator = "-" * 150 + "\n"

    # Open both result files
    with open(RESULT_FILE, 'w', encoding='utf-8') as f, open(SAMPLE_RESULT_FILE, 'w', encoding='utf-8') as sample_f:
        f.write(results_header)
        f.write(separator)
        
        for i, qid in enumerate(sorted_qids):
            query_text = queries[qid]
            relevant = rels.get(qid, set())
            
            # --- VSM ---
            results_vsm = vsm.retrieve(query_text)
            vsm_map_scores.append(calculate_map(results_vsm, relevant))
            vsm_p10_scores.append(calculate_p10(results_vsm, relevant))
            
            # Write Result Table
            for rank, (doc_id, score) in enumerate(results_vsm[:10]):
                title = titles.get(doc_id, "Unknown Title").strip().replace('\n', ' ')
                q_text_san = query_text.strip().replace('\n', ' ')
                # Truncate
                max_len = 48
                if len(q_text_san) > max_len:
                    q_display = q_text_san[:max_len-3] + "..."
                else:
                    q_display = q_text_san
                
                f.write(f"{'VSM':<10} | {qid:<10} | {doc_id:<10} | {title[:58]:<60} | {q_display:<50}\n")
            
            # --- LM (Mu=500) ---
            results_lm = lm_model.retrieve(query_text, mu=500)
            lm_map_scores.append(calculate_map(results_lm, relevant))
            lm_p10_scores.append(calculate_p10(results_lm, relevant))
            
            for rank, (doc_id, score) in enumerate(results_lm[:10]):
                title = titles.get(doc_id, "Unknown Title").strip().replace('\n', ' ')
                q_text_san = query_text.strip().replace('\n', ' ')
                # Truncate
                max_len = 48
                if len(q_text_san) > max_len:
                    q_display = q_text_san[:max_len-3] + "..."
                else:
                    q_display = q_text_san
                    
                f.write(f"{'LM':<10} | {qid:<10} | {doc_id:<10} | {title[:58]:<60} | {q_display:<50}\n")

            # --- Sample Output (First 5 queries, Top 1 doc for each model) ---
            if i < 5:
                # VSM Sample
                if results_vsm:
                    doc_id = results_vsm[0][0]
                    content = docs.get(doc_id, "Content not found.").strip()
                    sample_f.write(f"Model: VSM\n")
                    sample_f.write(f"query: {query_text.strip()}\n")
                    sample_f.write(f"Retrieved Document ID: {doc_id}\n")
                    sample_f.write(f"Content: {content}\n")
                    sample_f.write("-" * 20 + "\n")

                # LM Sample
                if results_lm:
                    doc_id = results_lm[0][0]
                    content = docs.get(doc_id, "Content not found.").strip()
                    sample_f.write(f"Model: The Language Model\n")
                    sample_f.write(f"query: {query_text.strip()}\n")
                    sample_f.write(f"Retrieved Document ID: {doc_id}\n")
                    sample_f.write(f"Content: {content}\n")
                    sample_f.write("=" * 50 + "\n\n")
    
    print(f"\n" + "="*50)
    print(f"       EVALUATION RESULTS        ")
    print(f"="*50)
    print(f"{'Model':<20} | {'MAP':<10} | {'P@10':<10}")
    print("-" * 45)
    print(f"{'TF-IDF (VSM)':<20} | {np.mean(vsm_map_scores):.4f}     | {np.mean(vsm_p10_scores):.4f}")
    print(f"{'Dirichlet LM':<20} | {np.mean(lm_map_scores):.4f}     | {np.mean(lm_p10_scores):.4f}")
    print("="*50)
    print(f"Detailed results saved to {RESULT_FILE}")
    print(f"Sample detailed output saved to {SAMPLE_RESULT_FILE}")

Parsing Cranfield Dataset...
Loaded 1400 documents, 225 queries, 225 relevance sets.

Initializing Models...
Building LM Index for 1400 docs...

Running Retrieval...

       EVALUATION RESULTS        
Model                | MAP        | P@10      
---------------------------------------------
TF-IDF (VSM)         | 0.0064     | 0.0093
Dirichlet LM         | 0.0049     | 0.0080
Detailed results saved to cran_result/cran_result.txt
Sample detailed output saved to cran_result/cran_sample_result.txt
