# Generative vs. Geometric: A Comparative Analysis (Backup/CISI)

This notebook implements and compares two information retrieval models on the **CISI Dataset**:
1.  **Vector Space Model (Week 3):** Using TF-IDF and Cosine Similarity.
2.  **Query Likelihood Model (Week 5):** Using Dirichlet Smoothing.

Dataset: **CISI** (Centre for Inventions and Scientific Information)

In [1]:
# importing standard stuff for file handling
import os
import re
# numpy is for the math heavy lifting
import numpy as np
from collections import defaultdict, Counter
# using sklearn for the vector magic and similarity checks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Configuration and Setup

In [None]:
# Configuration: where the CISI input files live and where reports are written.
DATA_DIR = os.path.join(os.getcwd(), 'cisi_data')
# The three CISI files: documents, queries and relevance judgments.
FILES = {
    'docs': os.path.join(DATA_DIR, 'CISI.ALL'),
    'queries': os.path.join(DATA_DIR, 'CISI.QRY'),
    'rels': os.path.join(DATA_DIR, 'CISI.REL')
}
# Both reports go into the same output directory. The sample path previously
# pointed at 'csi_result/' (missing an "i"), which split the output across two
# folders.
RESULT_FILE = 'cisi_result/cisi_result.txt'
SAMPLE_RESULT_FILE = 'cisi_result/cisi_sample_result.txt'

# Function words that are dropped from every token stream.
STOPWORDS = {
    "a", "an", "and", "are", "as", "at",
    "be", "but", "by", "for", "if", "in",
    "into", "is", "it", "no", "not", "of",
    "on", "or", "such", "that", "the", "their",
    "then", "there", "these", "they", "this", "to",
    "was", "will", "with",
}


# Shared tokenizer used by both retrieval models.
def tokenize(text):
    """Lowercase `text`, blank out punctuation, and drop stopwords.

    Every character that is not a lowercase ASCII letter, a digit or
    whitespace is replaced by a space before splitting, so hyphenated or
    punctuated words break into separate tokens.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    kept = []
    for word in cleaned.split():
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

## Phase A: Preprocessing (The Parser)

In [3]:
class CISIParser:
    """Readers for the CISI collection files.

    Document and query files are sequences of records. Each record starts
    with a line ``.I <id>`` and is followed by field-tag lines (``.T``,
    ``.A``, ``.W``, ...) that introduce the text lines belonging to that
    field. The relevance file is a plain whitespace-separated table.
    """

    # A field-tag line: a dot, one letter, then whitespace or end of line.
    # Body text that merely begins with a dot (".5 percent", "...") does not
    # match and is therefore kept as text.
    _TAG_LINE = re.compile(r'^\.[A-Za-z](?:\s|$)')

    @staticmethod
    def _parse_cisi_content(file_path, target_tags):
        """Collect the text of the requested fields for every record.

        Args:
            file_path: path of a CISI document or query file.
            target_tags: iterable of two-character tags (e.g. ``['.T', '.W']``)
                whose text should be kept.

        Returns:
            dict[int, str]: record id -> text of the requested fields, lines
            joined by single spaces and stripped. Records whose id is not an
            integer are skipped. If an id occurs twice, the later record wins.
        """
        wanted = set(target_tags)
        parsed_data = {}

        obj_id = None          # id of the record being read; None = skip lines
        collected_text = []
        current_tag = None

        # Explicit encoding keeps parsing independent of the platform locale;
        # undecodable bytes are replaced rather than aborting the whole load.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.rstrip('\n')

                # A record boundary is only recognised at the start of a line,
                # so ".I " appearing inside a sentence no longer cuts a
                # document in two.
                if line.startswith('.I '):
                    if obj_id is not None:
                        parsed_data[obj_id] = " ".join(collected_text).strip()
                    try:
                        obj_id = int(line[3:].strip())
                    except ValueError:
                        obj_id = None  # Malformed id: ignore this record
                    collected_text = []
                    current_tag = None
                    continue

                if obj_id is None:
                    continue  # Preamble, or inside a record with a bad id

                if CISIParser._TAG_LINE.match(line):
                    # Switch field (e.g. .T, .W, .A); the tag line is not text
                    current_tag = line[:2]
                    continue

                if current_tag in wanted:
                    collected_text.append(line)

        # Flush the final record, which has no following ".I" line.
        if obj_id is not None:
            parsed_data[obj_id] = " ".join(collected_text).strip()

        return parsed_data

    @staticmethod
    def parse_docs(file_path):
        """Return ``{doc_id: text}`` built from the title (.T) and abstract (.W)."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T', '.W'])

    @staticmethod
    def parse_titles(file_path):
        """Return ``{doc_id: title}`` using only the title (.T) field."""
        return CISIParser._parse_cisi_content(file_path, target_tags=['.T'])

    @staticmethod
    def parse_queries(file_path):
        """Return ``{query_id: text}`` using only the body (.W) field.

        Any .T field present on a query is deliberately ignored here.
        """
        return CISIParser._parse_cisi_content(file_path, target_tags=['.W'])

    @staticmethod
    def parse_rels(file_path):
        """Read relevance judgments into ``{query_id: set(doc_ids)}``.

        Each useful line starts with two integers, ``QID DOCID``; any further
        columns are ignored. Lines with fewer than two columns, or whose
        first two columns are not integers, are skipped.

        Returns:
            collections.defaultdict: query id -> set of relevant doc ids.
        """
        rels = defaultdict(set)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue
                try:
                    qid = int(parts[0])
                    doc_id = int(parts[1])
                except ValueError:
                    continue
                rels[qid].add(doc_id)
        return rels


## Phase B: Retrieval Models - Vector Space

In [4]:
class VectorSpaceModel:
    """TF-IDF vector space ranking over a fixed document collection."""

    def __init__(self, docs):
        """Fit a TF-IDF vectorizer on `docs` and keep the document matrix.

        Args:
            docs: mapping of document id -> document text. Iteration order of
                the mapping defines the row order of the matrix.
        """
        ids, texts = [], []
        for did, text in docs.items():
            ids.append(did)
            texts.append(text)
        self.doc_ids = ids
        self.corpus = texts

        # The notebook's own tokenizer does the splitting and stopword
        # removal, so the vectorizer's built-in stop list and token pattern
        # are switched off.
        self.vectorizer = TfidfVectorizer(
            tokenizer=tokenize,
            stop_words=None,
            token_pattern=None,
        )
        self.doc_vectors = self.vectorizer.fit_transform(self.corpus)

    def retrieve(self, query_text):
        """Rank documents by cosine similarity to `query_text`.

        Returns:
            list[tuple]: ``(doc_id, score)`` pairs in descending score order,
            restricted to documents whose similarity is strictly positive.
        """
        query_vector = self.vectorizer.transform([query_text])
        similarities = cosine_similarity(query_vector, self.doc_vectors).flatten()
        best_first = similarities.argsort()[::-1]
        return [
            (self.doc_ids[pos], similarities[pos])
            for pos in best_first
            if similarities[pos] > 0
        ]

## Phase B: Retrieval Models - Language Model

In [5]:
class DirichletLM:
    """Query-likelihood ranking with Dirichlet-smoothed document models.

    A document's score is the sum, over the query tokens found in the
    collection, of ``log((tf + mu * P(w|C)) / (doc_len + mu))``. Here ``tf``
    is the token's count in the document and ``P(w|C)`` is its relative
    frequency in the whole collection.
    """

    def __init__(self, docs, mu=100):
        """Build the term -> {doc_id: count} index and length statistics.

        Args:
            docs: mapping of document id -> document text.
            mu: Dirichlet smoothing parameter.
        """
        self.mu = mu
        self.index = defaultdict(Counter)   # token -> Counter({doc_id: tf})
        self.doc_lengths = {}               # doc_id -> number of tokens
        self.total_corpus_terms = 0
        self.doc_ids = list(docs.keys())

        print(f"Building LM Index for {len(docs)} docs...")
        for doc_id, text in docs.items():
            tokens = tokenize(text)
            length = len(tokens)
            self.doc_lengths[doc_id] = length
            self.total_corpus_terms += length
            for t in tokens:
                self.index[t][doc_id] += 1

    def retrieve(self, query_text):
        """Score every document against `query_text`.

        Query tokens that never occur in the collection are ignored. Repeated
        query tokens contribute once per occurrence. Documents with no tokens
        receive ``-inf``.

        Returns:
            list[tuple]: ``(doc_id, log_likelihood)`` for every document,
            sorted by descending score. Ties keep collection order.
        """
        query_tokens = tokenize(query_text)

        # Per distinct query token, compute mu * P(w|C) once. The value does
        # not depend on the document, so it is hoisted out of the scoring loop.
        smoothing_mass = {}
        for token in query_tokens:
            if token in smoothing_mass or token not in self.index:
                continue
            corpus_tf = sum(self.index[token].values())
            p_C = corpus_tf / self.total_corpus_terms
            smoothing_mass[token] = self.mu * p_C

        # One (postings, smoothing mass) pair per scorable query occurrence,
        # in query order so the floating-point summation order is unchanged.
        scored_terms = [
            (self.index[token], smoothing_mass[token])
            for token in query_tokens
            if token in smoothing_mass
        ]

        scores = []
        for doc_id in self.doc_ids:
            doc_len = self.doc_lengths.get(doc_id, 0)
            if doc_len == 0:
                scores.append((doc_id, -float('inf')))
                continue

            denominator = doc_len + self.mu
            score = 0.0
            for postings, prior_mass in scored_terms:
                numerator = postings.get(doc_id, 0) + prior_mass
                score += np.log(numerator / denominator)
            scores.append((doc_id, score))

        return sorted(scores, key=lambda x: x[1], reverse=True)

## Phase C: Evaluation Metrics

In [6]:
# Per-query average precision; the caller averages these into MAP.
def calculate_map(retrieved, relevant):
    """Average precision of one ranked list.

    Args:
        retrieved: ranked list of ``(doc_id, score)`` pairs.
        relevant: collection of relevant doc ids for the query.

    Returns:
        float: sum of precision values at the rank of each relevant hit,
        divided by the number of relevant documents; 0.0 when `relevant`
        is empty.
    """
    if not relevant:
        return 0.0

    precision_sum = 0.0
    found = 0.0
    rank = 0
    for doc_id, _ in retrieved:
        rank += 1
        if doc_id not in relevant:
            continue
        found += 1.0
        precision_sum += found / rank
    return precision_sum / len(relevant)

# Precision at cutoff 10.
def calculate_p10(retrieved, relevant):
    """Fraction of the first ten retrieved documents that are relevant.

    The denominator is always 10, even when fewer than ten documents were
    retrieved. Returns 0.0 when `relevant` is empty.
    """
    if not relevant:
        return 0.0

    hit_count = 0
    for doc_id, _ in retrieved[:10]:
        if doc_id in relevant:
            hit_count += 1
    return hit_count / 10.0

## Phase D: Main Execution & Analysis

In [None]:
# Execution
def _ensure_parent_dir(path):
    """Create the directory that will hold `path` if it does not exist yet."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _shorten_query(query_text, max_len=48):
    """Return the query on one line, cut to `max_len` characters with '...'."""
    q_text_san = query_text.strip().replace('\n', ' ')
    if len(q_text_san) > max_len:
        return q_text_san[:max_len-3] + "..."
    return q_text_san


def _write_result_rows(out_file, model_label, qid, query_text, results, titles, top_k=10):
    """Write one table row per document in the top `top_k` of `results`."""
    q_display = _shorten_query(query_text)
    for doc_id, _score in results[:top_k]:
        title = titles.get(doc_id, "Unknown Title").strip().replace('\n', ' ')
        out_file.write(f"{model_label:<10} | {qid:<10} | {doc_id:<10} | {title[:58]:<60} | {q_display:<50}\n")


def _write_sample(sample_file, model_label, query_text, results, docs, footer):
    """Write the top-ranked document of `results` in full; no-op if empty."""
    if not results:
        return
    doc_id = results[0][0]
    content = docs.get(doc_id, "Content not found.").strip()
    sample_file.write(f"Model: {model_label}\n")
    sample_file.write(f"query: {query_text.strip()}\n")
    sample_file.write(f"Retrieved Document ID: {doc_id}\n")
    sample_file.write(f"Content: {content}\n")
    sample_file.write(footer)


def _mean_score(scores):
    """Mean of `scores`; NaN (without a NumPy warning) when there are none."""
    return np.mean(scores) if scores else float('nan')


if __name__ == "__main__":
    print("Parsing CISI Dataset...")
    docs = CISIParser.parse_docs(FILES['docs'])

    titles = CISIParser.parse_titles(FILES['docs'])
    queries = CISIParser.parse_queries(FILES['queries'])
    rels = CISIParser.parse_rels(FILES['rels'])

    print(f"Loaded {len(docs)} documents, {len(queries)} queries, {len(rels)} relevance sets.")

    print("\nInitializing Models...")
    vsm = VectorSpaceModel(docs)
    lm = DirichletLM(docs, mu=4000)

    print("\nRunning Retrieval...")
    vsm_map_scores, vsm_p10_scores = [], []
    lm_map_scores, lm_p10_scores = [], []

    # Only queries that have relevance judgments can be evaluated.
    active_queries = sorted([qid for qid in queries if qid in rels])

    results_header = f"{'Model':<10} | {'Query ID':<10} | {'Doc ID':<10} | {'Title':<60} | {'Query':<50}\n"
    separator = "-" * 150 + "\n"

    # open(..., 'w') does not create missing directories, so make sure the
    # output folders exist before writing.
    _ensure_parent_dir(RESULT_FILE)
    _ensure_parent_dir(SAMPLE_RESULT_FILE)

    with open(RESULT_FILE, 'w', encoding='utf-8') as f, open(SAMPLE_RESULT_FILE, 'w', encoding='utf-8') as sample_f:
        f.write(results_header)
        f.write(separator)

        for i, qid in enumerate(active_queries):
            query_text = queries[qid]
            relevant = rels[qid]

            # --- VSM ---
            results_vsm = vsm.retrieve(query_text)
            vsm_map_scores.append(calculate_map(results_vsm, relevant))
            vsm_p10_scores.append(calculate_p10(results_vsm, relevant))
            _write_result_rows(f, 'VSM', qid, query_text, results_vsm, titles)

            # --- LM ---
            results_lm = lm.retrieve(query_text)
            lm_map_scores.append(calculate_map(results_lm, relevant))
            lm_p10_scores.append(calculate_p10(results_lm, relevant))
            _write_result_rows(f, 'LM', qid, query_text, results_lm, titles)

            # Full-text samples for the first 5 queries, for manual inspection.
            if i < 5:
                _write_sample(sample_f, "VSM", query_text, results_vsm, docs, "-" * 20 + "\n")
                _write_sample(sample_f, "The Language Model", query_text, results_lm, docs, "=" * 50 + "\n\n")

    print("\n" + "="*40)
    print("       EVALUATION RESULTS        ")
    print("="*40)
    print(f"{'Model':<20} | {'MAP':<10} | {'P@10':<10}")
    print("-" * 46)
    print(f"{'TF-IDF ':<20} | {_mean_score(vsm_map_scores):.4f}     | {_mean_score(vsm_p10_scores):.4f}")
    print(f"{'Dirichlet LM ':<20} | {_mean_score(lm_map_scores):.4f}     | {_mean_score(lm_p10_scores):.4f}")
    print("="*40)
    print(f"Detailed results saved to {RESULT_FILE}")
    print(f"Sample detailed output saved to {SAMPLE_RESULT_FILE}")

Parsing CISI Dataset...
Loaded 1460 documents, 112 queries, 76 relevance sets.

Initializing Models...
Building LM Index for 1460 docs...

Running Retrieval...

       EVALUATION RESULTS        
Model                | MAP        | P@10      
----------------------------------------------
TF-IDF               | 0.1908     | 0.3013
Dirichlet LM         | 0.1900     | 0.2711
Detailed results saved to csi_result/cisi_result.txt
Sample detailed output saved to csi_result/cisi_sample_result.txt
