# LSI System applied to TIME dataset

In [42]:
from src.sys_config import SystemConfiguration
from src.lsi import LSI

Let's define a function to load the queries from a .QUE file and return them as a list of strings.

In [None]:
def load_time_queries(query_file_path):
    queries = []

    with open(query_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    current_query_text = []
    current_query_id = None

    for line in lines:
        stripped = line.strip()

        if stripped.startswith("*FIND"):
            if current_query_id is not None and current_query_text:
                queries.append(' '.join(current_query_text).strip())
                current_query_text = []
            try:
                current_query_id = int(stripped.replace("*FIND", "").strip())
            except ValueError:
                current_query_id = None
        elif stripped:
            current_query_text.append(stripped)

    if current_query_id is not None and current_query_text:
        queries.append(' '.join(current_query_text).strip())

    return queries



Now let's load the queries from the TIME dataset.

In [33]:
que_path = "data/TIME.QUE"
time_queries = load_time_queries(que_path)

And let's config an LSI systems

In [50]:
config = SystemConfiguration(
    n_components=5,  
    metric="tf-idf"
)

In [51]:
data_path = "data/TIME.ALL"

system = LSI(data_path, config)

Setting up LSI-IR System...
Configuration: 5 components, tf-idf metric
1. Parsing documents...
   Loaded 423 documents
2. Preprocessing documents...
   Loaded 340 custom stopwords from data/TIME.STP
3. Building term-document matrix...
   Matrix shape: (18102, 423)
4. Computing LSI decomposition...
5. System ready!
   Vocabulary size: 18102
   LSI dimensions: 5


In [52]:
for i, query in enumerate(time_queries, 1):
    print(f"\n[{i}] Query: '{query}'")
    print("-" * 40)

    doc_indices, similarities = system.retrieve(query, n_docs=3)

    for j, (doc_idx, sim) in enumerate(zip(doc_indices, similarities)):
        doc_id = int(system.parsed_df.iloc[doc_idx]['I'])
        title = system.document_indexes[doc_idx] if system.document_indexes[doc_idx] else f"Document {doc_id}"
        
        print(f"  {j+1}. [Doc {doc_id}] Similarity: {sim:.4f}")
        print(f"     Title: {title[:70]}...")
       
        if 'W' in system.parsed_df.columns:
            content = system.parsed_df.iloc[doc_idx]['W']
            if content and len(content.strip()) > 0:
                preview = content[:100] + "..." if len(content) > 100 else content
                print(f"     Preview: {content}")
        print()


[1] Query: 'KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP SUPPRESSING THE BUDDHISTS .'
----------------------------------------
  1. [Doc 320] Similarity: 0.9984
     Title: SOUTH VIET NAM 2-12 THE RELIGIOUS CRISIS A DUSK-TO-DAWN CURFEW EMPTIED...
     Preview: THE STREETS OF THE ANCIENT VIETNAMESE CAPITAL OF HUE, 400 MILES NORTH OF SAIGON . RIOT POLICE AND ARMORED PERSONNEL CARRIERS PATROLLED THE DARK AND DESERTED CITY . ROADBLOCKS WERE SET UP ON THE OUTSKIRTS, AND BARBED-WIRE BARRICADES ENCIRCLED THE SACRED TUDAM PAGODA . THESE GOVERNMENT SECURITY MEASURES WERE NOT A PRECAUTION AGAINST AN ATTACK BY COMMUNIST GUERRILLAS ; THEY WERE TAKEN TO QUELL DEMONSTRATIONS BY HUE'S BUDDHIST POPULATION AGAINST THE REGIME OF ROMAN CATHOLIC PRESIDENT NGO DINH DIEM . WHILE ALL THE WORLD'S ATTENTION WAS FOCUSED ON SOUTH VIET NAM'S BITTER STRUGGLE AGAINST THE REDS, THE COUNTRY WAS DIVIDED BY A RELIGIOUS CONFLICT THAT MIGHT IMPERIL THE ENTIRE COURSE OF THE WAR AGAINST THE VIET CONG . MORALIT