In [17]:
import glob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [18]:
# The corpus: article files expected in the working directory, listed in
# the order they will be indexed.
article_files = [
    'article_01_ai_breakthroughs.txt',
    'article_02_climate_crisis.txt',
    'article_03_renewable_energy.txt',
    'article_04_space_exploration.txt',
    'article_05_cybersecurity.txt',
    'article_06_healthcare_innovations.txt',
    'article_07_economic_trends.txt',
    'article_08_emerging_tech.txt',
    'article_09_sports_highlights.txt',
    'article_10_quantum_computing.txt',
]

# Map file name -> full text. A missing file is reported and skipped, so
# `documents` may end up with fewer entries than `article_files`.
documents = {}
for filepath in article_files:
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except FileNotFoundError:
        print(f"NOT FOUND: {filepath}")
    else:
        documents[filepath] = text
        print(f"✓ Loaded: {filepath}")

print(f"\n✅ Total: {len(documents)} documents loaded\n")

# Parallel lists in insertion order: doc_names[i] is the file whose text
# is doc_texts[i].
doc_names = [name for name in documents]
doc_texts = [documents[name] for name in doc_names]

✓ Loaded: article_01_ai_breakthroughs.txt
✓ Loaded: article_02_climate_crisis.txt
✓ Loaded: article_03_renewable_energy.txt
✓ Loaded: article_04_space_exploration.txt
✓ Loaded: article_05_cybersecurity.txt
✓ Loaded: article_06_healthcare_innovations.txt
✓ Loaded: article_07_economic_trends.txt
✓ Loaded: article_08_emerging_tech.txt
✓ Loaded: article_09_sports_highlights.txt
✓ Loaded: article_10_quantum_computing.txt

✅ Total: 10 documents loaded



In [19]:
print("Loading queries...")

# queries     -> the raw "Query N: ..." lines exactly as read from the file
# query_texts -> only the text after the first colon (what gets vectorized)
queries = []
query_texts = []

with open('queries.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Only non-empty lines that mention "Query" are candidates.
        if not line or 'Query' not in line:
            continue

        # partition() never raises: a line without ':' yields an empty
        # separator, whereas split(':', 1)[1] would raise IndexError.
        label, separator, remainder = line.partition(':')
        if not separator:
            print(f"SKIPPED (no ':' separator): {line}")
            continue

        query_text = remainder.strip()
        queries.append(line)
        query_texts.append(query_text)
        print(f"✓ {line}")

print(f"\nTotal: {len(query_texts)} queries loaded\n")

Loading queries...
✓ Query 1: artificial intelligence machine learning neural networks
✓ Query 2: climate change global warming carbon emissions
✓ Query 3: renewable energy solar wind power generation
✓ Query 4: space exploration Mars missions astronomy discoveries
✓ Query 5: cybersecurity data breach hacking threats
✓ Query 6: healthcare medical innovations biotechnology
✓ Query 7: quantum computing technology advances
✓ Query 8: sports championships basketball football achievements
✓ Query 9: economic trends technology investment
✓ Query 10: emerging technologies innovation scientific breakthroughs

Total: 10 queries loaded



In [20]:
print("Computing TF-IDF vectors...")

# Vectorizer settings, kept together so they are easy to tune:
# English stop words removed, text lower-cased, unigrams and bigrams
# extracted (ngram_range), with max_df / min_df as document-frequency bounds.
tfidf_options = {
    'stop_words': 'english',
    'lowercase': True,
    'max_df': 0.9,
    'min_df': 1,
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary and IDF weights on the documents only; queries are
# then projected into that same term space with transform().
doc_tfidf = vectorizer.fit_transform(doc_texts)
query_tfidf = vectorizer.transform(query_texts)

vocabulary = vectorizer.get_feature_names_out()
idf_scores = vectorizer.idf_

n_docs, n_doc_terms = doc_tfidf.shape
n_queries, n_query_terms = query_tfidf.shape

print(f"\nTF-IDF Computation Complete!")
print(f"Vocabulary size: {len(vocabulary)} unique terms")
print(f"Document matrix: {n_docs} docs × {n_doc_terms} terms")
print(f"Query matrix: {n_queries} queries × {n_query_terms} terms\n")

Computing TF-IDF vectors...

TF-IDF Computation Complete!
Vocabulary size: 4957 unique terms
Document matrix: 10 docs × 4957 terms
Query matrix: 10 queries × 4957 terms



In [21]:
print("TOP 10 DISTINCTIVE TERMS (Highest IDF Scores)")

# Sort term indices by ascending IDF, flip to descending, keep the first ten.
# NOTE: many terms can share the maximum IDF (the captured output shows ten
# terms all at the same value), so which ten are listed depends on how
# argsort happens to order the ties.
top_indices = np.flip(np.argsort(idf_scores))[:10]

for position in top_indices:
    term, weight = vocabulary[position], idf_scores[position]
    print(f"   {term:30s} - IDF: {weight:.4f}")

print()

TOP 10 DISTINCTIVE TERMS (Highest IDF Scores)
   100                            - IDF: 2.7047
   álvarez allegiant              - IDF: 2.7047
   álvarez                        - IDF: 2.7047
   zero trust                     - IDF: 2.7047
   zero                           - IDF: 2.7047
   young talent                   - IDF: 2.7047
   young                          - IDF: 2.7047
   yoshua bengio                  - IDF: 2.7047
   yoshua                         - IDF: 2.7047
   years years                    - IDF: 2.7047



In [22]:
print("Computing Cosine Similarity...")

# One row per query, one column per document: entry [q, d] is the cosine
# similarity between query q and document d in TF-IDF space.
similarity_matrix = cosine_similarity(query_tfidf, doc_tfidf)

n_queries, n_docs = similarity_matrix.shape
print(f"Similarity matrix: {n_queries} queries × {n_docs} documents")
print(f"   Each cell = similarity score (0 to 1)\n")

Computing Cosine Similarity...
Similarity matrix: 10 queries × 10 documents
   Each cell = similarity score (0 to 1)



In [31]:
print("RANKING DOCUMENTS")

# rankings maps "Query N" -> {'text': the query string,
#                             'results': top-5 (doc_name, score) pairs, best first}
rankings = {}

for query_number, query_text in enumerate(query_texts, start=1):
    # Row of the similarity matrix that belongs to this query.
    scores = similarity_matrix[query_number - 1]

    # Pair every document with its score and order by score, highest first.
    # sorted() is stable, so tied documents keep their corpus order.
    ranked = sorted(zip(doc_names, scores), key=lambda pair: pair[1], reverse=True)
    top_hits = ranked[:5]

    label = f"Query {query_number}"
    rankings[label] = {'text': query_text, 'results': top_hits}

    print(f"{label}: {query_text}")
    print("-" * 80)
    for rank, (doc_name, score) in enumerate(top_hits, 1):
        print(f"  Rank {rank}: {doc_name:45s} Score: {score:.4f}")
    print()

print(f"Ranked top 5 documents for {len(query_texts)} queries\n")

RANKING DOCUMENTS
Query 1: artificial intelligence machine learning neural networks
--------------------------------------------------------------------------------
  Rank 1: article_01_ai_breakthroughs.txt               Score: 0.1481
  Rank 2: article_07_economic_trends.txt                Score: 0.1130
  Rank 3: article_05_cybersecurity.txt                  Score: 0.0483
  Rank 4: article_06_healthcare_innovations.txt         Score: 0.0313
  Rank 5: article_10_quantum_computing.txt              Score: 0.0290

Query 2: climate change global warming carbon emissions
--------------------------------------------------------------------------------
  Rank 1: article_02_climate_crisis.txt                 Score: 0.2100
  Rank 2: article_03_renewable_energy.txt               Score: 0.0338
  Rank 3: article_06_healthcare_innovations.txt         Score: 0.0302
  Rank 4: article_08_emerging_tech.txt                  Score: 0.0267
  Rank 5: article_07_economic_trends.txt                Score: 0.0244

In [24]:
print("DETAILED RESULTS")

# Relevance bands, checked from strictest to loosest. A score must be
# strictly greater than the cut-off to earn the label; anything that
# clears none of them is reported as "Low Relevance".
relevance_bands = [
    (0.3, "Highly Relevant"),
    (0.15, "Moderately Relevant"),
    (0.05, "Somewhat Relevant"),
]
banner = "=" * 80

for query_id, data in rankings.items():
    print(banner)
    print(f"{query_id}: {data['text']}")
    print(banner + "\n")

    for rank, (doc_name, score) in enumerate(data['results'], 1):
        # First band whose cut-off the score exceeds, else the fallback label.
        relevance = next(
            (label for cutoff, label in relevance_bands if score > cutoff),
            "Low Relevance",
        )

        print(f"   Rank {rank}: {doc_name}")
        print(f"   Similarity: {score:.4f}")
        print(f"   {relevance}\n")

    print()


DETAILED RESULTS
Query 1: artificial intelligence machine learning neural networks

   Rank 1: article_01_ai_breakthroughs.txt
   Similarity: 0.1481
   Somewhat Relevant

   Rank 2: article_07_economic_trends.txt
   Similarity: 0.1130
   Somewhat Relevant

   Rank 3: article_05_cybersecurity.txt
   Similarity: 0.0483
   Low Relevance

   Rank 4: article_06_healthcare_innovations.txt
   Similarity: 0.0313
   Low Relevance

   Rank 5: article_10_quantum_computing.txt
   Similarity: 0.0290
   Low Relevance


Query 2: climate change global warming carbon emissions

   Rank 1: article_02_climate_crisis.txt
   Similarity: 0.2100
   Moderately Relevant

   Rank 2: article_03_renewable_energy.txt
   Similarity: 0.0338
   Low Relevance

   Rank 3: article_06_healthcare_innovations.txt
   Similarity: 0.0302
   Low Relevance

   Rank 4: article_08_emerging_tech.txt
   Similarity: 0.0267
   Low Relevance

   Rank 5: article_07_economic_trends.txt
   Similarity: 0.0244
   Low Relevance


Query 3: renewable energy solar wind power generation

In [25]:
print("STATISTICAL SUMMARY")

# Flatten the scores of every ranked result (top hits of every query)
# into a single list; the document names are not needed here.
all_scores = [
    score
    for data in rankings.values()
    for _, score in data['results']
]

print(f"Statistics:")
if len(all_scores) > 0:
    print(f"   Average similarity: {np.mean(all_scores):.4f}")
    print(f"   Maximum similarity: {np.max(all_scores):.4f}")
    print(f"   Minimum similarity: {np.min(all_scores):.4f}")
    print(f"   Standard deviation: {np.std(all_scores):.4f}\n")
else:
    # np.max / np.min raise ValueError on an empty sequence (and np.mean
    # yields nan with a warning), so report the situation explicitly.
    print("   No similarity scores available (no ranked results)\n")

STATISTICAL SUMMARY
Statistics:
   Average similarity: 0.0721
   Maximum similarity: 0.4405
   Minimum similarity: 0.0000
   Standard deviation: 0.1006



In [30]:
print("GENERATING REPORT")

# Take the timestamp once so the file name and the "Generated:" header
# always agree (two separate now() calls can straddle a second boundary).
generated_at = datetime.now()
report_file = f"IR_Analysis_Report_{generated_at.strftime('%Y%m%d_%H%M%S')}.txt"

# Write the plain-text report. It relies on results computed by earlier
# cells: doc_names, query_texts, vocabulary, idf_scores, top_indices,
# rankings and all_scores.
with open(report_file, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("INFORMATION RETRIEVAL SYSTEM - ANALYSIS REPORT\n")
    f.write("="*80 + "\n")
    f.write(f"Generated: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    f.write("1. SYSTEM OVERVIEW\n")
    f.write("-" * 80 + "\n")
    f.write(f"Documents: {len(doc_names)}\n")
    f.write(f"Queries: {len(query_texts)}\n")
    f.write(f"Vocabulary: {len(vocabulary)} unique terms\n\n")

    f.write("2. DOCUMENTS\n")
    f.write("-" * 80 + "\n")
    for i, doc in enumerate(doc_names, 1):
        f.write(f"{i}. {doc}\n")
    f.write("\n")

    f.write("3. QUERIES\n")
    f.write("-" * 80 + "\n")
    for i, query in enumerate(query_texts, 1):
        f.write(f"Query {i}: {query}\n")
    f.write("\n")

    f.write("4. TOP DISTINCTIVE TERMS\n")
    f.write("-" * 80 + "\n")
    for idx in top_indices:
        f.write(f"{vocabulary[idx]:30s} IDF: {idf_scores[idx]:.4f}\n")
    f.write("\n")

    f.write("5. RANKINGS\n")
    f.write("-" * 80 + "\n\n")

    for query_id, data in rankings.items():
        f.write(f"{query_id}: {data['text']}\n")
        f.write("-" * 80 + "\n")

        for rank, (doc_name, score) in enumerate(data['results'], 1):
            f.write(f"\nRank {rank}: {doc_name}\n")
            f.write(f"Similarity: {score:.6f}\n")

            # Same relevance bands as the on-screen detailed results:
            # strict '>' comparisons, checked from strictest to loosest.
            if score > 0.3:
                f.write("Interpretation: Highly relevant\n")
            elif score > 0.15:
                f.write("Interpretation: Moderately relevant\n")
            elif score > 0.05:
                f.write("Interpretation: Somewhat relevant\n")
            else:
                f.write("Interpretation: Low relevance\n")

        f.write("\n")

    f.write("\n6. STATISTICS\n")
    f.write("-" * 80 + "\n")
    if len(all_scores) > 0:
        f.write(f"Average: {np.mean(all_scores):.4f}\n")
        f.write(f"Maximum: {np.max(all_scores):.4f}\n")
        f.write(f"Minimum: {np.min(all_scores):.4f}\n")
        f.write(f"Std Dev: {np.std(all_scores):.4f}\n")
    else:
        # np.max / np.min raise ValueError on an empty sequence, which
        # would abort mid-write and leave a truncated report on disk.
        f.write("No similarity scores available\n")

print(f"✅ Report saved: {report_file}\n")

print("="*80)
print("REPORT PREVIEW (First 50 lines)")
print("="*80 + "\n")

# Read the report back and echo only its first 50 lines.
with open(report_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

for line in lines[:50]:
    print(line.rstrip())

if len(lines) > 50:
    print(f"\n... (Report continues for {len(lines)-50} more lines)")
    print(f"Full report saved in: {report_file}")

print(f"\nCheck the file: {report_file}")

GENERATING REPORT
✅ Report saved: IR_Analysis_Report_20251119_124731.txt

REPORT PREVIEW (First 50 lines)

INFORMATION RETRIEVAL SYSTEM - ANALYSIS REPORT
Generated: 2025-11-19 12:47:31

1. SYSTEM OVERVIEW
--------------------------------------------------------------------------------
Documents: 10
Queries: 10
Vocabulary: 4957 unique terms

2. DOCUMENTS
--------------------------------------------------------------------------------
1. article_01_ai_breakthroughs.txt
2. article_02_climate_crisis.txt
3. article_03_renewable_energy.txt
4. article_04_space_exploration.txt
5. article_05_cybersecurity.txt
6. article_06_healthcare_innovations.txt
7. article_07_economic_trends.txt
8. article_08_emerging_tech.txt
9. article_09_sports_highlights.txt
10. article_10_quantum_computing.txt

3. QUERIES
--------------------------------------------------------------------------------
Query 1: artificial intelligence machine learning neural networks
Query 2: climate change global warming carbon emissio