In [2]:
import zipfile
from pathlib import Path
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# -----------------------------
# 1) Load texts from texts.zip
# -----------------------------
# Reads every .txt member of the archive into `docs` (decoded text) and
# `doc_names` (bare file name, without any folder prefix), in sorted order so
# that index i in both lists always refers to the same document.
zip_path = "./assignment_support/texts.zip"   # adjust if needed

docs = []
doc_names = []

with zipfile.ZipFile(zip_path, "r") as z:
    # Keep only real .txt members. A name ending in ".txt" can never be a
    # folder entry (those end in "/"), so no separate directory check is needed.
    # Archives created on macOS may also contain AppleDouble metadata entries
    # ("__MACOSX/..." folders and "._<name>.txt" files): they end in ".txt" but
    # hold binary metadata, and errors="ignore" below would silently turn them
    # into junk documents - so they are filtered out here.
    txt_files = sorted(  # sorted -> stable order
        f
        for f in z.namelist()
        if f.lower().endswith(".txt")
        and not f.startswith("__MACOSX/")
        and not Path(f).name.startswith("._")
    )

    for f in txt_files:
        # Best-effort decoding: bytes that are not valid UTF-8 are dropped
        # rather than raising.
        text = z.read(f).decode("utf-8", errors="ignore")
        docs.append(text)
        doc_names.append(Path(f).name)

print(f"Loaded {len(docs)} documents.")

Loaded 11 documents.


In [10]:
# -----------------------------
# 2) TF-IDF matrix for docs
# -----------------------------
# Vectorizer settings, gathered in one place:
#   lowercase   - convert text to lowercase
#   stop_words  - remove common English words like "the", "and", ...
#   ngram_range - unigrams + bigrams help with phrases like "fake news"
tfidf_settings = dict(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vectorizer on the corpus and vectorize the corpus in the same call.
tfidf_docs = vectorizer.fit_transform(docs)  # shape: (n_docs, vocab_size)

In [15]:
# -----------------------------
# 3) Vectorize the query
# -----------------------------
query = "Speech emphasizing patriotism and media criticism and promises for a better future for America."

# The vectorizer works on a collection of documents, so the single query is
# wrapped in a one-element list. Only transform() is called (no re-fit), so the
# query is expressed in the vocabulary already fitted on `docs`.
query_batch = [query]
tfidf_query = vectorizer.transform(query_batch)  # shape: (1, vocab_size)
tfidf_query

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (1, 17555)>

In [13]:
# -----------------------------
# 4) Cosine similarity
# -----------------------------
# One row (the query) against every document row.
similarity_matrix = cosine_similarity(tfidf_query, tfidf_docs)  # shape: (1, n_docs)

# Collapse the single-row matrix into a flat vector with one score per document.
scores = similarity_matrix.flatten()  # shape: (n_docs,)
scores

array([0.02602709, 0.02624295, 0.06503519, 0.06516257, 0.03754515,
       0.0263123 , 0.06508738, 0.0166582 , 0.05374806, 0.0177566 ,
       0.0315708 ])

In [14]:
# -----------------------------
# 5) Rank documents
# -----------------------------
# argsort returns indices in ascending score order; reversing puts the
# highest-scoring document first.
ranked_idx = np.argsort(scores)[::-1]  # descending

top_k = 10
top_idx = ranked_idx[:top_k]  # slicing tolerates top_k > number of documents

# Size the name column to the longest name actually printed (never narrower
# than the previous fixed 40 characters). With a fixed width, names longer than
# 40 characters pushed their score out of line with the other rows.
name_width = max([40, *(len(doc_names[i]) for i in top_idx)])

print("\nTop results:")
for i in top_idx:
    print(f"{doc_names[i]:{name_width}s}  score={scores[i]:.4f}")


Top results:
Trump Congressional Address.txt           score=0.0652
Trump Inauguration Speech.txt             score=0.0651
Trump CPAC Speech.txt                     score=0.0650
Trump Nomination Speech.txt               score=0.0537
Trump Florida Rally 2-18-17.txt           score=0.0375
Trump Response to Healthcare Bill Failure.txt  score=0.0316
Trump Immigration Speech 8-31-16.txt      score=0.0263
Trump CIA Speech.txt                      score=0.0262
Trump Black History Month Speech.txt      score=0.0260
Trump Police Chiefs Speech.txt            score=0.0178
