In [None]:
import re
import pandas as pd
import numpy as np
import math

# ============================
# Improve Pandas display
# ============================
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

# ============================
# 1. Read positional_index.txt
# ============================
# Layout handled below:
#   "< antony"        -> term header: starts a new, empty posting dict
#   "1.txt: 5 ;"      -> posting line: document name, then a comma-separated
#                        list of integer positions, optionally ended by ";"
# Result: terms[term][doc] = [position, position, ...]

positional_index_file = "positional_index.txt"

terms = {}
current_term = None

# Explicit encoding so the parse does not depend on the platform default.
with open(positional_index_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Detect term line "< antony"
        if line.startswith("< ") and len(line.split()) == 2:
            current_term = line[2:].strip()
            terms[current_term] = {}
        # Detect posting lines "1.txt: 5 ;"
        elif ":" in line:
            if current_term is None:
                # A posting with no preceding term header has nowhere to go;
                # skip it instead of failing with KeyError on terms[None].
                continue
            # Split on the LAST colon: the position list never contains one,
            # while a document name might.
            doc, _, positions = line.rpartition(":")
            # Drop the ";" terminator and ignore empty fragments (empty list,
            # trailing comma) so int() never sees an empty string.
            pos_list = [
                int(p) for p in positions.replace(";", "").split(",") if p.strip()
            ]
            if pos_list:
                # Only record documents that really have positions; an empty
                # entry would otherwise inflate len(posting) for this term.
                terms[current_term][doc.strip()] = pos_list

# ======================================
# 2. Compute TF Table
# ======================================

# File names 1.txt .. 10.txt, and the short column label (d1 .. d10) for each.
documents = [f"{n}.txt" for n in range(1, 11)]
renamed_docs = {name: f"d{n}" for n, name in enumerate(documents, start=1)}

# One record per term: the term itself plus, for every document, the number
# of positions stored for it (0 when the document is not in the posting).
tf_rows = [
    {
        "Term": term,
        **{renamed_docs[doc]: len(posting.get(doc, [])) for doc in documents},
    }
    for term, posting in terms.items()
]

tf_df = pd.DataFrame(tf_rows).set_index("Term")

print("\n==================== TERM FREQUENCY (TF) TABLE ====================\n")
print(tf_df)




           d1  d2  d3  d4  d5  d6  d7  d8  d9  d10
Term                                              
antony      1   1   0   0   0   1   0   0   0    0
brutus      1   1   0   1   0   0   0   0   0    0
caeser      1   1   0   1   1   1   0   0   0    0
calpurnia   0   1   0   0   0   0   0   0   0    0
cleopatra   1   0   0   0   0   0   0   0   0    0
mercy       1   0   1   1   1   1   0   0   0    0
worse       1   0   1   1   1   0   0   0   0    0
angel       0   0   0   0   0   0   1   1   1    0
fool        0   0   0   0   0   0   1   1   1    1
fear        0   0   0   0   0   0   1   1   0    1
in          0   0   0   0   0   0   1   1   1    1
rush        0   0   0   0   0   0   1   1   1    1
to          0   0   0   0   0   0   1   1   1    1
tread       0   0   0   0   0   0   1   1   1    1
where       0   0   0   0   0   0   1   1   1    1


In [26]:
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

# ======================================
# 2. Compute Binary TF Table
# ======================================
# Binary TF is 1 when a term has at least one recorded position in a
# document and 0 otherwise. This is NOT the log-weighted TF (1 + log10(tf));
# the two only coincide when every raw TF is 0 or 1, so the printed banner
# names what is actually computed here.

# Prepare document columns d1 to d10
documents = [f"{i}.txt" for i in range(1, 11)]
renamed_docs = {f"{i}.txt": f"d{i}" for i in range(1, 11)}

binary_tf_rows = []

for term, posting in terms.items():
    row = {"Term": term}
    for doc in documents:
        # A missing document or an empty position list both count as absent.
        row[renamed_docs[doc]] = 1 if len(posting.get(doc, [])) > 0 else 0
    binary_tf_rows.append(row)

# Create DataFrame (skipped when the index held no terms, because
# set_index("Term") would fail on a frame without that column).
if binary_tf_rows:
    binary_tf_df = pd.DataFrame(binary_tf_rows).set_index("Term")

    # Ensure columns d1..d10 are in the correct sorted order
    sorted_cols = [f"d{i}" for i in range(1, 11)]
    binary_tf_df = binary_tf_df[sorted_cols]

    print("\n==================== BINARY TF TABLE ====================\n")
    print(binary_tf_df)
else:
    print("No data found to process.")



           d1  d2  d3  d4  d5  d6  d7  d8  d9  d10
Term                                              
antony      1   1   0   0   0   1   0   0   0    0
brutus      1   1   0   1   0   0   0   0   0    0
caeser      1   1   0   1   1   1   0   0   0    0
calpurnia   0   1   0   0   0   0   0   0   0    0
cleopatra   1   0   0   0   0   0   0   0   0    0
mercy       1   0   1   1   1   1   0   0   0    0
worse       1   0   1   1   1   0   0   0   0    0
angel       0   0   0   0   0   0   1   1   1    0
fool        0   0   0   0   0   0   1   1   1    1
fear        0   0   0   0   0   0   1   1   0    1
in          0   0   0   0   0   0   1   1   1    1
rush        0   0   0   0   0   0   1   1   1    1
to          0   0   0   0   0   0   1   1   1    1
tread       0   0   0   0   0   0   1   1   1    1
where       0   0   0   0   0   0   1   1   1    1


In [22]:
# ======================================
# 3. Compute DF + IDF
# ======================================
# DF(t)  = number of documents whose posting list contains t
# IDF(t) = log10(N / DF(t)), stored rounded to 6 decimals

N = 10  # number of documents
df_dict = {}
idf_dict = {}

for term, posting in terms.items():
    doc_freq = len(posting)    # DF = number of docs containing the term
    # A term header with no posting lines gives DF = 0; use IDF = 0 for it
    # (same guard as the document-length and normalisation cells) instead of
    # dividing by zero.
    idf = math.log10(N / doc_freq) if doc_freq > 0 else 0.0
    df_dict[term] = doc_freq
    idf_dict[term] = round(idf, 6)

# Build combined table: one row per term, columns DF and IDF
df_idf_df = pd.DataFrame({
    "DF": df_dict,
    "IDF": idf_dict
})

pd.set_option("display.max_rows", None)
pd.set_option("display.width", 200)

print("\n========================== DF & IDF TABLE ==============================\n")
print(df_idf_df)




           DF       IDF
antony      3  0.522879
brutus      3  0.522879
caeser      5  0.301030
calpurnia   1  1.000000
cleopatra   1  1.000000
mercy       5  0.301030
worse       4  0.397940
angel       3  0.522879
fool        4  0.397940
fear        3  0.522879
in          4  0.397940
rush        4  0.397940
to          4  0.397940
tread       4  0.397940
where       4  0.397940


In [29]:
# ======================================
# 4. Compute TF × IDF Matrix
# ======================================
# Requires `tf_df` (raw TF table) and `df_idf_df` (DF & IDF table) from the
# earlier cells.

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 200)

# Scale each term's TF row by that term's IDF.
# The IDF column lives in `df_idf_df`; the previous code looked it up on an
# undefined name (`idf_df`) and failed with NameError on a fresh kernel.
# Selecting with `.loc[tf_df.index, ...]` keeps the IDF values in the same
# term order as the TF rows and raises KeyError if a term has no IDF entry.
idf_by_term = df_idf_df.loc[tf_df.index, "IDF"]
tfidf_df = tf_df.astype(float).mul(idf_by_term, axis=0)

# print
print("\n==================== TF × IDF MATRIX ====================\n")

# Header row: 12-character term column, then 10 characters per document
header = "Term".ljust(12) + "".join(col.ljust(10) for col in tfidf_df.columns)
print(header)

# Table rows: every weight shown with 6 decimals, left-aligned in its column
for term, weights in tfidf_df.iterrows():
    print(term.ljust(12) + "".join(f"{w:.6f}".ljust(10) for w in weights))




Term        d1        d2        d3        d4        d5        d6        d7        d8        d9        d10       
antony      0.522879  0.522879  0.000000  0.000000  0.000000  0.522879  0.000000  0.000000  0.000000  0.000000  
brutus      0.522879  0.522879  0.000000  0.522879  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
caeser      0.301030  0.301030  0.000000  0.301030  0.301030  0.301030  0.000000  0.000000  0.000000  0.000000  
calpurnia   0.000000  1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
cleopatra   1.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
mercy       0.301030  0.000000  0.301030  0.301030  0.301030  0.301030  0.000000  0.000000  0.000000  0.000000  
worse       0.397940  0.000000  0.397940  0.397940  0.397940  0.000000  0.000000  0.000000  0.000000  0.000000  
angel       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.522879  0.522879  0.

In [28]:
# File names and their d1..d10 labels; N is the collection size used for IDF.
documents = [f"{n}.txt" for n in range(1, 11)]
renamed_docs = {name: f"d{n}" for n, name in enumerate(documents, start=1)}
N = len(documents)

# Running sum of squared TF-IDF weights, one accumulator per document label.
doc_sq_sum = dict.fromkeys(renamed_docs.values(), 0.0)

for term, posting in terms.items():
    # IDF of the term; a term found in no document contributes weight 0.
    df = len(posting)
    idf = math.log10(N / df) if df > 0 else 0

    for doc_filename, doc_label in renamed_docs.items():
        raw_tf = len(posting.get(doc_filename, []))
        if raw_tf == 0:
            # Term absent from this document: nothing to add.
            continue

        # Log-weighted TF times IDF, squared and added to the document total.
        w_tf = 1 + math.log10(raw_tf)
        doc_sq_sum[doc_label] += (w_tf * idf) ** 2

# ============================
# 3. Final Calculation and Output
# ============================

print("\n==================== DOCUMENT LENGTHS ====================\n")

# Euclidean length = square root of the accumulated sum of squares.
for doc_label, squared_total in doc_sq_sum.items():
    length = math.sqrt(squared_total)
    print(f"{doc_label} length {length:.6f}")



d1 length 1.373462
d2 length 1.279618
d3 length 0.498974
d4 length 0.782941
d5 length 0.582747
d6 length 0.674270
d7 length 1.223496
d8 length 1.223496
d9 length 1.106137
d10 length 1.106137


In [25]:
# ============================
# 2. Compute TF-IDF Matrix (Un-normalized)
# ============================

documents = [f"{n}.txt" for n in range(1, 11)]
renamed_docs = {name: f"d{n}" for n, name in enumerate(documents, start=1)}
N = len(documents)

all_terms = list(terms.keys())
tfidf_data = {}

for term in all_terms:
    posting = terms[term]
    df = len(posting)
    # IDF is 0 for a term that appears in no document.
    idf = 0 if df <= 0 else math.log10(N / df)

    weights = {}
    for doc_file, doc_name in renamed_docs.items():
        freq = len(posting.get(doc_file, []))
        # (1 + log10(tf)) * idf when the term occurs, 0.0 otherwise.
        weights[doc_name] = (1 + math.log10(freq)) * idf if freq > 0 else 0.0
    tfidf_data[term] = weights

# Outer keys are terms, so transpose to get terms as rows, documents as columns.
df_tfidf = pd.DataFrame(tfidf_data).T
# Order the columns numerically (d1, d2, ..., d10) rather than lexically.
column_order = sorted(df_tfidf.columns, key=lambda label: int(label[1:]))
df_tfidf = df_tfidf[column_order]

# ============================
# 3. Normalize (L2 Norm)
# ============================

# Euclidean length of every document column: sqrt(sum(x^2)).
doc_lengths = np.sqrt(df_tfidf.pow(2).sum(axis=0))

# Divide each column by its own length; fillna(0) replaces the NaN produced
# when a column's length is zero.
df_normalized = df_tfidf.div(doc_lengths, axis=1).fillna(0)

print("\n==================== NORMALIZED TF-IDF MATRIX ====================\n")
print(df_normalized)



                 d1        d2        d3        d4        d5        d6        d7        d8        d9       d10
antony     0.380701  0.408621  0.000000  0.000000  0.000000  0.775474  0.000000  0.000000  0.000000  0.000000
brutus     0.380701  0.408621  0.000000  0.667839  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
caeser     0.219176  0.235250  0.000000  0.384486  0.516570  0.446453  0.000000  0.000000  0.000000  0.000000
calpurnia  0.000000  0.781483  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
cleopatra  0.728087  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
mercy      0.219176  0.000000  0.603298  0.384486  0.516570  0.446453  0.000000  0.000000  0.000000  0.000000
worse      0.289735  0.000000  0.797516  0.508263  0.682869  0.000000  0.000000  0.000000  0.000000  0.000000
angel      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.427365  0.427365  0.472707  0.000000
fool    

# 📘 Information Retrieval: TF — IDF — TF-IDF  
Using Our 10-Document Dataset

---

## 🔹 1. Term Frequency (TF)

**Term Frequency (TF)** measures how many times a word appears in a specific document.

### ✔ How we computed TF  
We used the positional index generated by the Spark app:

Example from `positional_index.txt`:

- < mercy
  - 1.txt: 5 ;
  - 3.txt: 1 ;
  - 4.txt: 3 ;

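This tells us:

- In **1.txt**, the word *mercy* appears **once**, at position **5**  
- In **3.txt**, it appears **once**, at position **1**  
- In **4.txt**, it appears **once**, at position **3**  

The numbers after the colon are word **positions**, not counts: TF is the *number of positions* listed for a document (here, 1 in each).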

### ✔ TF Example Table (From our dataset)

| Term   | 1.txt | 2.txt | 3.txt | 4.txt |
|--------|-------|-------|-------|-------|
| mercy  |   1   |   0   |   1   |   1   |
| caeser |   1   |   1   |   0   |   1   |
| angel  |   0   |   0   |   0   |   0   |

---

## 🔹 2. Inverse Document Frequency (IDF)

**IDF** measures how rare or common a word is across all documents.

$$
\text{IDF}(t) = \log_{10}\left(\frac{N}{df(t)}\right)
$$

Where:  
- **N = 10** documents  
- **df(t)** = number of documents containing the term

### ✔ IDF Example Using Our Dataset

| Term     | df (docs containing it) | IDF value          |
|----------|---------------------------|---------------------|
| cleopatra | 1 | log10(10/1) = **1.00000** |
| mercy      | 5 | log10(10/5) = **0.30103** |
| tread      | 4 | log10(10/4) = **0.39794** |
| angel     | 3 | log10(10/3) = **0.52288** |

### ✔ Meaning  
- **High IDF** → rare, meaningful word  
- **Low IDF** → common, less informative word  

Example:  
- *cleopatra* appears only in **1 document**, so IDF is high → it strongly represents that document  
- *mercy* appears in many documents → lower IDF

---

## 🔹 3. TF-IDF (Term Frequency × Inverse Document Frequency)

TF-IDF shows how important a word is *in a specific document*, considering both:
- How often it appears (TF)
- How rare it is across all documents (IDF)

$$
TF\!-\!IDF(t,d) = TF(t,d) \times IDF(t)
$$

### ✔ Example From Our Dataset

For the term **mercy**:

- TF(mercy, 1.txt) = 1  
- IDF(mercy) = 0.30103  

$$
TF\!-\!IDF = 1 \times 0.30103 = 0.30103
$$

For the term **cleopatra**:

- TF(cleopatra, 1.txt) = 1  
- IDF(cleopatra) = 1.00000  

$$
TF\!-\!IDF = 1 \times 1.0 = 1.0
$$

### ✔ Why TF-IDF is useful?
- Gives **higher weight to rare but important terms**  
- Reduces weight of common words  
- Converts documents into **numeric vectors**  
- Enables similarity calculations, ranking, and search

---

## 🔹 4. Why TF-IDF Matters in IR?

With TF-IDF we can:

✔ Compare queries with documents  
✔ Rank documents by importance  
✔ Identify which documents are most relevant  
✔ Build search engines, retrieval systems, and recommendation models  

Example:  
If the query is:

```
mercy caeser
```


TF-IDF helps us determine:
- Which documents talk about "mercy" a lot  
- Which documents mention "caeser" in a meaningful way  
- Which documents match both → highest similarity score  

---

## ✅ Summary

- **TF** = how many times a term appears in a document  
- **IDF** = how rare the term is across documents  
- **TF-IDF** = importance of a word inside a specific document  
- **Used to rank documents, match queries, and measure similarity**

This process is the core of modern Information Retrieval engines.


