# IRWA Project – Part 3  
## Conjunctive ranking (TF-IDF, BM25, custom score) and Word2Vec


In [2]:
import io

import pandas as pd

# File name of the cleaned product dataset. It is used both as the key of the
# Colab upload result and as the local path when running outside Colab.
DATA_FILE = "fashion_products_clean.csv"

try:
    # If you are running in Google Colab, the CSV is provided through the upload dialog.
    from google.colab import files
except ImportError:
    # google.colab is not importable outside Colab; fall back to a local file.
    files = None

if files is not None:
    uploaded = files.upload()  # select your fashion_products_clean.csv here
    # The upload result is indexed by file name and holds the raw bytes of the file.
    df = pd.read_csv(io.BytesIO(uploaded[DATA_FILE]))
else:
    # Expects the CSV to be present in the current working directory.
    df = pd.read_csv(DATA_FILE)

print(df.shape)

display(df.head())

Saving fashion_products_clean.csv to fashion_products_clean.csv
(28080, 26)


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,...,url,clean_title,clean_description,sentence_count,product_details_text,combined_info,discount_percent,title_length,desc_length,discount_value
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999.0,3.9,York,Clothing and Accessories,1612987911000,Yorker trackpants made from 100% rich combed c...,69% off,['https://rukminim1.flixcart.com/image/128/128...,False,...,https://www.flipkart.com/yorker-solid-men-mult...,solid women multicolor track pant,yorker trackpant made 100 rich comb cotton giv...,2,1005COMBO2 Elastic Side Pockets Cotton Blend S...,Clothing and Accessories Bottomwear York Shyam...,69.0,5,21,69.0
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499.0,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,66% off,['https://rukminim1.flixcart.com/image/128/128...,False,...,https://www.flipkart.com/yorker-solid-men-blue...,solid men blue track pant,yorker trackpant made 100 rich comb cotton giv...,2,"1005BLUE Drawstring, Elastic Side Pockets Cott...",Clothing and Accessories Bottomwear York Shyam...,66.0,5,21,66.0
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999.0,3.9,York,Clothing and Accessories,1612987912000,Yorker trackpants made from 100% rich combed c...,68% off,['https://rukminim1.flixcart.com/image/128/128...,False,...,https://www.flipkart.com/yorker-solid-men-mult...,solid men multicolor track pant,yorker trackpant made 100 rich comb cotton giv...,2,1005COMBO4 Elastic Side Pockets Cotton Blend S...,Clothing and Accessories Bottomwear York Shyam...,68.0,5,21,68.0
3,3f3f97bb-5faf-57df-a9ff-1af24e2b1045,2999.0,3.9,York,Clothing and Accessories,1612987913000,Yorker trackpants made from 100% rich combed c...,69% off,['https://rukminim1.flixcart.com/image/128/128...,False,...,https://www.flipkart.com/yorker-solid-men-mult...,solid women multicolor track pant,yorker trackpant made 100 rich comb cotton giv...,2,1005COMBO3 Elastic Side Pockets Cotton Blend S...,Clothing and Accessories Bottomwear York Shyam...,69.0,5,21,69.0
4,750caa3d-6264-53ca-8ce1-94118a1d8951,2999.0,3.9,York,Clothing and Accessories,1612987913000,Yorker trackpants made from 100% rich combed c...,68% off,['https://rukminim1.flixcart.com/image/128/128...,False,...,https://www.flipkart.com/yorker-solid-men-brow...,solid women brown grey track pant,yorker trackpant made 100 rich comb cotton giv...,2,"1005COMBO1 Drawstring, Elastic Side Pockets Co...",Clothing and Accessories Bottomwear York Shyam...,68.0,6,21,68.0


In [22]:
# Inspect the column names available in the loaded product table.
print(df.columns)

Index(['_id', 'actual_price', 'average_rating', 'brand', 'category',
       'crawled_at', 'description', 'discount', 'images', 'out_of_stock',
       'pid', 'product_details', 'seller', 'selling_price', 'sub_category',
       'title', 'url', 'clean_title', 'clean_description', 'sentence_count',
       'product_details_text', 'combined_info', 'discount_percent',
       'title_length', 'desc_length', 'discount_value', 'full_text'],
      dtype='object')


## 1. Text field and inverted index (conjunctive pipeline)

We build a single text field (`full_text`) from `clean_title` and `clean_description`,
tokenize it, and create an inverted index.  
This index is later used to enforce the **AND** condition on all query terms.


In [23]:
# The two preprocessed text columns that the next cell joins into `full_text`.
text_cols = ["clean_title", "clean_description"]
# Preview the first rows of those columns.
df[text_cols].head()

Unnamed: 0,clean_title,clean_description
0,solid women multicolor track pant,yorker trackpant made 100 rich comb cotton giv...
1,solid men blue track pant,yorker trackpant made 100 rich comb cotton giv...
2,solid men multicolor track pant,yorker trackpant made 100 rich comb cotton giv...
3,solid women multicolor track pant,yorker trackpant made 100 rich comb cotton giv...
4,solid women brown grey track pant,yorker trackpant made 100 rich comb cotton giv...


In [24]:
import re
from collections import defaultdict


# Build the single searchable text field: title and description joined by one
# space, with missing values treated as empty strings.
title_part = df["clean_title"].fillna("")
description_part = df["clean_description"].fillna("")
df["full_text"] = title_part + " " + description_part


def tokenize(text):
    """Split `text` into lowercase alphabetic tokens.

    The text is lowercased first; a token is a maximal run of ASCII letters
    delimited by word boundaries on both sides. Runs of letters glued to
    digits or underscores (e.g. "abc123") are therefore not returned.

    Args:
        text: String to tokenize.

    Returns:
        List of tokens in order of appearance (duplicates kept).
    """
    lowered = text.lower()
    word_pattern = r"\b[a-zA-Z]+\b"
    return [match.group(0) for match in re.finditer(word_pattern, lowered)]


# Inverted index: token -> set of row positions (0-based position in
# df["full_text"]) of the documents that contain the token.
inverted_index = defaultdict(set)

for doc_pos, doc_text in enumerate(df["full_text"]):
    # Posting lists are sets, so adding a repeated token is a no-op.
    for term in tokenize(doc_text):
        inverted_index[term].add(doc_pos)

# Vocabulary size (number of distinct tokens).
len(inverted_index)

5704

## 2. TF-IDF ranking with cosine similarity

We apply TF-IDF on `full_text` and use cosine similarity as the scoring function
on top of the **conjunctive candidate set** returned by the inverted index.


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- 1. Vectorize the full collection ---
# The vectorizer reuses the notebook's `tokenize` function so that TF-IDF terms
# are produced by the same tokenizer as the inverted index. `token_pattern=None`
# is passed explicitly because the default pattern is unused when a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a warning.
# Note that English stop words are dropped here but not in the inverted index.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,
    lowercase=True,
    stop_words='english'
)
# Sparse TF-IDF matrix with one row per entry of df["full_text"], in the same order.
tfidf_matrix = vectorizer.fit_transform(df["full_text"])

# --- 2. Conjunctive (AND) candidate selection ---
def and_filter(query):
    """Return the documents that contain every token of `query`.

    The query is tokenized with `tokenize` and each token is looked up in
    `inverted_index`.

    Args:
        query: Free-text query string.

    Returns:
        List of document positions present in the posting set of every query
        token (order unspecified). Empty list if the query yields no tokens or
        if any token is absent from the index.
    """
    terms = tokenize(query)
    if len(terms) == 0:
        return []

    posting_sets = []
    for term in terms:
        if term not in inverted_index:
            # at least one term is missing → no matching docs
            return []
        # Copy so the intersection never touches the stored posting set.
        posting_sets.append(set(inverted_index[term]))

    return list(set.intersection(*posting_sets))

# --- 3. TF-IDF + cosine ranking ---
def search_tfidf(query, top_k=20):
    """Rank the conjunctive candidates of `query` by TF-IDF cosine similarity.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.

    Returns:
        List of (doc_position, similarity) tuples sorted by decreasing
        similarity; empty list when `and_filter` yields no candidates.
    """
    candidates = and_filter(query)
    if not candidates:
        return []

    query_vec = vectorizer.transform([query])
    # Only the candidate rows of the collection matrix are compared with the query.
    similarities = cosine_similarity(query_vec, tfidf_matrix[candidates]).flatten()

    # Negating the scores makes the ascending argsort yield a descending ranking.
    best = np.argsort(-similarities)[:top_k]
    return [(candidates[pos], similarities[pos]) for pos in best]

# Quick test: show the five highest-scoring TF-IDF matches for a sample query.
search_tfidf("women blue cotton tshirt", top_k=5)




[(8766, np.float64(0.7648759668726784)),
 (8727, np.float64(0.5744130509721134)),
 (8756, np.float64(0.5585633427485308)),
 (11092, np.float64(0.5483319343640718)),
 (24894, np.float64(0.5360105523962889))]

## 3. BM25 ranking

We build a BM25 index over the same tokenized documents and use it as an
alternative lexical ranking function, still on top of the conjunctive candidate set.


In [26]:
!pip install rank-bm25

from rank_bm25 import BM25Okapi
import numpy as np

# 1. Tokenize every document with the shared `tokenize` function,
#    keeping the row order of df["full_text"].
tokenized_docs = list(map(tokenize, df["full_text"]))

# 2. Fit the BM25 model on the tokenized collection
bm25 = BM25Okapi(tokenized_docs)

# 3. BM25 search function with AND-filtering
def search_bm25(query, top_k=20):
    """Rank the conjunctive candidates of `query` by BM25 score.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.

    Returns:
        List of (doc_position, bm25_score) tuples sorted by decreasing score;
        empty list when `and_filter` yields no candidates.
    """
    candidates = and_filter(query)
    if not candidates:
        return []

    # The score array is indexed by document position, so the candidate
    # entries are selected from it after scoring.
    all_scores = bm25.get_scores(tokenize(query))
    candidate_scores = all_scores[candidates]

    order = np.argsort(-candidate_scores)[:top_k]
    return [(candidates[pos], candidate_scores[pos]) for pos in order]

# Quick test: show the five highest-scoring BM25 matches for a sample query.
search_bm25("women blue cotton tshirt", top_k=5)



[(8766, np.float64(4.337386312742335)),
 (20001, np.float64(4.186146168069024)),
 (19979, np.float64(4.186146168069024)),
 (11491, np.float64(4.150000984655385)),
 (8756, np.float64(4.117502537402832))]

## 4. Custom ranking function ("Your score")

We combine the BM25 textual score with numerical fields from the dataset:

- `average_rating`
- `discount_percent`
- `out_of_stock`

to produce a single ranking score that reflects both textual relevance and product quality/availability.


In [27]:
import numpy as np

def search_your_score(query, top_k=20,
                      w_text=1.0, w_rating=0.3, w_discount=0.1, w_stock=0.5):
    """Rank conjunctive candidates by BM25 combined with product metadata.

    The final score of a candidate is

        w_text * bm25 + w_rating * (average_rating / 5)
        + w_discount * (discount_percent / 100) - w_stock * out_of_stock

    Missing ratings and discounts count as 0. The BM25 term is used as
    returned by the model (it is not rescaled).

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.
        w_text: Weight of the BM25 score.
        w_rating: Weight of the rating divided by 5.
        w_discount: Weight of the discount percentage divided by 100.
        w_stock: Penalty subtracted when the product is out of stock.

    Returns:
        List of (doc_position, score) tuples sorted by decreasing score;
        empty list when `and_filter` yields no candidates.
    """
    docs = and_filter(query)
    if not docs:
        return []

    # --- 1. Textual score using BM25 ---
    q_tokens = tokenize(query)
    bm25_scores = bm25.get_scores(q_tokens)[docs]

    # --- 2. Numerical features ---
    # Document ids are row positions (the inverted index is built with
    # enumerate), so rows are selected by position rather than by index label.
    candidates = df.iloc[docs]
    ratings = candidates["average_rating"].fillna(0).values
    discounts = candidates["discount_percent"].fillna(0).values
    out_stock = candidates["out_of_stock"].astype(int).values  # 1 if out of stock

    # simple normalizations
    ratings_norm = ratings / 5.0
    discounts_norm = discounts / 100.0

    # --- 3. Final score ---
    score = (
        w_text * bm25_scores +
        w_rating * ratings_norm +
        w_discount * discounts_norm -
        w_stock * out_stock
    )

    ranked_idx = np.argsort(-score)[:top_k]
    return [(docs[i], score[i]) for i in ranked_idx]

# Quick test: show the five highest-scoring custom-score matches for a sample query.
search_your_score("women blue cotton tshirt", top_k=5)



[(8766, np.float64(4.587386312742335)),
 (20001, np.float64(4.464146168069024)),
 (19979, np.float64(4.464146168069024)),
 (11491, np.float64(4.412000984655385)),
 (13848, np.float64(4.4075531456773))]

## 5. Word2Vec / GloVe ranking

We use pre-trained GloVe word embeddings (Word2Vec-style distributional vectors)
to represent each document and each query as the average of their word vectors,
and then rank by cosine similarity (again on the conjunctive candidate set).


In [28]:
!pip install gensim

import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the pre-trained "glove-wiki-gigaword-100" word vectors through gensim's downloader API.
# NOTE(review): presumably downloaded on first use and cached afterwards — confirm for offline runs.
model = api.load("glove-wiki-gigaword-100")

# 2. Text embedding as the mean of the known word vectors
def doc_vector(text):
    """Embed `text` as the average of the vectors of its in-vocabulary tokens.

    Args:
        text: String to embed; it is split with `tokenize`.

    Returns:
        1-D array of length `model.vector_size`. An all-zero vector is
        returned when no token of the text is present in `model`.
    """
    known_vectors = [model[token] for token in tokenize(text) if token in model]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(model.vector_size)

# 3. Precompute embeddings for all documents
# Row i of the stacked array is the embedding of the i-th entry of df["full_text"].
doc_embeddings = np.vstack([doc_vector(t) for t in df["full_text"]])

# 4. Word2Vec-style ranking using cosine similarity
def search_word2vec(query, top_k=20):
    """Rank the conjunctive candidates of `query` by embedding cosine similarity.

    The query is embedded with `doc_vector` and compared with the precomputed
    rows of `doc_embeddings` that belong to the candidates.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to return.

    Returns:
        List of (doc_position, similarity) tuples sorted by decreasing
        similarity; empty list when `and_filter` yields no candidates.
    """
    candidates = and_filter(query)
    if not candidates:
        return []

    # cosine_similarity expects 2-D inputs, hence the single-row reshape.
    query_embedding = doc_vector(query).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, doc_embeddings[candidates]).flatten()

    order = np.argsort(-similarities)[:top_k]
    return [(candidates[pos], similarities[pos]) for pos in order]

# Quick test: show the five highest-scoring embedding matches for a sample query.
search_word2vec("women blue cotton tshirt", top_k=5)




[(13848, np.float32(0.92468244)),
 (11092, np.float32(0.91719115)),
 (19211, np.float32(0.9152438)),
 (19301, np.float32(0.9147768)),
 (19300, np.float32(0.9147768))]

In [29]:
# Evaluation queries used to compare the four ranking methods.
queries = [
    "women blue cotton tshirt",
    "men black jeans slim fit",
    "cotton round neck sweatshirt",
    "women red dress long sleeve",
    "men leather jacket brown",
    "kids white sneakers",
]

# query -> {method name -> list of (doc_position, score)} with the raw top-20
# output of each search function.
results_all = {}

for q in queries:
    per_method = {}
    per_method["tfidf"] = search_tfidf(q, top_k=20)
    per_method["bm25"] = search_bm25(q, top_k=20)
    per_method["your_score"] = search_your_score(q, top_k=20)
    per_method["word2vec"] = search_word2vec(q, top_k=20)
    results_all[q] = per_method

results_all

{'women blue cotton tshirt': {'tfidf': [(8766, np.float64(0.7648759668726784)),
   (8727, np.float64(0.5744130509721134)),
   (8756, np.float64(0.5585633427485308)),
   (11092, np.float64(0.5483319343640718)),
   (24894, np.float64(0.5360105523962889)),
   (24870, np.float64(0.5317719319659437)),
   (24867, np.float64(0.5317719319659437)),
   (24869, np.float64(0.5317719319659437)),
   (11398, np.float64(0.5248722098693234)),
   (13912, np.float64(0.5201885147213349)),
   (24882, np.float64(0.5104654452319364)),
   (24901, np.float64(0.5104654452319364)),
   (24888, np.float64(0.509363846379485)),
   (24740, np.float64(0.509363846379485)),
   (20001, np.float64(0.49181961602225666)),
   (19979, np.float64(0.49181961602225666)),
   (12480, np.float64(0.4809413796899168)),
   (11491, np.float64(0.4716028155359804)),
   (12181, np.float64(0.4647432743551818)),
   (12116, np.float64(0.4635760222766096))],
  'bm25': [(8766, np.float64(4.337386312742335)),
   (20001, np.float64(4.18614616806

## 6. Formatting and comparing rankings

Helper functions to:

- convert document IDs into human-readable results (title + URL + score),
- run all four ranking methods for a given query,
- summarise and compare the rankings.


In [35]:
import pandas as pd

# ============================================
# 1. Function to convert doc_id → readable information
# ============================================

def format_results(results):
    """Convert a list of (doc_id, score) pairs into a readable DataFrame.

    Args:
        results: Iterable of (doc_id, score) tuples as returned by the search
            functions. `doc_id` is the row position of the product in `df`.

    Returns:
        DataFrame with the columns doc_id, title (taken from `clean_title`),
        url and score, one row per result in the given order. The columns are
        present even when `results` is empty, so empty rankings still produce
        a table (and a CSV header) with the same schema.
    """
    columns = ["doc_id", "title", "url", "score"]
    rows = [
        {
            "doc_id": doc_id,
            # Positional lookup: doc ids are row positions, not index labels.
            "title": df["clean_title"].iloc[doc_id],
            "url": df["url"].iloc[doc_id],
            "score": float(score),
        }
        for doc_id, score in results
    ]
    return pd.DataFrame(rows, columns=columns)


# ============================================
# 2. Function to obtain results from all four methods
# ============================================

def get_all_rankings(query, top_k=20):
    """Run the four ranking methods for `query` and format each result list.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results requested from each method.

    Returns:
        Dict with the keys "tfidf", "bm25", "your_score" and "word2vec" (in
        that order), each mapped to the `format_results` table of that method.
    """
    rankers = (
        ("tfidf", search_tfidf),
        ("bm25", search_bm25),
        ("your_score", search_your_score),
        ("word2vec", search_word2vec),
    )
    return {label: format_results(ranker(query, top_k)) for label, ranker in rankers}

def summarize_hits(queries, top_k=20):
    """Count the documents each method returns for every query.

    Useful to see at a glance where the AND condition leaves a ranking empty.

    Args:
        queries: Iterable of query strings.
        top_k: Maximum number of results requested from each method.

    Returns:
        DataFrame with one row per query: the column `query` followed by
        `tfidf_hits`, `bm25_hits`, `your_score_hits` and `word2vec_hits`.
    """
    methods = ("tfidf", "bm25", "your_score", "word2vec")

    def _count_row(query):
        rankings = get_all_rankings(query, top_k=top_k)
        counts = {f"{m}_hits": len(rankings[m]) for m in methods}
        return {"query": query, **counts}

    return pd.DataFrame([_count_row(query) for query in queries])

# ============================================
# 3. Official queries for the assignment
# NOTE(review): this header originally announced 5 queries, but the list holds six — confirm the intended set.
# ============================================

queries = [
    "women blue cotton tshirt",
    "men black jeans slim fit",
    "cotton round neck sweatshirt",
    "women red dress long sleeve",
    "men leather jacket brown",
    "kids white sneakers",
]


# ============================================
# 4. Generate results for ALL queries
# ============================================

# query -> {method name -> formatted top-20 table}
all_results = {}

for q in queries:
    print("Processing:", q)
    all_results[q] = get_all_rankings(q, top_k=20)

print("\nDONE. all_results contains all tables ready for the report.")


# ============================================
# 5. (Optional) Save the results to CSV
# ============================================

# One file per (query, method) pair, named results_<query with underscores>_<method>.csv
for q in queries:
    safe_q = q.replace(" ", "_")
    tables = all_results[q]
    for method in ["tfidf", "bm25", "your_score", "word2vec"]:
        tables[method].to_csv(f"results_{safe_q}_{method}.csv", index=False)

print("\nCSV files saved for documentation.")


Processing: women blue cotton tshirt
Processing: men black jeans slim fit
Processing: cotton round neck sweatshirt
Processing: women red dress long sleeve
Processing: men leather jacket brown
Processing: kids white sneakers

DONE. all_results contains all tables ready for the report.

CSV files saved for documentation.


In [41]:
# Quick overview of the number of results per method and query
summary_df = summarize_hits(queries, top_k=20)
display(summary_df)


Unnamed: 0,query,tfidf_hits,bm25_hits,your_score_hits,word2vec_hits
0,women blue cotton tshirt,20,20,20,20
1,men black jeans slim fit,0,0,0,0
2,cotton round neck sweatshirt,20,20,20,20
3,women red dress long sleeve,0,0,0,0
4,men leather jacket brown,0,0,0,0
5,kids white sneakers,0,0,0,0


### Important note on empty results under conjunctive retrieval

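The assignment requires a *strict conjunctive pipeline*, meaning that a
document is only considered a candidate if it contains **all** the
query terms. The indexed text (`clean_title`, `clean_description`) was
already normalized during preprocessing (lowercasing, removing
punctuation, normalizing forms such as "jeans"→"jean" or
"slim fit"→"slimfit"), whereas the query is only lowercased and split
into alphabetic tokens before the lookup. A query term written in its
surface form (e.g. "jeans" or "sleeve") therefore does not match its
normalized counterpart in the index ("jean", "sleev"), and a single
unmatched term empties the whole AND candidate set. This is why these
queries return empty result sets for all four ranking methods.

This is expected behavior under strict AND filtering with unnormalized
queries and not an error in the ranking functions; applying the same
normalization to the query would be required to retrieve these documents.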


## 7. Per-query comparison and qualitative analysis

Below we print a side-by-side comparison of the top results returned by
TF-IDF, BM25, the custom score, and Word2Vec for each of the six queries
defined above. These tables form the basis of the analysis provided in the
written report (where we discuss differences in ranking behavior,
advantages, and limitations of each method).


In [40]:
def compare_methods_for_query(query, top_k=10, add_metadata=True):
    """Run the four methods for `query` and stack their rankings in one table.

    Methods that return no documents are skipped. The combined table is shown
    with `display` before it is returned.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results requested from each method.
        add_metadata: When True, append the columns average_rating,
            discount_percent and out_of_stock of each ranked product.

    Returns:
        DataFrame with the columns method, rank (1-based within each method),
        doc_id, title, url, score and, optionally, the metadata columns; or
        None (after printing a message) when no method retrieves any document.
    """
    rankings = get_all_rankings(query, top_k=top_k)
    frames = []

    for method, df_res in rankings.items():
        if df_res.empty:
            continue

        tmp = df_res.copy()
        tmp.insert(0, "method", method)
        tmp.insert(1, "rank", range(1, len(tmp) + 1))

        if add_metadata:
            # doc_id holds row positions of `df`, so the metadata is fetched
            # by position rather than by index label.
            meta = df.iloc[tmp["doc_id"].to_numpy()]
            tmp["average_rating"] = meta["average_rating"].values
            tmp["discount_percent"] = meta["discount_percent"].values
            tmp["out_of_stock"] = meta["out_of_stock"].values

        frames.append(tmp)

    if not frames:
        print(f"No documents retrieved for query: '{query}'")
        return None

    comparison_df = pd.concat(frames, ignore_index=True)
    display(comparison_df)
    return comparison_df


# Compare all methods for all queries
# Each iteration prints a banner with the query and then shows the combined
# top-20 table (or a "no documents" message when every ranking is empty).
for q in queries:
    print("\n==============================")
    print("QUERY:", q)
    print("==============================")
    compare_methods_for_query(q, top_k=20)




QUERY: women blue cotton tshirt


Unnamed: 0,method,rank,doc_id,title,url,score,average_rating,discount_percent,out_of_stock
0,tfidf,1,8766,print women round neck blue tshirt,https://www.flipkart.com/mash-unlimited-printe...,0.764876,3.2,58.0,False
1,tfidf,2,8727,print women round neck blue tshirt,https://www.flipkart.com/mash-unlimited-printe...,0.574413,3.2,58.0,False
2,tfidf,3,8756,solid women round neck blue tshirt,https://www.flipkart.com/mash-unlimited-solid-...,0.558563,3.2,58.0,False
3,tfidf,4,11092,solid women polo neck blue tshirt,https://www.flipkart.com/steenbok-solid-men-po...,0.548332,3.7,56.0,False
4,tfidf,5,24894,self design women polo neck blue tshirt,https://www.flipkart.com/athliv-self-design-me...,0.536011,3.8,67.0,False
...,...,...,...,...,...,...,...,...,...
75,word2vec,16,20001,self design women polo neck blue tshirt,https://www.flipkart.com/adam-parker-self-desi...,0.904229,4.1,32.0,False
76,word2vec,17,8766,print women round neck blue tshirt,https://www.flipkart.com/mash-unlimited-printe...,0.896200,3.2,58.0,False
77,word2vec,18,24888,self design women polo neck light blue tshirt,https://www.flipkart.com/athliv-self-design-me...,0.894840,3.3,67.0,False
78,word2vec,19,24740,self design women polo neck light blue tshirt,https://www.flipkart.com/athliv-self-design-me...,0.894840,3.3,67.0,False



QUERY: men black jeans slim fit
No documents retrieved for query: 'men black jeans slim fit'

QUERY: cotton round neck sweatshirt


Unnamed: 0,method,rank,doc_id,title,url,score,average_rating,discount_percent,out_of_stock
0,tfidf,1,23179,full sleev solid women sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.396741,2.3,38.0,False
1,tfidf,2,23195,full sleev solid women sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.396741,2.3,38.0,False
2,tfidf,3,23193,full sleev solid men sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.396728,2.3,38.0,False
3,tfidf,4,23180,full sleev solid men sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.396728,2.3,38.0,False
4,tfidf,5,14783,full sleev graphic print men sweatshirt,https://www.flipkart.com/tee-buddy-full-sleeve...,0.276862,3.8,70.0,False
...,...,...,...,...,...,...,...,...,...
75,word2vec,16,23870,print men round neck multicolor tshirt pack 2,https://www.flipkart.com/free-authority-printe...,0.737104,3.9,45.0,False
76,word2vec,17,23833,print men round neck multicolor tshirt pack 2,https://www.flipkart.com/free-authority-printe...,0.737055,3.9,45.0,False
77,word2vec,18,23199,full sleev solid women sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.736800,2.3,38.0,False
78,word2vec,19,23201,full sleev solid women sweatshirt,https://www.flipkart.com/sketch-vibes-full-sle...,0.736800,2.3,38.0,False



QUERY: women red dress long sleeve
No documents retrieved for query: 'women red dress long sleeve'

QUERY: men leather jacket brown
No documents retrieved for query: 'men leather jacket brown'

QUERY: kids white sneakers
No documents retrieved for query: 'kids white sneakers'
