In [22]:
# Imports
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from collections import defaultdict
from array import array
import time
import numpy as np
import math
import collections
from numpy import linalg as la
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
# Load data
processed_df = pd.read_csv("/content/drive/Shareddrives/IRWA/P2/processed_df.csv")
clean_df = pd.read_csv("/content/drive/Shareddrives/IRWA/P2/clean_df.csv")
validation_df = pd.read_csv("/content/drive/Shareddrives/IRWA/P2/validation_labels.csv")

In [24]:
# Part1 function
def preproces_text(text):
    if not isinstance(text, str):
        return []

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]

    return tokens

In [25]:
# Indexing
def create_index_tfidf(dataframe, clean_df, columns, num_documents):
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    title_index = defaultdict(str)
    idf = defaultdict(float)

    for index_val, row in dataframe.iterrows():
        page_id = row['pid']
        terms = []
        for col in columns:
            val = row[col]
            trobat = re.findall(r"'([^']+)'", val)
            terms.extend(trobat)

        title = row['title']
        title = clean_df.loc[clean_df['pid'] == page_id, 'title'].values[0]
        title_index[page_id] = title

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [page_id, array('I', [position])]

        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        for term, posting in current_page_index.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index

In [26]:
# Create the index
indexing_columns = ['title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller']
index, tf, df, idf, title_index = create_index_tfidf(processed_df, clean_df, indexing_columns, len(processed_df))

In [27]:
# Ranking
def rank_documents(terms, docs, index, idf, tf, title_index):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    return result_docs, doc_scores

In [28]:
# Seraching
def search_tf_idf(query, index):
    query = preproces_text(query)
    docs = None
    for term in query:
        try:
            term_docs = [posting[0] for posting in index[term]]

            if docs is None:
                docs = term_docs
            else:
                docs &= term_docs
        except:
            pass
    docs = list(docs)
    ranked_docs,doc_scores = rank_documents(query, docs, index, idf, tf, title_index)
    return ranked_docs,doc_scores

In [29]:
# Key words
def print_ranked_keywords(tf, idf, top_n=20):
    tf_scores = {term: np.sum(scores) for term, scores in tf.items()}
    tfidf_scores = {term: tf_scores[term] * idf[term] for term in tf_scores}
    ranked_tf = sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)
    ranked_tfidf = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)

    print(f"\n- Top {top_n} words by Term Frequency:")
    for term, score in ranked_tf[:top_n]:
        print(f"{term:20s} -> {score:.4f}")

    print(f"\n- Bottom {top_n} words by Term Frequency:")
    for term, score in ranked_tf[-top_n:]:
        print(f"{term:20s} -> {score:.4f}")

    print(f"\n- Top {top_n} words by TF-IDF:")
    for term, score in ranked_tfidf[:top_n]:
        print(f"{term:20s} -> {score:.4f}")

    print(f"\n- Bottom {top_n} words by TF-IDF:")
    for term, score in ranked_tfidf[-top_n:]:
        print(f"{term:20s} -> {score:.4f}")

    return ranked_tf, ranked_tfidf

ranked_tf, ranked_tfidf = print_ranked_keywords(tf, idf, top_n=20)


- Top 20 words by Term Frequency:
fabric               -> 5431.5729
neck                 -> 4856.2644
sleev                -> 4514.8395
color                -> 3557.8040
fit                  -> 3517.7120
type                 -> 3358.9679
cloth                -> 3144.6460
cotton               -> 3061.8074
accessori            -> 2963.5274
style                -> 2856.0454
pack                 -> 2794.7374
wear                 -> 2776.6680
men                  -> 2646.2992
pattern              -> 2623.7597
women                -> 2591.8913
code                 -> 2575.6623
round                -> 2454.2866
regular              -> 2395.8198
wash                 -> 2327.5701
solid                -> 2218.9387

- Bottom 20 words by Term Frequency:
stalk                -> 0.0301
resili               -> 0.0301
speci                -> 0.0301
poor                 -> 0.0301
soil                 -> 0.0301
fertil               -> 0.0301
seed                 -> 0.0301
oil                  -> 0.0301

In [30]:
# Search queries
queries = {
    1: "men cotton round neck clothes",
    2: "women and men neck accessories",
    3: "men casual black slim shirt",
    4: "women casual blue collar",
    5: "poor men biodegradable clothes"
}

top = 20

all_results = []
print("Our defined queries with the respective search:")
for query_id, query_text in queries.items():
    ranked_docs, scores = search_tf_idf(query_text, index)
    print("\n======================")
    print(f"Top {top} results out of {len(ranked_docs)} for the query: {query_text}\n")
    results = []
    for doc_id in ranked_docs[:top]:
        title = title_index[doc_id]
        results.append({
            "title": title,
            "pid": doc_id,
            "query_id": query_id
        })
        print(f"Document: {doc_id} | Title: {title}")
    all_results.extend(results)

queries_results_df = pd.DataFrame(all_results)

Our defined queries with the respective search:

Top 20 results out of 14232 for the query: men cotton round neck clothes

Document: TSHFWF58KHZT8BBX | Title: Solid Men Round Neck Blue T-Shirt
Document: TSHFTC3SWRUHU2VZ | Title: Solid Men Round Neck White, Black, Beige T-Shirt  (Pack of 3)
Document: TSHFWF57CHKBUNHE | Title: Solid Men Round Neck White T-Shirt
Document: TSHFUSKPWUZSP6F6 | Title: Printed Men Round Neck Black T-Shirt
Document: TSHFE8NYCNXY3AGV | Title: Printed Men Round Neck Red T-Shirt
Document: TSHFWF57UMK9QHDW | Title: Solid Men Round Neck Green T-Shirt
Document: TSHFYJR8YGMFCXKH | Title: Solid Men Round Neck White, Black T-Shirt  (Pack of 2)
Document: TSHFKG8EMETSYN5E | Title: Striped Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHEMA45XRMEG9YX | Title: Solid Men Round Neck Green T-Shirt
Document: TSHFWF589Z4GGVJ2 | Title: Solid Men Round Neck Red T-Shirt
Document: TSHFWUDVUVV3674W | Title: Printed Men Round Neck White T-Shirt
Document: TSHFWF58ZNPZTXYG |

EVALUATION

In [31]:
#Precision@K (P@K)

def precision_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[:-1]
    doc_score = doc_score[order[:k]]
    relevant = sum(doc_score == 1)
    return float(relevant) / k


In [32]:
#Recall@K (R@K)

def recall_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    relevant_retrieved = np.sum(doc_score[:k] == 1)
    total_relevant = np.sum(doc_score)
    return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0

In [33]:
#Average Precision@K (P@K)

def avg_precision_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]

    prec_at_i = 0
    prec_at_i_list = []
    number_of_relevant = 0
    number_to_iterate = min(k, len(order))

    for i in range(number_to_iterate):
        if doc_score[order[i]] == 1:
            number_of_relevant += 1
            prec_at_i = number_of_relevant / (i + 1)
            prec_at_i_list.append(prec_at_i)

    if number_of_relevant == 0:
        return 0
    else:
      return np.sum(prec_at_i_list) / number_of_relevant

In [34]:
#F1-Score@k

def f1_at_k(doc_score, y_score, k=10):
    p = precision_at_k(doc_score, y_score, k)
    r = recall_at_k(doc_score, y_score, k)
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0

In [35]:
#Mean Average Precision (MAP)
def map_at_k(search_res, k=10):
    avp = []
    for q in search_res["query_id"].unique():
        curr_data = search_res[search_res["query_id"] == q]
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]),
                   np.array(curr_data["predicted_relevance"]), k))
    return np.sum(avp) / len(avp), avp

In [36]:
#Mean Reciprocal Rank (MRR)

def rr_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[:-1]
    doc_score = np.take(doc_score, order[:k])
    if np.sum(doc_score) == 0:
        return 0
    return (np.argmax(doc_score == 1) + 1)

In [37]:
# Normalized Discounted Cumulative Gain (NDCG)

def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    gain = 2 ** doc_score - 1
    discounts = np.log2(np.arange(len(doc_score)) + 2)
    return np.sum(gain / discounts)

def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

In [38]:
# Evaluation
def evaluate_queries(dataframe, queries, k=10):
    for qid, query_text in queries.items():
        query_data = dataframe[dataframe['query_id'] == qid]

        scores = np.ones(len(query_data))
        labels = query_data['labels'].values

        print(f"\n==============================")
        print(f"Query {qid}: {query_text}")
        print(f"Documents: {len(query_data)}")
        print(f"k: {k}\n")

        print(f"Precision@{k}: {precision_at_k(labels, scores, k):.3f}")
        print(f"Recall@{k}: {recall_at_k(labels, scores, k):.3f}")
        print(f"Average Precision@{k}: {avg_precision_at_k(labels, scores, k):.3f}")
        print(f"F1-Score@{k}: {f1_at_k(labels, scores, k):.3f}")
        print(f"Mean Average Precision (MAP): {avg_precision_at_k(labels, scores, k):.3f}")
        print(f"Mean Reciprocal Rank (MRR): {rr_at_k(labels, scores, k):.3f}")
        print(f"NDCG@{k}: {ndcg_at_k(labels, scores, k):.3f}")
        print()

In [39]:
# Evaluation of validation.csv
evaluation_queries = {
    1: "women full sleeve sweatshirt cotton",
    2: "men slim jeans blue",
}

evaluate_queries(validation_df, evaluation_queries, k=10)



Query 1: women full sleeve sweatshirt cotton
Documents: 20
k: 10

Precision@10: 0.700
Recall@10: 0.538
Average Precision@10: 0.692
F1-Score@10: 0.609
Mean Average Precision (MAP): 0.692
Mean Reciprocal Rank (MRR): 1.000
NDCG@10: 0.571


Query 2: men slim jeans blue
Documents: 20
k: 10

Precision@10: 0.600
Recall@10: 0.600
Average Precision@10: 0.430
F1-Score@10: 0.600
Mean Average Precision (MAP): 0.430
Mean Reciprocal Rank (MRR): 1.000
NDCG@10: 0.332



In [40]:
# Evaluating the searches
display(queries_results_df)

queries_results_df["labels"] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
          1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
          1]

Unnamed: 0,title,pid,query_id
0,Solid Men Round Neck Blue T-Shirt,TSHFWF58KHZT8BBX,1
1,"Solid Men Round Neck White, Black, Beige T-Shi...",TSHFTC3SWRUHU2VZ,1
2,Solid Men Round Neck White T-Shirt,TSHFWF57CHKBUNHE,1
3,Printed Men Round Neck Black T-Shirt,TSHFUSKPWUZSP6F6,1
4,Printed Men Round Neck Red T-Shirt,TSHFE8NYCNXY3AGV,1
...,...,...,...
76,Striped Women Collared Neck Blue T-Shirt,TSHFVJ4462GRSQHZ,4
77,Printed Women Collared Neck Blue T-Shirt,TSHFTXRFGE7HU2VH,4
78,Women Regular Fit Solid Spread Collar Collar F...,SHTEJKHU5MCDCYYK,4
79,Women Regular Fit Solid Curved Collar Collar C...,SHTFM66VD6HRMFJZ,4


In [41]:
# Evaluation of our queries
evaluate_queries(queries_results_df, queries, k=10)


Query 1: men cotton round neck clothes
Documents: 20
k: 10

Precision@10: 1.000
Recall@10: 0.500
Average Precision@10: 1.000
F1-Score@10: 0.667
Mean Average Precision (MAP): 1.000
Mean Reciprocal Rank (MRR): 1.000
NDCG@10: 1.000


Query 2: women and men neck accessories
Documents: 20
k: 10

Precision@10: 0.300
Recall@10: 1.000
Average Precision@10: 0.000
F1-Score@10: 0.462
Mean Average Precision (MAP): 0.000
Mean Reciprocal Rank (MRR): 2.000
NDCG@10: 0.000


Query 3: men casual black slim shirt
Documents: 20
k: 10

Precision@10: 0.100
Recall@10: 0.333
Average Precision@10: 0.194
F1-Score@10: 0.154
Mean Average Precision (MAP): 0.194
Mean Reciprocal Rank (MRR): 1.000
NDCG@10: 0.308


Query 4: women casual blue collar
Documents: 20
k: 10

Precision@10: 0.700
Recall@10: 0.636
Average Precision@10: 0.565
F1-Score@10: 0.667
Mean Average Precision (MAP): 0.565
Mean Reciprocal Rank (MRR): 1.000
NDCG@10: 0.441


Query 5: poor men biodegradable clothes
Documents: 1
k: 10

Precision@10: 0.000
R