In [1]:
# Imports
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from collections import defaultdict
from array import array
import numpy as np
import math
import collections
from numpy import linalg as la
import re

!pip3 install gensim
from gensim.models import Word2Vec

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load data
processed_df = pd.read_csv("/content/drive/Shareddrives/IRWA/P3/processed_df.csv")
clean_df = pd.read_csv("/content/drive/Shareddrives/IRWA/P3/clean_df.csv")

In [3]:
# Part1 function
def preproces_text(text):
    if not isinstance(text, str):
        return []

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]

    return tokens

In [4]:
# Part 2 queries
queries = {
    1: "men cotton round neck clothes",
    2: "women and men neck accessories",
    3: "men casual black slim shirt",
    4: "women casual blue collar",
    5: "poor men biodegradable clothes"
}

Excercisi 1a)


In [5]:
# Indexing including doc_length and avg_doc_lenght
def create_index_tfidf_bm25(dataframe, clean_df, columns, num_documents):
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    title_index = defaultdict(str)
    idf = defaultdict(float)
    doc_lengths = {}
    total_length = 0

    for index_val, row in dataframe.iterrows():
        page_id = row['pid']
        terms = []
        for col in columns:
            val = row[col]
            trobat = re.findall(r"'([^']+)'", val)
            terms.extend(trobat)

        title = row['title']
        title = clean_df.loc[clean_df['pid'] == page_id, 'title'].values[0]
        title_index[page_id] = title

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [page_id, array('I', [position])]

        norm = 0
        for term, posting in current_page_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        doc_lengths[page_id] = len(terms)
        total_length += len(terms)

        for term, posting in current_page_index.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1

        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    avg_doc_length = total_length / num_documents

    return index, tf, df, idf, title_index, doc_lengths, avg_doc_length

In [6]:
# Create the index
indexing_columns = ['title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller']
index, tf, df, idf, title_index, doc_lengths, avg_doc_length = create_index_tfidf_bm25(processed_df, clean_df, indexing_columns, len(processed_df))

In [7]:
# Ranking TF-IDF + cosine similarity
def rank_documents_tf_idf_cos(terms, docs, index, idf, tf):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    query_vector = np.array(query_vector, dtype=float)
    query_norm = la.norm(query_vector)
    if query_norm == 0:
        return [], []

    doc_scores = []
    for doc, curDocVec in doc_vectors.items():
        curDocVec = np.array(curDocVec, dtype=float)
        doc_norm = la.norm(curDocVec)
        if doc_norm == 0:
            continue
        dot_score = float(np.dot(curDocVec, query_vector))
        cos_sim = float(dot_score / (doc_norm * query_norm))
        doc_scores.append([cos_sim, doc])

    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf_cos(query, index)
    return result_docs, doc_scores

In [8]:
# Search with the TF-IDF + cosine similarity rank function
def search_tf_idf_cos(query, index, idf, tf):
    query = preproces_text(query)
    docs = None
    for term in query:
        try:
            term_docs = [posting[0] for posting in index[term]]

            if docs is None:
                docs = term_docs
            else:
                docs &= term_docs
        except:
            pass
    docs = list(docs)
    ranked_docs,doc_scores = rank_documents_tf_idf_cos(query, docs, index, idf, tf)
    return ranked_docs,doc_scores

In [9]:
# Print the top 20 of the Part2 queries(with TF-IDF + cosine similarity score)
top = 20
print("Our defined queries with the respective search (with TF-IDF + cosine similarity score):")
for query_id, query_text in queries.items():
    ranked_docs, scores = search_tf_idf_cos(query_text, index, idf, tf)
    print("\n======================")
    print(f"Top {top} results out of {len(ranked_docs)} for the query: {query_text}\n")
    for doc_id in ranked_docs[:top]:
        title = title_index[doc_id]
        print(f"Document: {doc_id} | Title: {title}")

Our defined queries with the respective search (with TF-IDF + cosine similarity score):

Top 20 results out of 14232 for the query: men cotton round neck clothes

Document: SWSF7NWDK2FMWHVQ | Title: Full Sleeve Self Design Men Sweatshirt
Document: TSHFGK39PUWYFPXB | Title: Printed Men Round Neck Maroon, Beige T-Shirt
Document: TSHFGJXFD9ZPQPGD | Title: Printed Men Round Neck White, Maroon T-Shirt
Document: TSHFGJX75VHXH2TP | Title: Color Block Men Round Neck White, Maroon T-Shirt
Document: TSHFGJXFMGFC487C | Title: Printed Men Round Neck Maroon, Beige T-Shirt
Document: TSHFGJX73DGYA27D | Title: Color Block Men Round Neck Black, Yellow T-Shirt
Document: TSHFMGMYWTKUAGQB | Title: Printed Men Round Neck Black T-Shirt
Document: TSHFHBSNXBHDF6AF | Title: Printed Men Round Neck Black T-Shirt
Document: TSHFHBSNSXSMUDPH | Title: Printed Men Round Neck White T-Shirt
Document: TSHFGZGBWSZXERPM | Title: Color Block Men Round Neck White T-Shirt
Document: TSHFTW29PUDGZVBS | Title: Solid Men Round N

Excercisi 1b)

In [10]:
# Ranking BM25
def rank_documents_bm25(terms, docs, index, idf, doc_length, avgdl, k1=1.2, b=0.75):
    N = len(doc_length)
    doc_scores = defaultdict(float)
    for termIndex, term in enumerate(terms):
        if term not in index:
            continue
        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                tf_td = len(postings)
                Ld = doc_length[doc]
                denom = k1 * ((1 - b) + b * (Ld / avgdl)) + tf_td
                score = idf[term] * ((k1 + 1) * tf_td) / denom
                doc_scores[doc] += score

    doc_scores = [[score, doc] for doc, score in doc_scores.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_bm25(query, index, idf, doc_length, avgdl)
    return result_docs, doc_scores


In [11]:
# Search with the BM25 rank function
def search_bm25(query, index, idf, doc_length, avgdl):
    query = preproces_text(query)
    docs = None
    for term in query:
        try:
            term_docs = [posting[0] for posting in index[term]]
            if docs is None:
                docs = term_docs
            else:
                docs &= term_docs
        except:
            pass
    docs = list(docs)
    ranked_docs,doc_scores = rank_documents_bm25(query, docs, index, idf, doc_length, avgdl)
    return ranked_docs,doc_scores

In [12]:
# Print the top 20 of the Part2 queries (with BM25 score)
top = 20
print("Our defined queries with the respective search (with TF-IDF + cosine similarity score):")
for query_id, query_text in queries.items():
    ranked_docs, scores = search_bm25(query_text, index, idf, doc_lengths, avg_doc_length)
    print("\n======================")
    print(f"Top {top} results out of {len(ranked_docs)} for the query: {query_text}\n")
    for doc_id in ranked_docs[:top]:
        title = title_index[doc_id]
        print(f"Document: {doc_id} | Title: {title}")

Our defined queries with the respective search (with TF-IDF + cosine similarity score):

Top 20 results out of 14232 for the query: men cotton round neck clothes

Document: TSHFWF58KHZT8BBX | Title: Solid Men Round Neck Blue T-Shirt
Document: TSHFWF57CHKBUNHE | Title: Solid Men Round Neck White T-Shirt
Document: TSHFKQJRZXJ8ZHYB | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRJKFM6UGW | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRDDGJJGT4 | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRDA6FPV7B | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRBQQWXPTG | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKG8EMETSYN5E | Title: Striped Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRGXT5EV5G | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRGPZGYYDG | Title: Printed Men Round Neck 

Exercici 1 c)

In [13]:
# Versió fumada no fara falta
def create_index(df):
    """
    Crea un reverse index amb format:
    token → [ [pid, [field, [positions]], [field, [positions]], ...], ... ]
    """
    index = defaultdict(list)
    text_fields = ["title", "description", "brand", "category", "sub_category", "product_details", "seller"]

    for _, row in df.iterrows():
        pid = row["pid"]

        # index temporal per aquest document
        current_doc_index = defaultdict(lambda: defaultdict(lambda: array('I')))

        for field in text_fields:
            field_value = row[field]

            terms = [] # tokens

            for pos, term in enumerate(terms):
                current_doc_index[term][field].append(pos)

        # Afegim les dades del document a l’índex global
        for term, field_positions in current_doc_index.items():
            entry = [pid] + [[field, positions] for field, positions in field_positions.items()]
            index[term].append(entry)

    return index


In [14]:
# Reverse index for fields
def create_fields_index(dataframe, columns):
    index = defaultdict(list)
    for _, row in dataframe.iterrows():
        page_id = row['pid']
        term_fields = defaultdict(set)
        for col in columns:
            val = row[col]
            terms = re.findall(r"'([^']+)'", val)
            for term in terms:
                term_fields[term].add(col)

        for term, fields in term_fields.items():
            index[term].append([page_id, list(fields)])
    return index

In [15]:
# Create the field reverse index
indexing_columns = ['title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller']
field_index = create_fields_index(processed_df, indexing_columns)

In [16]:
# Field weights
field_weights = {
    "title": 0.9,
    "brand": 0.25,
    "category": 0.25,
    "sub_category": 0.125,
    "description": 0.125,
    "product_details": 0.25,
    "seller": 0.1
}

In [17]:
# Ranking our socre
def rank_documents_ours(terms, docs, index, field_index, idf, doc_length, avgdl, k1=1.2, b=0.75):
    N = len(doc_length)
    doc_scores = defaultdict(float)
    for termIndex, term in enumerate(terms):
        if term not in index:
            continue
        term_field_map = {}
        if field_index is not None and term in field_index:
            for pid, fields in field_index[term]:
                term_field_map[pid] = fields

        for doc_index, (doc, postings) in enumerate(index[term]):
            if doc in docs:
                tf_td = len(postings)
                Ld = doc_length[doc]
                denom = k1 * ((1 - b) + b * (Ld / avgdl)) + tf_td
                score = idf[term] * ((k1 + 1) * tf_td) / denom

                fields = term_field_map.get(doc, [])
                if fields:
                    field_coeff = sum(field_weights.get(f, 0.0) for f in fields)
                else:
                    field_coeff = 0.0
                score = score * field_coeff

                doc_scores[doc] += score

    doc_scores = [[score, doc] for doc, score in doc_scores.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_ours(query, index, field_index, idf, doc_length, avgdl)
    return result_docs, doc_scores

In [18]:
# Search with our rank function
def search_ours(query, index, field_index, idf, doc_length, avgdl):
    query = preproces_text(query)
    docs = None
    for term in query:
        try:
            term_docs = [posting[0] for posting in index[term]]
            if docs is None:
                docs = term_docs
            else:
                docs &= term_docs
        except:
            pass
    docs = list(docs)
    ranked_docs,doc_scores = rank_documents_ours(query, docs, index, field_index, idf, doc_length, avgdl)
    return ranked_docs,doc_scores

In [19]:
# Print the top 20 of the Part2 queries (with our score)
top = 20
print("Our defined queries with the respective search (with our score):")
for query_id, query_text in queries.items():
    ranked_docs, scores = search_ours(query_text, index, field_index, idf, doc_lengths, avg_doc_length)
    print("\n======================")
    print(f"Top {top} results out of {len(ranked_docs)} for the query: {query_text}\n")
    for doc_id in ranked_docs[:top]:
        title = title_index[doc_id]
        print(f"Document: {doc_id} | Title: {title}")

Our defined queries with the respective search (with our score):

Top 20 results out of 14232 for the query: men cotton round neck clothes

Document: TSHFWF58KHZT8BBX | Title: Solid Men Round Neck Blue T-Shirt
Document: TSHFWF57CHKBUNHE | Title: Solid Men Round Neck White T-Shirt
Document: TSHFKQJRZXJ8ZHYB | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRJKFM6UGW | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRDDGJJGT4 | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRDA6FPV7B | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRBQQWXPTG | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKG8EMETSYN5E | Title: Striped Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRGXT5EV5G | Title: Printed Men Round Neck Multicolor T-Shirt  (Pack of 2)
Document: TSHFKQJRGPZGYYDG | Title: Printed Men Round Neck Multicolor T-Shirt  (Pa

Exercici 2

In [20]:
#create word2Vec model
indexing_columns = ['title', 'description', 'brand', 'category', 'sub_category', 'product_details', 'seller']
sentences = []
for index_val, row in processed_df.iterrows():
    page_id = row['pid']
    terms = []
    for col in indexing_columns:
        val = row[col]
        trobat = re.findall(r"'([^']+)'", val)
        terms.extend(trobat)
    if len(terms) > 0:
        sentences.append(terms)

w2v_model = Word2Vec(sentences=sentences, vector_size=100, window=10, min_count=10, negative=15, sg=0)

In [21]:
# doc_vector dictionary
docs_vectors = {}
dim = w2v_model.vector_size
for index_val, row in processed_df.iterrows():
    pid = row['pid']
    terms = []
    for col in indexing_columns:
        val = row[col]
        trobat = re.findall(r"'([^']+)'", val)
        terms.extend(trobat)

    word_vecs = []
    for t in terms:
        if t in w2v_model.wv:
            word_vecs.append(w2v_model.wv[t])

    if word_vecs:
        doc_vec = np.mean(word_vecs, axis=0)
    else:
        doc_vec = np.zeros(dim)

    docs_vectors[pid] = doc_vec

In [22]:
# Ranking word2vec score
def rank_documents_word2vec_cos(terms, docs, index, docs_vectors, w2v_model):
    query_w2v_embs = []
    for t in terms:
        if t in w2v_model.wv:
            query_w2v_embs.append(w2v_model.wv[t])

    if len(query_w2v_embs) > 0:
        query_w2v_vec = np.mean(query_w2v_embs, axis=0)
        query_w2v_norm = la.norm(query_w2v_vec)
        dim = query_w2v_vec.shape[0]
        doc_w2v_vectors = defaultdict(lambda: np.zeros(dim, dtype=float))
    else:
        query_w2v_vec = None
        query_w2v_norm = 0.0

    if query_w2v_norm == 0:
        print("No results found, try again")
        query = input()
        docs = search_word2vec_cos(query, index, docs_vectors, w2v_model)

    doc_scores = []
    for doc, curDocVec in docs_vectors.items():
        if doc in docs:
            curDocVec = np.array(curDocVec, dtype=float)
            doc_norm = la.norm(curDocVec)
            if doc_norm == 0:
                continue
            dot_score = float(np.dot(curDocVec, query_w2v_vec))
            cos_sim = float(dot_score / (doc_norm * query_w2v_norm))
            doc_scores.append([cos_sim, doc])

    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_word2vec_cos(query, index, docs_vectors, w2v_model)
    return result_docs, doc_scores

In [23]:
# Search with the word2vec rank function
def search_word2vec_cos(query, index, docs_vectors, w2v_model):
    query = preproces_text(query)
    docs = None
    for term in query:
        try:
            term_docs = [posting[0] for posting in index[term]]
            if docs is None:
                docs = term_docs
            else:
                docs &= term_docs
        except:
            pass
    docs = list(docs)
    ranked_docs,doc_scores = rank_documents_word2vec_cos(query, docs, index, docs_vectors, w2v_model)
    return ranked_docs,doc_scores

In [24]:
# Print the top 20 of the Part2 queries (with xord2vec + cosine score)
top = 20
print("Our defined queries with the respective search (with word2vec + cosine score):")
for query_id, query_text in queries.items():
    ranked_docs, scores = search_word2vec_cos(query_text, index, docs_vectors, w2v_model)
    print("\n======================")
    print(f"Top {top} results out of {len(ranked_docs)} for the query: {query_text}\n")
    for doc_id in ranked_docs[:top]:
        title = title_index[doc_id]
        print(f"Document: {doc_id} | Title: {title}")

Our defined queries with the respective search (with word2vec + cosine score):

Top 20 results out of 14232 for the query: men cotton round neck clothes

Document: TSHFHUSFGYYARGZJ | Title: Self Design Men Round Neck Grey T-Shirt
Document: TSHFWBWEZHVQEHYZ | Title: Solid Men Round Neck Grey T-Shirt  (Pack of 3)
Document: TSHFWAWKUBZ9DQMX | Title: Solid Men Round Neck Light Blue T-Shirt  (Pack of 5)
Document: TSHFWBWESMCTBUKC | Title: Solid Men Round Neck Dark Blue T-Shirt  (Pack of 3)
Document: TSHFWAWKZ89QFZSQ | Title: Solid Men Round Neck Multicolor T-Shirt  (Pack of 5)
Document: TSHFWAWKHA7VEBDW | Title: Solid Men Round Neck Black T-Shirt  (Pack of 5)
Document: TSHFWBWEDVTTGYFG | Title: Solid Men Round Neck Multicolor T-Shirt  (Pack of 3)
Document: TSHFWAWKG67G9FUX | Title: Solid Men Round Neck Grey T-Shirt  (Pack of 5)
Document: TSHFW6HDXGGNXSYG | Title: Solid Men Round Neck Grey T-Shirt
Document: TSHFWBWEGVAUJZMK | Title: Solid Men Round Neck Light Blue T-Shirt  (Pack of 3)
Docume