## Load the Dataset and Pick a Column (social media dataset)

In [42]:
import pandas as pd

# Read the social media dataset from SocialMedia.csv.
df = pd.read_csv("SocialMedia.csv")

# Take the 'Text' column, drop missing entries, and convert the rest to
# plain Python strings; list position doubles as the document ID later on.
docs = (
    df['Text']
    .dropna()
    .astype(str)
    .tolist()
)

# Print the first 10 rows, numbered starting from 1.
for i, doc in enumerate(docs[:10]):
    print(f"{i+1}: {doc}")


1:  Enjoying a beautiful day at the park!              
2:  Traffic was terrible this morning.                 
3:  Just finished an amazing workout! 💪               
4:  Excited about the upcoming weekend getaway!        
5:  Trying out a new recipe for dinner tonight.        
6:  Feeling grateful for the little things in life.    
7:  Rainy days call for cozy blankets and hot cocoa.   
8:  The new movie release is a must-watch!             
9:  Political discussions heating up on the timeline.  
10:  Missing summer vibes and beach days.               


## Define Queries

In [49]:
# Phrase queries run against both the biword index and the positional index.
# They range from two-word phrases to full sentences containing commas and
# an apostrophe, which the tokenizer strips before matching.
queries = [
    "beautiful day",
    "Enjoying every moment",
    "Gratitude for the support received",
    "Compassion shown through acts of kindness in the community",
    "Determination burning like a wildfire, overcoming obstacles, turning dreams into reality",
    "Overwhelmed by the cacophony of expectations",
    "worst experience ever",
    "Eyes wide open in the night",
    "In the celebration",
    "Wandering through the historical streets of Kyoto, each step a journey into the heart of Japan's traditions"
]

## Biword Indexing and Search

In [50]:
import re

def preprocess_text(text):
    """Tokenize text into lowercase words with punctuation removed.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so words joined only by punctuation
    are merged into a single token.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens obtained by splitting the cleaned text on whitespace.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return cleaned.split()

def build_biword_index(docs):
    """Build a biword (consecutive word pair) inverted index.

    Each document is tokenized with ``preprocess_text`` and every pair of
    adjacent tokens becomes a key. The document's position in ``docs`` is
    used as its ID.

    Args:
        docs: Iterable of document strings.

    Returns:
        A dict mapping ``(word, next_word)`` tuples to a list of document IDs
        in increasing order, each ID appearing at most once per biword.
    """
    biword_index = {}
    for doc_id, doc in enumerate(docs):
        words = preprocess_text(doc)
        # zip of the token list with itself shifted by one yields adjacent pairs;
        # it is empty for documents with fewer than two tokens.
        for biword in zip(words, words[1:]):
            postings = biword_index.setdefault(biword, [])
            # Documents are visited in increasing ID order, so a biword that
            # repeats inside the current document can only collide with the
            # last entry; skip it to keep the postings list duplicate-free.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    return biword_index

def search_biword_index(biword_index, query):
    """Find documents that contain every biword of the query.

    The query is tokenized with ``preprocess_text`` and split into adjacent
    word pairs. A document matches when it appears in the postings of all of
    those pairs; the pairs are not checked for being contiguous with each
    other inside the document.

    Args:
        biword_index: Mapping of ``(word, next_word)`` tuples to document IDs.
        query: The phrase to search for.

    Returns:
        A set of matching document IDs. Empty when the query has fewer than
        two tokens or when any of its biwords is missing from the index.
    """
    tokens = preprocess_text(query)
    pairs = list(zip(tokens, tokens[1:]))

    # Fewer than two tokens means there is no biword to look up.
    if not pairs:
        return set()

    postings_per_pair = []
    for pair in pairs:
        # One absent biword is enough to rule out every document.
        if pair not in biword_index:
            return set()
        postings_per_pair.append(set(biword_index[pair]))

    return set.intersection(*postings_per_pair)


# Build the biword index once over all documents.
biword_index = build_biword_index(docs)

# Run every query against the biword index and report the matches.
for query in queries:
    result = search_biword_index(biword_index, query)

    print("Query:", query)
    print("Number of Matching docs:", len(result))
    print("Matching Document IDs:", result)
    print("Results found:")
    # A set has no guaranteed iteration order; sort the IDs so the listing
    # is stable from run to run.
    for doc_id in sorted(result):
        print("Document:", doc_id, "->", docs[doc_id])
    print('\n')
    

Query: beautiful day
Number of Matching docs: 2
Matching Document IDs: {0, 82}
Results found:
Document: 0 ->  Enjoying a beautiful day at the park!              
Document: 82 ->  Sending love to all my followers on this beautiful day! ❤️ 


Query: Enjoying every moment
Number of Matching docs: 1
Matching Document IDs: {84}
Results found:
Document: 84 ->  Enjoying every moment of this trip—pure enjoyment!      


Query: Gratitude for the support received
Number of Matching docs: 1
Matching Document IDs: {126}
Results found:
Document: 126 ->  Gratitude for the support received during tough times. 


Query: Compassion shown through acts of kindness in the community
Number of Matching docs: 1
Matching Document IDs: {143}
Results found:
Document: 143 ->  Compassion shown through acts of kindness in the community.


Query: Determination burning like a wildfire, overcoming obstacles, turning dreams into reality
Number of Matching docs: 1
Matching Document IDs: {214}
Results found:
Document: 2

## Positional Index

In [51]:
from collections import defaultdict

def preprocess_text(text):
    """Lowercase the text, strip punctuation, and split it into tokens.

    Characters that are neither word characters nor whitespace are removed
    outright, so tokens separated only by punctuation are merged.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of whitespace-separated tokens from the cleaned text.
    """
    # Imported locally so the function does not rely on a module-level `re`.
    import re
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered).split()

def build_positional_index(documents):
    """Build a positional inverted index over the documents.

    Each document is tokenized with ``preprocess_text``; its position in
    ``documents`` is used as the document ID and token offsets start at 0.

    Args:
        documents: Iterable of document strings.

    Returns:
        A ``defaultdict(list)`` mapping each term to a list of
        ``(doc_id, positions)`` tuples, one per document containing the term,
        in increasing doc_id order with positions in increasing order.
    """
    index = defaultdict(list)

    for doc_id, document in enumerate(documents):
        # Gather all positions of each term within this document first ...
        positions_by_term = {}
        for position, term in enumerate(preprocess_text(document)):
            positions_by_term.setdefault(term, []).append(position)

        # ... then record a single posting per term for the document.
        for term, positions in positions_by_term.items():
            index[term].append((doc_id, positions))

    return index

def search_phrase(positional_index, query, documents):
    """Find documents containing the query as an exact consecutive phrase.

    The query is tokenized with ``preprocess_text``. A document matches only
    if there is a start position ``p`` such that query word ``k`` occurs at
    position ``p + k`` for every ``k``. Checking each adjacent word pair
    independently is not enough: "a b c" must not match "a b x b c".

    Args:
        positional_index: Mapping of term to a list of ``(doc_id, positions)``
            tuples, as produced by ``build_positional_index``.
        query: The phrase to search for.
        documents: Unused; kept so existing callers keep working.

    Returns:
        A set of matching document IDs. Empty when the query has no tokens or
        any query word is absent from the index.
    """
    words = preprocess_text(query)
    if not words:
        return set()

    # Look up the postings of every query word; one missing word rules out
    # every document. `.get` avoids inserting keys into a defaultdict index.
    postings_per_word = []
    for word in words:
        postings = positional_index.get(word, [])
        if not postings:
            return set()
        postings_per_word.append(postings)

    # phrase_ends[doc_id] = positions at which the query prefix matched so far
    # ends inside that document.
    phrase_ends = {
        doc_id: set(positions) for doc_id, positions in postings_per_word[0]
    }

    for postings in postings_per_word[1:]:
        extended = {}
        for doc_id, positions in postings:
            previous_ends = phrase_ends.get(doc_id)
            if not previous_ends:
                continue
            # Keep only occurrences that directly follow an end of the prefix,
            # so the match stays one contiguous chain.
            ends = {pos for pos in positions if pos - 1 in previous_ends}
            if ends:
                extended[doc_id] = ends
        phrase_ends = extended
        if not phrase_ends:
            return set()

    return set(phrase_ends)


# Build the positional index once over all documents.
positional_index = build_positional_index(docs)

# Run every query as a phrase search and report the matches.
for query in queries:
    result = search_phrase(positional_index, query, docs)

    print("Query:", query)
    print("Number of Matching docs:", len(result))
    print("Matching Document IDs:", result)
    print("Results found:")
    # A set has no guaranteed iteration order; sort the IDs so the listing
    # is stable from run to run.
    for doc_id in sorted(result):
        print("Document:", doc_id, "->", docs[doc_id])
    print('\n')

Query: beautiful day
Number of Matching docs: 2
Matching Document IDs: {0, 82}
Results found:
Document: 0 ->  Enjoying a beautiful day at the park!              
Document: 82 ->  Sending love to all my followers on this beautiful day! ❤️ 


Query: Enjoying every moment
Number of Matching docs: 1
Matching Document IDs: {84}
Results found:
Document: 84 ->  Enjoying every moment of this trip—pure enjoyment!      


Query: Gratitude for the support received
Number of Matching docs: 1
Matching Document IDs: {126}
Results found:
Document: 126 ->  Gratitude for the support received during tough times. 


Query: Compassion shown through acts of kindness in the community
Number of Matching docs: 1
Matching Document IDs: {143}
Results found:
Document: 143 ->  Compassion shown through acts of kindness in the community.


Query: Determination burning like a wildfire, overcoming obstacles, turning dreams into reality
Number of Matching docs: 1
Matching Document IDs: {214}
Results found:
Document: 2

### Homework

In [None]:
# Homework 1:
# Implement skip pointers for intersecting postings lists.

# Homework 2:
# What is the order (time complexity) of intersection with skip pointers in the best and worst cases?

# What are the advantages and disadvantages of positional and biword retrieval? Compare them in a table.