In [1]:
# Import necessary libraries
import pandas as pd
import json
import string 

# Load the CSV dataset
df = pd.read_csv("semi_strut.csv")

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Document ID,Content
0,1,"{\r\n ""title"": ""Introduction to Python"",\r\n..."
1,2,"{\r\n ""title"": ""Data Analysis with Pandas"",\..."
2,3,"{\r\n ""title"": ""Web Development with Flask"",..."
3,4,"{\r\n ""title"": ""Machine Learning with Scikit..."
4,5,"{\r\n ""title"": ""Data Visualization with Matp..."


In [2]:
# Tokenization function to extract terms from the JSON-like content
# Remember to exact both 
# 1 .Extract terms from various fields (title, author)
def tokenize_content(content):
    content_dict = json.loads(content)
    terms = []
    #     Extract terms from various fields (title, author)
    terms.extend(content_dict.get('title').split())
    terms.extend(content_dict.get('author').split())
    terms.extend(content_dict.get('keywords'))
    
    for x in content_dict['sections']:
        terms.extend(x.get('title').split())
        terms.extend(x.get('content').split())
    return terms

tokenize_content(df["Content"][0])
# 2. apply to all row in panda df , by create new column "Terms"
df["Terms"] = df['Content'].apply(tokenize_content)
df

Unnamed: 0,Document ID,Content,Terms
0,1,"{\r\n ""title"": ""Introduction to Python"",\r\n...","[Introduction, to, Python, John, Doe, Python, ..."
1,2,"{\r\n ""title"": ""Data Analysis with Pandas"",\...","[Data, Analysis, with, Pandas, Jane, Smith, Py..."
2,3,"{\r\n ""title"": ""Web Development with Flask"",...","[Web, Development, with, Flask, Mike, Johnson,..."
3,4,"{\r\n ""title"": ""Machine Learning with Scikit...","[Machine, Learning, with, Scikit-Learn, Emily,..."
4,5,"{\r\n ""title"": ""Data Visualization with Matp...","[Data, Visualization, with, Matplotlib, Robert..."


In [3]:
# 4. Implement a preprocessing function that converts terms to lowercase, removes punctuation, and removes common stop words.
    # Create another new column "Terms_preprocessed"
    
import nltk
import string

# Download NLTK stopwords if not already installed
from nltk.corpus import stopwords

def preprocess_terms(terms):
    
     # Remove common stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in terms if word not in stop_words]
    
    # Remove punctuation and convert to lowercase
    terms = [term.lower().strip(string.punctuation) for term in words]
    
    
    unique_terms = list(set(terms))
    
    return unique_terms

# df["Terms_prep"]
df["Terms_prep"] = df["Terms"].apply(preprocess_terms)

df["Terms_prep"][0]

['introduction',
 'getting',
 'understand',
 'language',
 'basic',
 'python',
 'beginner',
 'started',
 'easy',
 'programming',
 'john',
 'syntax',
 'versatile',
 'doe']

In [4]:

# Initialize an empty inverted index dictionary
inverted_index = {}

# Build the inverted index
for index, row in df.iterrows():
    document_id = row["Document ID"]
    terms = row["Terms_prep"]
    
    # Update the inverted index with terms and document IDs
    for term in terms:
        if term not in inverted_index:
            inverted_index[term] = set()
        inverted_index[term].add(document_id)

# Display the inverted index
inverted_index

{'introduction': {1, 2, 4, 5},
 'getting': {1, 3},
 'understand': {1},
 'language': {1},
 'basic': {1},
 'python': {1, 2, 3, 4, 5},
 'beginner': {1},
 'started': {1, 3},
 'easy': {1},
 'programming': {1},
 'john': {1},
 'syntax': {1},
 'versatile': {1, 5},
 'doe': {1},
 'analysis': {2},
 'data analysis': {2},
 'core': {2},
 'jane': {2},
 'data': {2, 5},
 'library': {2, 5},
 'dataframes': {2},
 'structure': {2},
 'popular': {2},
 'smith': {2},
 'pandas': {2},
 'routing': {3},
 'views': {3},
 'url': {3},
 'flask': {3},
 'lightweight': {3},
 'defines': {3},
 'patterns': {3},
 'mike': {3},
 'johnson': {3},
 'framework': {3},
 'web': {3},
 'web development': {3},
 'development': {3},
 'scikit-learn': {4},
 'davis': {4},
 'subfield': {4},
 'emily': {4},
 'artificial': {4},
 'machine': {4},
 'supervised': {4},
 'type': {4},
 'machine learning': {4},
 'learning': {4},
 'intelligence': {4},
 'robert': {5},
 'clark': {5},
 'various': {5},
 'using': {5},
 'data visualization': {5},
 'creating': {

In [5]:
# Test search a word
post = inverted_index['easy']
post

{1}

In [6]:
# perform boolean operations on postings lists for Boolean search operations
# 1. "Python" OR "Pandas"

def or_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < len(posting1):
        result.append(posting1[p1])
        p1 += 1
    while p2 < len(posting2):
        result.append(posting2[p2])
        p2 += 1
    return result



In [15]:
# 1. "Python" OR "Pandas"
q1 = "Python".lower()
q2 = "Pandas".lower()
pl_1 = list(inverted_index[q1])
pl_2 = list(inverted_index[q2])
or_postings(pl_1, pl_2) 

[1, 2, 3, 4, 5]

In [8]:
# 2. "Python" AND "data"

def and_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
    return result

In [16]:
q1 = "Python".lower()
q2 = "data".lower()
pl_1 = list(inverted_index[q1])
pl_2 = list(inverted_index[q2])
and_postings(pl_1, pl_2) 

[2, 5]