In [15]:
# Import necessary libraries
import pandas as pd
import json
import string 

# Load the CSV dataset. The path is relative, so semi_strut.csv must be in the
# notebook's working directory.
df = pd.read_csv("semi_strut.csv")

# Display the first few rows of the dataset (as the last expression of the
# cell it is rendered as a table). Each "Content" value appears to be a JSON
# document stored as a string; it is parsed later with json.loads.
df.head()

Unnamed: 0,Document ID,Content
0,1,"{\r\n ""title"": ""Introduction to Python"",\r\n..."
1,2,"{\r\n ""title"": ""Data Analysis with Pandas"",\..."
2,3,"{\r\n ""title"": ""Web Development with Flask"",..."
3,4,"{\r\n ""title"": ""Machine Learning with Scikit..."
4,5,"{\r\n ""title"": ""Data Visualization with Matp..."


In [33]:
# Tokenization function to extract terms from the JSON-like content
# Remember to extract both the top-level fields and each section's title and content
def tokenize_content(content):
    """Extract raw terms from one JSON-encoded document.

    Parameters
    ----------
    content : str
        JSON text describing a document. The recognised keys are "title",
        "author", "keywords" (a list) and "sections" (a list of objects with
        "title" and "content"). Any of them may be missing or null.

    Returns
    -------
    list
        Whitespace-separated tokens of the title, the author, and every
        section title/content, plus each keyword as a single term, in that
        order. No lowercasing or punctuation removal is done here.

    Raises
    ------
    json.JSONDecodeError
        If ``content`` is not valid JSON.
    """
    content_dict = json.loads(content)
    terms = []

    # 1. Extract terms from the top-level text fields. `or ""` makes an
    #    explicit JSON null behave like a missing key instead of crashing
    #    on None.split().
    for field in ("title", "author"):
        terms.extend((content_dict.get(field) or "").split())

    # Keywords are added as-is: a multi-word keyword stays one single term.
    terms.extend(content_dict.get("keywords") or [])

    # 2. Extract terms from sections' titles and content
    for section in content_dict.get("sections") or []:
        terms.extend((section.get("title") or "").split())
        terms.extend((section.get("content") or "").split())
    return terms
# 3. Apply the tokenizer to every row of the DataFrame, storing the result in
#    a new column "Terms". (A stray single-row call whose result was discarded
#    used to precede this line; it displayed nothing and was removed.)
df["Terms"] = df["Content"].apply(tokenize_content)


In [34]:
# 4. Implement a preprocessing function that converts terms to lowercase, removes punctuation, and removes common stop words.
    
def preprocess_terms(terms):
    """Normalise raw terms for indexing.

    Each term is lowercased and stripped of leading/trailing punctuation
    (inner punctuation such as the hyphen in "scikit-learn" is kept). Common
    stop words are then removed, as are terms that end up empty because they
    consisted only of punctuation.

    Parameters
    ----------
    terms : iterable of str
        Raw terms, e.g. the output of the tokenizer.

    Returns
    -------
    list of str
        Cleaned terms in their original order; duplicates are preserved.
    """
    # Define a set of common stop words
    stop_words = {
        "a", "an", "the", "and", "is", "in", "it", "to", "of", "for", "on", "with", "as"
    }

    # Remove surrounding punctuation and convert to lowercase (lazily)
    cleaned = (term.lower().strip(string.punctuation) for term in terms)

    # Drop stop words and empty strings; without the emptiness check a token
    # such as "--" would be indexed as the term "".
    return [term for term in cleaned if term and term not in stop_words]

# Create another new column "Terms_preprocessed" by applying preprocess_terms
# to each row's list of raw terms
df["Terms_preprocessed"] = df["Terms"].apply(preprocess_terms)

# 5. Display the raw and preprocessed terms side by side for each document
df[["Document ID", "Terms", "Terms_preprocessed"]]


Unnamed: 0,Document ID,Terms,Terms_preprocessed
0,1,"[Introduction, to, Python, John, Doe, Python, ...","[introduction, python, john, doe, python, prog..."
1,2,"[Data, Analysis, with, Pandas, Jane, Smith, Py...","[data, analysis, pandas, jane, smith, python, ..."
2,3,"[Web, Development, with, Flask, Mike, Johnson,...","[web, development, flask, mike, johnson, pytho..."
3,4,"[Machine, Learning, with, Scikit-Learn, Emily,...","[machine, learning, scikit-learn, emily, davis..."
4,5,"[Data, Visualization, with, Matplotlib, Robert...","[data, visualization, matplotlib, robert, clar..."


In [36]:
# Inverted index: maps each preprocessed term to the set of IDs of the
# documents that contain it
inverted_index = {}

# Walk over every document and register its ID under each of its terms
for index, row in df.iterrows():
    document_id = row["Document ID"]
    terms = row["Terms_preprocessed"]

    for term in terms:
        # setdefault creates the posting set the first time a term is seen
        inverted_index.setdefault(term, set()).add(document_id)

# Show the finished index
inverted_index

{'introduction': {1, 2, 4, 5},
 'python': {1, 2, 3, 4, 5},
 'john': {1},
 'doe': {1},
 'programming': {1},
 'beginner': {1},
 'getting': {1, 3},
 'started': {1, 3},
 'versatile': {1, 5},
 'language': {1},
 'basic': {1},
 'syntax': {1},
 'easy': {1},
 'understand': {1},
 'data': {2, 5},
 'analysis': {2},
 'pandas': {2},
 'jane': {2},
 'smith': {2},
 'data analysis': {2},
 'popular': {2},
 'library': {2, 5},
 'dataframes': {2},
 'are': {2},
 'core': {2},
 'structure': {2},
 'web': {3},
 'development': {3},
 'flask': {3},
 'mike': {3},
 'johnson': {3},
 'web development': {3},
 'lightweight': {3},
 'framework': {3},
 'routing': {3},
 'defines': {3},
 'url': {3},
 'patterns': {3},
 'views': {3},
 'machine': {4},
 'learning': {4},
 'scikit-learn': {4},
 'emily': {4},
 'davis': {4},
 'machine learning': {4},
 'subfield': {4},
 'artificial': {4},
 'intelligence': {4},
 'supervised': {4},
 'type': {4},
 'visualization': {5},
 'matplotlib': {5},
 'robert': {5},
 'clark': {5},
 'data visualizati

In [39]:
# perform boolean operations on postings lists for Boolean search operations
# or operation 
def or_postings(posting1, posting2):
    """Merge two postings lists into their union (Boolean OR).

    Both inputs are expected to be indexable sequences sorted in ascending
    order without duplicates; the merge walks them in lockstep and relies on
    that ordering. Returns a new list; the inputs are not modified.
    """
    merged = []
    i, j = 0, 0
    size1, size2 = len(posting1), len(posting2)

    while i < size1 and j < size2:
        left, right = posting1[i], posting2[j]
        if left == right:
            # Present in both lists: emit once and advance both cursors
            merged.append(left)
            i += 1
            j += 1
        elif left > right:
            merged.append(right)
            j += 1
        else:
            merged.append(left)
            i += 1

    # At most one of the lists still has unread entries; copy them over
    merged.extend(posting1[k] for k in range(i, size1))
    merged.extend(posting2[k] for k in range(j, size2))
    return merged

# and operation
def and_postings(posting1, posting2):
    """Intersect two postings lists (Boolean AND).

    Both inputs are expected to be indexable sequences sorted in ascending
    order; the two-cursor walk relies on that ordering. Returns a new list
    holding the entries found in both inputs.
    """
    common = []
    i = j = 0

    while i < len(posting1) and j < len(posting2):
        left, right = posting1[i], posting2[j]
        if left == right:
            # Match: keep it and move past it in both lists
            common.append(left)
            i += 1
            j += 1
        elif left > right:
            # The second list is behind; let it catch up
            j += 1
        else:
            i += 1
    return common
 
# The index stores postings as sets, which have no guaranteed iteration order,
# while or_postings/and_postings require ascending lists. Sort explicitly
# instead of relying on list(set) happening to come out ordered.

# 1. "Python" OR "Pandas"
pl_1 = sorted(inverted_index['python'])
pl_2 = sorted(inverted_index['pandas'])
result_or = str(or_postings(pl_1, pl_2))
# 2. "Python" AND "data"
pl_3 = sorted(inverted_index['python'])
pl_4 = sorted(inverted_index['data'])
result_and = str(and_postings(pl_3, pl_4))

print("Python OR Pandas: "+result_or + "\nPython AND data: " + result_and)


Python OR Pandas: [1, 2, 3, 4, 5]
Python AND data: [2, 5]
