In [8]:
import os
import fitz  # PyMuPDF for PDFs
import pickle
import zipfile
from xml.etree.ElementTree import XML
from tqdm import tqdm

# --- DOCX extractor ---
# Qualified tag names for WordprocessingML paragraphs (<w:p>) and text runs (<w:t>).
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'

def get_docx_text(path):
    """
    Extract the plain text of a .docx file.

    The file is opened as a zip archive and 'word/document.xml' is parsed.
    The text nodes found under each paragraph element are concatenated into
    one string per paragraph; paragraphs that contain no text are skipped.

    Args:
        path: Path of the .docx file.

    Returns:
        The paragraphs joined by a blank line ('\\n\\n'). If anything goes
        wrong (missing file, not a zip, no document part, malformed XML) the
        error is printed and an empty string is returned.
    """
    try:
        # The context manager closes the archive even when read() raises,
        # e.g. because the archive has no 'word/document.xml' member.
        with zipfile.ZipFile(path) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))

        return '\n\n'.join(paragraphs)
    except Exception as e:
        # Best-effort extraction: report the problem and keep the caller going.
        print(f"Error reading {path}: {e}")
        return ""

# --- PDF extractor (fast) ---
def extract_pdf_text(path):
    """
    Extract the plain text of a PDF, page by page, using PyMuPDF.

    Args:
        path: Path of the PDF file.

    Returns:
        The text of all pages joined by a newline. If opening the file or
        extracting a page fails, the error is printed and whatever was
        collected from earlier pages is returned (an empty string when
        nothing was read).
    """
    text = []
    try:
        # The context manager closes the document even if a page raises.
        with fitz.open(path) as doc:
            for page in doc:
                text.append(page.get_text("text"))
    except Exception as e:
        # Best-effort extraction: report the problem and keep the caller going.
        print(f"Error reading {path}: {e}")
    return "\n".join(text)

# --- Build combined dataset ---
def build_dataset(folder_path, output_pkl="dataset.pkl"):
    """
    Extract text from every PDF and DOCX file in a folder and pickle the result.

    Only the direct entries of ``folder_path`` are examined. The extension
    check is case-insensitive; entries with any other extension are ignored.
    Each extractor's return value is stored as-is under the file name.

    Args:
        folder_path: Directory containing the documents.
        output_pkl: Path of the pickle file to write (default "dataset.pkl").

    Returns:
        None. The {filename: text} dict is written to ``output_pkl`` and a
        summary line is printed.
    """
    data = {}

    # os.listdir() returns names in arbitrary order; sorting keeps the dict
    # (and therefore the pickle) identical from one run or machine to the next.
    for filename in tqdm(sorted(os.listdir(folder_path))):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            data[filename] = extract_pdf_text(file_path)
        elif lowered.endswith(".docx"):
            data[filename] = get_docx_text(file_path)

    with open(output_pkl, "wb") as f:
        pickle.dump(data, f)

    print(f"✅ Dataset saved as {output_pkl} ({len(data)} files).")



In [9]:
# Extract all PDF/DOCX files under Dataset/ and pickle them to the default dataset.pkl.
build_dataset("Dataset/")

100%|██████████| 738/738 [00:16<00:00, 43.84it/s]

✅ Dataset saved as dataset.pkl (735 files).





In [None]:
def bruteForce(string, pattern):
    """
    Find every occurrence of ``pattern`` in ``string`` by brute force.

    The comparison is case-insensitive: both arguments are lower-cased first.
    Overlapping occurrences are all reported.

    Args:
        string: Text to search in.
        pattern: Text to search for.

    Returns:
        List of start offsets, in increasing order. Offsets index the
        lower-cased text, which for a few Unicode characters (e.g. 'İ') is
        longer than the input. An empty pattern matches at every offset
        0..len(text); a pattern longer than the text yields [].
    """
    string = string.lower()
    pattern = pattern.lower()

    # Measure after lower-casing: lower() can change the length of the text,
    # and stale lengths would cut the scan short.
    n = len(string)
    m = len(pattern)

    if m == 0:
        # The empty string occurs at every position, including the end.
        return list(range(n + 1))
    if m > n:
        return []

    idx = []
    for i in range(n - m + 1):
        # Cheap filter: only compare the rest when the first character agrees.
        if string[i] != pattern[0]:
            continue
        for j in range(1, m):
            if pattern[j] != string[i + j]:
                break
        else:
            # Inner loop finished without a mismatch: full match at i.
            idx.append(i)

    return idx

In [12]:
# Sanity check: matching is case-insensitive, so "he" is found in both "Hello" words.
print(bruteForce("Hello Hello", "he"))
# Output: [0, 6]


[0, 6]


In [14]:
# Search the text of one DOCX file for "gold" with the brute-force matcher.
# NOTE(review): the folder is spelled 'DataSet/' here but "Dataset/" in the
# build_dataset call; both only resolve to the same directory on a
# case-insensitive filesystem — confirm the real directory name.
bruteForce(get_docx_text('DataSet/23i0122.docx'), 'gold')

[2199]

In [16]:
def rabinKarp(string, pattern, base=128, mod=509):
    """
    Find every occurrence of ``pattern`` in ``string`` with Rabin-Karp.

    A rolling polynomial hash of each length-m window is compared with the
    pattern's hash; on a hash hit the window is compared character by
    character, so hash collisions never produce false matches. The search is
    case-insensitive: both arguments are lower-cased first.

    Args:
        string: Text to search in.
        pattern: Text to search for.
        base: Radix of the polynomial hash (default 128).
        mod: Positive modulus of the hash (default 509). A larger prime
            produces fewer spurious hash hits; the result is the same.

    Returns:
        List of start offsets, in increasing order. Offsets index the
        lower-cased text, which for a few Unicode characters (e.g. 'İ') is
        longer than the input. An empty pattern matches at every offset
        0..len(text); a pattern longer than the text yields [].
    """
    string = string.lower()
    pattern = pattern.lower()

    # Measure after lower-casing: lower() can change the length of the text.
    n = len(string)
    m = len(pattern)

    if m == 0:
        # Handled up front so we never evaluate pow(base, -1, mod), which
        # needs Python 3.8+ and a base that is invertible modulo mod.
        return list(range(n + 1))
    if m > n:
        return []

    # Weight of the window's leading character: base^(m-1) mod mod.
    h = pow(base, m - 1, mod)

    hashString = 0
    hashPattern = 0
    for i in range(m):
        hashPattern = (base * hashPattern + ord(pattern[i])) % mod
        hashString = (base * hashString + ord(string[i])) % mod

    idx = []
    for i in range(n - m + 1):
        # Verify on a hash hit to rule out collisions.
        if hashPattern == hashString and string[i:i + m] == pattern:
            idx.append(i)

        if i < n - m:
            # Slide the window: drop string[i], append string[i + m].
            # Python's % already yields a value in [0, mod) for mod > 0.
            hashString = (base * (hashString - ord(string[i]) * h) + ord(string[i + m])) % mod

    return idx

In [17]:
# Search the text of one DOCX file for "gold" with the Rabin-Karp matcher.
# NOTE(review): the folder is spelled 'DataSet/' here but "Dataset/" in the
# build_dataset call; both only resolve to the same directory on a
# case-insensitive filesystem — confirm the real directory name.
rabinKarp(get_docx_text('DataSet/23i0122.docx'), 'gold')

[2199]

In [18]:
def prefix(p):
    """
    Build the KMP prefix (failure) table for ``p``.

    Entry ``i`` of the result is the length of the longest proper prefix of
    ``p[:i + 1]`` that is also a suffix of it. An empty ``p`` gives ``[]``.
    """
    table = [0] * len(p)
    matched = 0  # length of the border carried over from the previous position

    for pos in range(1, len(p)):
        current = p[pos]
        # Shrink the border until it can be extended by the current character.
        while matched and p[matched] != current:
            matched = table[matched - 1]
        if p[matched] == current:
            matched += 1
        table[pos] = matched

    return table


def kmp_matcher(S, P):
    """
    Find every occurrence of pattern ``P`` in text ``S`` with Knuth-Morris-Pratt.

    Matching is case-sensitive (the arguments are compared as given) and
    overlapping occurrences are all reported.

    Args:
        S: Text to search in.
        P: Pattern to search for.

    Returns:
        List of start offsets, in increasing order. An empty pattern matches
        at every offset 0..len(S); a pattern longer than the text yields [].
    """
    n = len(S)
    m = len(P)

    if m == 0:
        # P[0] does not exist; the empty string occurs at every position.
        return list(range(n + 1))
    if m > n:
        return []

    Pi = prefix(P)
    q = 0  # number of pattern characters matched so far
    matches = []

    for i in range(n):
        # On a mismatch fall back to the longest border that can still match.
        while q > 0 and P[q] != S[i]:
            q = Pi[q - 1]
        if P[q] == S[i]:
            q += 1
        if q == m:
            matches.append(i - m + 1)  # found match ending at i
            q = Pi[q - 1]  # prepare for next possible (overlapping) match
    return matches
