# In[6]:
import sqlite3
from collections import Counter
from pathlib import Path

class DocumentProcessor:
    """Normalize a document's whitespace and split it into tokens.

    After construction ``tokens`` is an empty list and ``vocab`` an empty
    set; call :meth:`process_text` to populate them.
    """

    def __init__(self, document_text: str):
        """Store the raw text and start with empty token containers.

        Args:
            document_text: The full text of the document.
        """
        self.doc_txt = document_text
        # Created here so the attributes always exist, even when they are
        # read before process_text() has been called.
        self.tokens = []
        self.vocab = set()

    def process_text(self) -> None:
        """Flatten the text to one line and tokenize it on whitespace.

        Each line is stripped of surrounding whitespace and the lines are
        joined with single spaces; the result replaces ``doc_txt``.
        ``tokens`` becomes the whitespace-separated pieces of that string
        (in order, duplicates kept) and ``vocab`` the set of distinct tokens.
        No case folding or punctuation handling is performed.
        """
        stripped_lines = [line.strip() for line in self.doc_txt.split("\n")]
        self.doc_txt = " ".join(stripped_lines)
        self.tokens = self.doc_txt.split()
        self.vocab = set(self.tokens)

class Indexer:
    """Store per-document word counts in a SQLite table named ``doc_index``.

    Each call to :meth:`index` appends one ``(doc_id, word, count)`` row per
    distinct token of the document. Rows already stored for the same
    ``doc_id`` are not replaced, so indexing a document twice stores its
    rows twice.

    The object can be used as a context manager, in which case the
    connection is closed when the ``with`` block exits.
    """

    def __init__(self, db_file: str = "index_sqlite.db"):
        """Open (or create) the database file and make sure the table exists.

        Args:
            db_file: Path passed to ``sqlite3.connect``.
        """
        self.connection = sqlite3.connect(db_file)
        self.cur = self.connection.cursor()
        self.setup()

    def __enter__(self):
        """Return the indexer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; exceptions from the block are not suppressed."""
        self.close()

    def setup(self):
        """Create the ``doc_index`` table if it does not exist yet."""
        self.cur.execute('''CREATE TABLE IF NOT EXISTS doc_index
                            (doc_id TEXT, word TEXT, count INTEGER)''')
        self.connection.commit()

    def index(self, doc_id: str, content: str):
        """Tokenize ``content`` and store its word counts under ``doc_id``.

        Args:
            doc_id: Identifier written to the ``doc_id`` column of every row.
            content: Raw document text; tokenized by ``DocumentProcessor``.

        Raises:
            sqlite3.Error: If the insert fails. The rows of this call are
                rolled back before the error propagates.
        """
        processor = DocumentProcessor(content)
        processor.process_text()
        # Counter keeps first-occurrence order, so rows are inserted in the
        # order the words first appear in the document.
        word_counts = Counter(processor.tokens)
        rows = [(doc_id, word, count) for word, count in word_counts.items()]
        # The connection context manager commits on success and rolls back on
        # error, so a failed call never leaves a partial document pending.
        with self.connection:
            self.cur.executemany(
                "INSERT INTO doc_index (doc_id, word, count) VALUES (?, ?, ?)",
                rows,
            )

    def close(self):
        """Close the database connection."""
        self.connection.close()

if __name__ == "__main__":
    # Index every .txt file of the data folder into anime_index.db.
    indexer = Indexer("anime_index.db")
    try:
        data_folder = Path("C:/Users/Asus/Liad-Indexing/data/Anime")
        for doc_path in data_folder.glob("*.txt"):
            # The document id is the part of the file stem after the last
            # '-', e.g. 'train-anime-1' -> '1'.
            doc_id = doc_path.stem.split('-')[-1]
            content = doc_path.read_text(encoding='utf-8')
            indexer.index(doc_id, content)
    finally:
        # Close the connection even when reading or indexing a file fails.
        indexer.close()

# In[8]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('anime_index.db')
try:
    cursor = conn.cursor()

    # Query to fetch data; the LIMIT caps how many rows are printed.
    cursor.execute("SELECT * FROM doc_index LIMIT 10000")  # Adjust query as needed

    # Fetch and print the results as (doc_id, word, count) tuples
    results = cursor.fetchall()
    for row in results:
        print(row)
finally:
    # Close the connection even if the query or the printing fails
    conn.close()


('0', 'context:', 549)
('0', 'Original', 8)
('0', 'Translation', 8)
('0', 'by', 97)
('0', 'Triad/', 8)
('0', 'Softsub', 8)
('0', 'Urusai', 1)
('0', 'response:', 549)
('0', 'You', 67)
('0', 'may', 18)
('0', 'not', 72)
('0', 'believe', 27)
('0', 'me,', 27)
('0', 'but...', 19)
('0', 'Urusai\\You', 7)
('0', 'Yano...', 12)
('0', 'Motoharu.', 4)
('0', 'but...\\Yano...', 7)
('0', 'Usotsuki', 2)
('0', 'na', 47)
('0', 'anata', 18)
('0', 'nakimushi', 9)
('0', 'no', 162)
('0', 'atashi', 11)
('0', 'Motoharu.\\Usotsuki', 7)
('0', 'Doushite', 4)
('0', 'umaku', 9)
('0', 'ikanai', 9)
('0', 'wa', 67)
('0', 'atashi\\Doushite', 7)
('0', 'Oshare', 2)
('0', 'shita', 9)
('0', 'kami', 9)
('0', 'mo', 83)
('0', 'rain', 9)
('0', 'suton', 9)
('0', 'tsume', 9)
('0', 'wa\\Oshare', 7)
('0', 'Anata', 6)
('0', 'kidzuki', 9)
('0', 'shinai', 9)
('0', 'ne', 4)
('0', 'mo\\Anata', 7)
('0', 'Love', 4)
('0', 'kanashii', 9)
('0', 'namida', 9)
('0', 'nanka', 9)
('0', 'ja', 9)
('0', 'nai', 9)
('0', 'yo', 4)
('0', 'ne\\Love', 7