In [1]:
import os
import re
import random

In [2]:
class Extractor:
    """Merge plain-text corpora into single files and build a word vocabulary.

    The constructor takes one sequence ``arr`` holding, in order:

    0. directory containing the training ``.txt`` files
    1. directory containing the validation ``.txt`` files
    2. path of the merged training output file
    3. path of the merged validation output file
    4. path of the vocabulary file (one word per line)
    """

    # Everything that is not an ASCII letter, an ASCII digit or whitespace is
    # removed from a token. Compiled once instead of once per word.
    _SPECIAL_CHARS = re.compile(r'[^a-zA-Z0-9\s]+')

    # Characters trimmed from both ends of a token after the regex pass.
    # Same character set as before, written as a single literal with the
    # backslash escaped explicitly (the old "\|" was an invalid escape).
    _EDGE_CHARS = "~`!@#$%^&*()1234567890_-+={[]}\\|'';:/?.><,"

    def __init__(self, arr):
        """Store the five paths from ``arr`` and start with an empty vocabulary."""
        self.train_root = arr[0]
        self.valid_root = arr[1]
        self.train_outputFile = arr[2]
        self.valid_outputFile = arr[3]
        self.vocabularyFile = arr[4]
        self.vocab = set()

    def _files_in_directory(self, root):
        """Return the names of the regular ``.txt`` files directly inside ``root``.

        The names are sorted because ``os.listdir`` returns entries in
        arbitrary order; sorting makes the merged output reproducible.
        """
        return sorted(
            f_name
            for f_name in os.listdir(root)
            if f_name.endswith(".txt") and os.path.isfile(os.path.join(root, f_name))
        )

    def _strip_special_characters(self, word):
        """Return ``word`` without any character outside ``[a-zA-Z0-9\\s]``."""
        return self._SPECIAL_CHARS.sub('', word)

    def _get_words(self, text):
        """Split ``text`` on whitespace and return the cleaned, non-empty tokens.

        Each token has its special characters removed and is then trimmed of
        leading/trailing digits and punctuation; tokens that end up empty
        (for example pure numbers) are dropped.
        """
        cleaned = (
            self._strip_special_characters(token).strip(self._EDGE_CHARS)
            for token in text.split()
        )
        return [word for word in cleaned if word]

    def _merge_files(self, root, file_names, output_path):
        """Concatenate ``file_names`` from ``root`` into ``output_path`` and
        add every cleaned word they contain to ``self.vocab``."""
        with open(output_path, "w", encoding="utf-8") as out_file:
            for f_name in file_names:
                # Join against the directory that was listed, not a global.
                file_path = os.path.join(root, f_name)
                with open(file_path, "r", encoding="utf-8") as in_file:
                    text = in_file.read()
                out_file.write(text)
                self.vocab.update(self._get_words(text))

    def get_vocabularyFile(self):
        """Write the merged train/validation files and the vocabulary file.

        All ``.txt`` files of the training directory are concatenated into
        ``self.train_outputFile`` and those of the validation directory into
        ``self.valid_outputFile``. Every distinct cleaned word is written to
        ``self.vocabularyFile``, one per line, in sorted order.
        """
        print("Generating the vocabulary file..\n")

        self._merge_files(
            self.train_root,
            self._files_in_directory(self.train_root),
            self.train_outputFile,
        )
        self._merge_files(
            self.valid_root,
            self._files_in_directory(self.valid_root),
            self.valid_outputFile,
        )

        with open(self.vocabularyFile, "w", encoding="utf-8") as v_file:
            # Sorted so the file does not depend on set iteration order.
            for word in sorted(self.vocab):
                v_file.write(word + '\n')
        print("Vocabulary File successfully generated!\n")

    def _seek_vocabularyFile(self, slice_num):
        """Print the vocabulary size and a random run of consecutive words.

        The vocabulary file is read back, cleaned and sorted, and a randomly
        positioned window of ``slice_num`` consecutive words is printed. When
        the vocabulary holds fewer than ``slice_num`` words the window is
        shrunk to the whole vocabulary instead of raising ``ValueError``.
        """
        with open(self.vocabularyFile, "r", encoding="utf-8") as f:
            words = sorted(self._get_words(f.read()))
        vocab_size = len(words)
        print(f"Vocabulary File size: {vocab_size}\n")

        # random.randint(a, b) requires a <= b, so clamp the window length.
        window = max(0, min(slice_num, vocab_size))
        idx = random.randint(window, vocab_size)
        print(f"Your Vocabulary File slice:\n {words[idx - window:idx]}")

In [3]:
# Base directory of the dataset. Override it with the LLM_DATASET_DIR
# environment variable to run the notebook on another machine; the default
# reproduces the original hard-coded location.
DATASET_DIR = os.environ.get("LLM_DATASET_DIR", "D:/LLM_Dataset").rstrip("/\\")

# Input directories holding the training / validation .txt files.
train_root = f"{DATASET_DIR}/Books/Train"
valid_root = f"{DATASET_DIR}/Books/Valid"

# Output files written by Extractor.get_vocabularyFile().
train_outputFile = f"{DATASET_DIR}/output_train.txt"
valid_outputFile = f"{DATASET_DIR}/output_valid.txt"
vocabularyFile = f"{DATASET_DIR}/vocab.txt"

In [4]:
# Extractor reads its paths positionally: training directory, validation
# directory, merged training output, merged validation output, vocabulary file.
database = Extractor(
    [
        train_root,
        valid_root,
        train_outputFile,
        valid_outputFile,
        vocabularyFile,
    ]
)

In [5]:
# Concatenate the training and validation .txt files into their merged output
# files and write every distinct cleaned word to the vocabulary file.
database.get_vocabularyFile()

Generating the vocabulary file..

Vocabulary File successfully generated!



In [6]:
# Print the vocabulary size and a randomly positioned run of 20 consecutive
# words from the sorted vocabulary. Note: this calls a private (underscore)
# helper of Extractor, and the slice differs on each run (no random seed is set).
database._seek_vocabularyFile(20)

Vocabulary File size: 381401

Your Vocabulary File slice:
 ['spirit', 'spiritalis', 'spirited', 'spiritedly', 'spiritedness', 'spiritful', 'spiritism', 'spiritist', 'spiritless', 'spiritlessness', 'spiritlike', 'spiritmoving', 'spiritous', 'spirits', 'spiritstirring', 'spiritual', 'spiritualis', 'spiritualiser', 'spiritualism', 'spiritualist']
