## Task I: Create a personal file converter that converts a file to a .txt file, irrespective of its file type

In [6]:
import pandas as pd
import json
from PyPDF2 import PdfReader

In [32]:
import pandas as pd
import json
from PyPDF2 import PdfReader

def convert_any_file_to_txt(input_file, output_file=None):
    """Convert a CSV, Excel, JSON, PDF or plain-text file to a UTF-8 ``.txt`` file.

    The converted text is written to ``output_file`` and also printed.
    Any exception raised while reading, converting or writing is caught and
    reported with a printed `` Error: ...`` line instead of propagating.

    Parameters
    ----------
    input_file : str
        Path of the file to convert. Surrounding whitespace is stripped.
        The format is chosen from the (case-insensitive) file extension:
        ``csv``, ``xls``/``xlsx``, ``json``, ``pdf`` or ``txt``.
    output_file : str, optional
        Destination path. Defaults to ``input_file`` with its extension
        replaced by ``.txt``.

    Returns
    -------
    None
    """
    import os  # local import: keeps this cell runnable without another import cell

    input_file = input_file.strip()

    # splitext only inspects the last path component, so a dot in a directory
    # name or a file without any extension is not mistaken for an extension.
    base_path, dotted_ext = os.path.splitext(input_file)
    ext = dotted_ext[1:].lower()

    if output_file is None:
        output_file = base_path + ".txt"

    content = ""

    try:
        if ext == 'csv':
            df = pd.read_csv(input_file)
            content = df.to_string(index=False)

        elif ext in ['xls', 'xlsx']:
            df = pd.read_excel(input_file)
            content = df.to_string(index=False)

        elif ext == 'json':
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # The output file is UTF-8, so keep non-ASCII text readable
            # instead of emitting \uXXXX escapes.
            content = json.dumps(data, indent=2, ensure_ascii=False)

        elif ext == 'pdf':
            reader = PdfReader(input_file)
            # Separate pages with a newline so the last word of one page does
            # not run into the first word of the next.
            content = "\n".join(page.extract_text() or '' for page in reader.pages)

        elif ext == 'txt':
            with open(input_file, 'r', encoding='utf-8') as f:
                content = f.read()

        else:
            raise ValueError(" Unsupported file format: " + (ext or "(no extension)"))

        # Save to .txt
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"\n File converted successfully! Saved as: {output_file}")
        print("\n Converted File Content:\n" + "-" * 40)

        # Display the content
        print(content)
        print("-" * 40)

    except Exception as e:
        # Deliberate best-effort: report the problem rather than raising.
        print(f" Error: {e}")

# Prompt the user for the path of the file to convert, then convert it.
# convert_any_file_to_txt strips surrounding whitespace from the path itself.
file_path = input("Enter the path of your file: ")
convert_any_file_to_txt(file_path)


Enter the path of your file:  Ver1-Ayurveda-Dataset.xlsx



 File converted successfully! Saved as: Ver1-Ayurveda-Dataset.txt

 Converted File Content:
----------------------------------------
 S_ID                                                 AM                         SS                                                              SE    DI1   DI2    DI3                                                                                                                                                                                                                       GS  FI  FI_Calorie  FI_Minerals  FI_Vitamins  FA  GI  GI_Calorie  GI_Minerals  GI_Vitamins  GA  VI  VI_Calorie  VI_Minerals  VI_Vitamins  VA  PI  PI_Calorie  PI_Minerals  PI_Vitamins  PA  SI  SI_Calorie  SI_Minerals  SI_Vitamins  SA  NSI  NSI_Calorie  NSI_Minerals  NSI_Vitamins  NSA  SWA  PRI
M0001                                       Abhayarishta                      Arsha                                            Piles or Hemorrhoids  Pitta  Vata    NaN                       

## Task II: Convert the content of a .txt file to tokens and save them

In [38]:
import re
import numpy as np
from collections import Counter

# 1. Read and preprocess file
def read_and_preprocess(file_path):
    """Return the non-blank lines of a UTF-8 text file, stripped and lower-cased.

    Lines that are empty after stripping whitespace are dropped.
    """
    cleaned = []
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                cleaned.append(text.lower())
    return cleaned

In [23]:
# Initialize Vocabulary with Words Split into Characters (using BPE approach)
# Create vocabulary from text lines
# 2. Build initial vocabulary
def get_vocab(lines):
    """Count every whitespace-separated word, keyed by its character-split form.

    Each key is the word's characters joined by single spaces followed by the
    end-of-word marker ``</w>`` (e.g. ``"low"`` becomes ``"l o w </w>"``).
    Returns a ``Counter`` mapping those keys to their occurrence counts.
    """
    vocab = Counter()
    for line in lines:
        vocab.update(' '.join(word) + ' </w>' for word in line.split())
    return vocab


In [24]:
# Get symbol pairs
# 3. Get pair statistics
def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary.

    ``vocab`` maps space-separated symbol strings to frequencies; every pair
    of neighbouring symbols contributes the frequency of the entry it occurs
    in. Returns a ``Counter`` keyed by ``(left, right)`` tuples.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        parts = entry.split()
        for left, right in zip(parts, parts[1:]):
            pair_counts[(left, right)] += count
    return pair_counts


In [34]:
# Regex-based merge function for the most frequent symbol pair
# 4. Merge the most frequent pair
def merge_pair(pair, vocab):
    """Merge every occurrence of an adjacent symbol pair in the vocabulary.

    Parameters
    ----------
    pair : tuple of str
        The ``(left, right)`` symbols to fuse into one symbol.
    vocab : mapping of str to int
        Space-separated symbol strings mapped to their frequencies.

    Returns
    -------
    dict
        A new mapping in which ``"left right"`` has been replaced by
        ``"leftright"`` wherever it appears as two whole symbols. The input
        mapping is not modified.
    """
    # The lookarounds require whitespace on both sides so only whole symbols
    # match (e.g. the pair ("a", "b") must not match inside "a bc").
    bigram = re.compile(r'(?<=\s)' + re.escape(' '.join(pair)) + r'(?=\s)')
    merged_symbol = ''.join(pair)

    def _replacement(_match):
        # A callable is used so the merged symbol is inserted literally; a
        # plain string would be parsed as a template and mangle backslashes.
        return merged_symbol

    merged_vocab = {}
    for word, freq in vocab.items():
        # Pad with spaces so the lookarounds also match at the word edges.
        merged_word = bigram.sub(_replacement, f" {word} ").strip()
        # Accumulate rather than overwrite in case two entries collapse to
        # the same merged form.
        merged_vocab[merged_word] = merged_vocab.get(merged_word, 0) + freq
    return merged_vocab


In [44]:
# Run Bpe merge Operations
# Learn BPE merge rules
# 5. Learn BPE merge rules
def learn_bpe(vocab, num_merges):
    """Learn up to ``num_merges`` BPE merge rules from a vocabulary.

    On each round the most frequent adjacent symbol pair is merged throughout
    the vocabulary and recorded. Learning stops early once no pairs remain.

    Parameters
    ----------
    vocab : mapping of str to int
        Space-separated symbol strings mapped to frequencies, as produced by
        ``get_vocab``. The caller's mapping is not modified.
    num_merges : int
        Maximum number of merge rules to learn.

    Returns
    -------
    list of tuple
        The merged ``(left, right)`` pairs in the order they were learned.
    """
    merges = []
    for _ in range(num_merges):
        # get_stats is the pair-counting function defined in this notebook.
        pairs = get_stats(vocab)
        if not pairs:
            break
        # On ties, max() keeps the first pair encountered.
        best = max(pairs, key=pairs.get)
        vocab = merge_pair(best, vocab)
        merges.append(best)
    return merges



In [45]:
# 6. Apply BPE merges to word
def apply_bpe(word, merges):
    """Segment a word into BPE tokens using learned merge rules.

    The word is split into characters plus the end-of-word marker ``</w>``.
    Merge rules are then applied in the order they were learned: on each round
    the earliest-learned pair present in the word is merged everywhere it
    occurs, until no adjacent pair matches a rule.

    Parameters
    ----------
    word : str
        The word to tokenize.
    merges : iterable of tuple
        ``(left, right)`` pairs; earlier entries take priority. An ordered
        sequence such as the list returned by ``learn_bpe`` is expected.

    Returns
    -------
    list of str
        The resulting tokens; the last token carries the ``</w>`` marker.
    """
    symbols = list(word) + ['</w>']

    # Map each pair to its priority once, so lookups are O(1) instead of a
    # list scan; setdefault keeps the earliest rank if a pair is repeated.
    ranks = {}
    for rank, pair in enumerate(merges):
        ranks.setdefault(tuple(pair), rank)

    while len(symbols) > 1:
        # Find the adjacent pair with the best (lowest) learned rank.
        best_pair = None
        best_rank = None
        for candidate in zip(symbols, symbols[1:]):
            rank = ranks.get(candidate)
            if rank is not None and (best_rank is None or rank < best_rank):
                best_pair, best_rank = candidate, rank
        if best_pair is None:
            break

        # Merge every non-overlapping occurrence of that pair, left to right.
        merged = []
        i = 0
        while i < len(symbols):
            if i < len(symbols) - 1 and (symbols[i], symbols[i + 1]) == best_pair:
                merged.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        symbols = merged

    return symbols

In [46]:
# 7. Tokenize all lines using learned merges
def tokenize_matrix(lines, merges):
    """Tokenize every line with ``apply_bpe`` and return one token list per line.

    Each line is split on whitespace; the tokens of all its words are
    concatenated, in order, into a single flat list for that line.
    """
    return [
        [token for word in line.split() for token in apply_bpe(word, merges)]
        for line in lines
    ]

In [47]:
# 8. Save the matrix to .npy file
def save_matrix(matrix, filename):
    """Save a list of token lists to a ``.npy`` file and print where it went.

    Parameters
    ----------
    matrix : iterable of list
        One token list per line; rows may have different lengths.
    filename : str or path-like
        Target path. ``.npy`` is appended when the name does not already end
        with it.

    Notes
    -----
    The data is stored as a 1-D object array with one list per row, so it
    must be loaded with ``np.load(path, allow_pickle=True)``.
    """
    import os  # local import: keeps this cell runnable without another import cell

    rows = list(matrix)
    # Fill a 1-D object array row by row. Passing the nested list straight to
    # np.array would yield a 2-D array whenever all rows share a length, so
    # the saved shape would depend on the data.
    array = np.empty(len(rows), dtype=object)
    for index, row in enumerate(rows):
        array[index] = row

    if isinstance(filename, (str, os.PathLike)):
        # Resolve the final name up front so the message matches the file
        # that is actually written.
        saved_path = os.fspath(filename)
        if not saved_path.endswith('.npy'):
            saved_path += '.npy'
        np.save(saved_path, array)
    else:
        # Not a path (e.g. an open file object): let NumPy handle it as-is.
        np.save(filename, array)
        saved_path = getattr(filename, 'name', filename)

    print(f"\n Tokenized matrix saved as {saved_path}")

In [48]:
# 9. Run BPE tokenizer interactively
def run_bpe():
    """Interactively learn BPE merges from a text file and save the tokens.

    Prompts for the input ``.txt`` path, the number of merges and an output
    name, then builds the vocabulary, learns the merge rules, tokenizes every
    line, prints the first five tokenized lines and saves the result with
    ``save_matrix``. Invalid input is reported with a printed message and the
    function returns without doing any further work.
    """
    file_path = input("Enter the path to your .txt file: ").strip()
    try:
        lines = read_and_preprocess(file_path)
    except FileNotFoundError:
        print(" File not found.")
        return
    except (OSError, UnicodeDecodeError) as exc:
        # e.g. the path is a directory, is unreadable, or is not UTF-8 text.
        print(f" Could not read file: {exc}")
        return

    try:
        num_merges = int(input(" Enter number of BPE merges (e.g. 100): "))
        if num_merges < 0:
            # A negative count would silently learn nothing; reject it.
            raise ValueError("number of merges must be non-negative")
    except ValueError:
        print(" Invalid number.")
        return

    output_name = input(" Enter name to save matrix (without .npy): ").strip()
    if not output_name:
        # An empty name would be written as a hidden file called ".npy".
        print(" Invalid output name.")
        return

    vocab = get_vocab(lines)
    merges = learn_bpe(vocab, num_merges)
    matrix = tokenize_matrix(lines, merges)

    print("\n Sample tokenized output:")
    for row in matrix[:5]:
        print(row)

    save_matrix(matrix, output_name)

# Entry point: start the interactive BPE tokenizer when this code is run as
# the main module.
if __name__ == "__main__":
    run_bpe()

📄 Enter the path to your .txt file:  Ver1-Ayurveda-Dataset.txt
🔁 Enter number of BPE merges (e.g. 100):  100
💾 Enter name to save matrix (without .npy):  ver1-dataset



🔍 Sample tokenized output:
['s', '_', 'i', 'd', '</w>', 'am', '</w>', 's', 's</w>', 's', 'e</w>', 'di', '1</w>', 'di', '2</w>', 'di', '3</w>', 'g', 's</w>', 'f', 'i</w>', 'f', 'i', '_', 'c', 'al', 'or', 'i', 'e</w>', 'f', 'i', '_', 'm', 'in', 'er', 'al', 's</w>', 'f', 'i', '_', 'v', 'it', 'am', 'in', 's</w>', 'f', 'a</w>', 'g', 'i</w>', 'g', 'i', '_', 'c', 'al', 'or', 'i', 'e</w>', 'g', 'i', '_', 'm', 'in', 'er', 'al', 's</w>', 'g', 'i', '_', 'v', 'it', 'am', 'in', 's</w>', 'ga</w>', 'v', 'i</w>', 'v', 'i', '_', 'c', 'al', 'or', 'i', 'e</w>', 'v', 'i', '_', 'm', 'in', 'er', 'al', 's</w>', 'v', 'i', '_', 'v', 'it', 'am', 'in', 's</w>', 'v', 'a</w>', 'p', 'i</w>', 'p', 'i', '_', 'c', 'al', 'or', 'i', 'e</w>', 'p', 'i', '_', 'm', 'in', 'er', 'al', 's</w>', 'p', 'i', '_', 'v', 'it', 'am', 'in', 's</w>', 'p', 'a</w>', 's', 'i</w>', 's', 'i', '_', 'c', 'al', 'or', 'i', 'e</w>', 's', 'i', '_', 'm', 'in', 'er', 'al', 's</w>', 's', 'i', '_', 'v', 'it', 'am', 'in', 's</w>', 's', 'a</w>', 'n', '