In [1]:
# Show which Python interpreter this kernel is running, to confirm the active environment.
import sys
sys.executable

'C:\\Users\\masab\\anaconda3\\envs\\sarvamai\\python.exe'

In [4]:
# !pip install fasttext-wheel
import fasttext

In [13]:
# help(fasttext.FastText)

In [20]:
# Sanity check: run fastText's tokenizer on a short Hindi phrase and print the tokens.
print(fasttext.FastText.tokenize("कृपया प्रतीक्षा करें"))

['कृपया', 'प्रतीक्षा', 'करें']
<class 'fasttext_pybind.loss_name'>


In [22]:
import pandas as pd

In [23]:
# Preprocess the content column of the pandas DataFrame as follows:
# 1. Remove non-Hindi text.
# 2. Remove numerals.
# 3. Remove extra spaces, "\n", words enclosed in (), double quotes, and special characters.
# 4. Merge the paragraphs (or bullet points) of an article into one paragraph.

In [24]:
def preprocess_hindi_text(text, detector=None):
    """Reduce ``text`` to a single paragraph of Hindi (Devanagari) words.

    Pipeline:
      1. Drop parenthesised asides such as ``(BJP)``.
      2. Keep only whitespace-separated tokens that ``detector`` labels ``'hi'``.
      3. Remove digits, then every character outside the Devanagari block
         (U+0900-U+097F) or whitespace; this also strips double quotes and
         punctuation.
      4. Collapse runs of whitespace (including newlines) and trim.

    Args:
        text: Raw article text. Non-string or blank input yields ``""``.
        detector: Optional callable ``token -> language code``. Defaults to
            ``langdetect.detect``, imported lazily so this function does not
            depend on a later notebook cell having been executed.

    Returns:
        The cleaned text as one line, or ``""`` when nothing Hindi remains.

    Note:
        langdetect is documented as non-deterministic on short inputs unless
        ``DetectorFactory.seed`` is set; set it once at notebook start if
        reproducible output matters.
    """
    import re  # local import: the notebook-level ``import re`` lives in a later cell

    if not isinstance(text, str) or not text.strip():
        return ""

    if detector is None:
        from langdetect import detect as detector

    # Parenthesised spans must go *before* the token filter. Otherwise a
    # non-Hindi token carrying "(" or ")" is dropped on its own, the other
    # half of the pair no longer matches, and the enclosed words leak through.
    without_asides = re.sub(r'\([^()]*\)', '', text)

    hindi_tokens = []
    # split() with no argument also handles tabs and "\r\n" line endings.
    for token in without_asides.split():
        try:
            if detector(token) == 'hi':
                hindi_tokens.append(token)
        except Exception:
            # Detection fails on tokens with no usable features (digits,
            # punctuation); treat those as non-Hindi. ``Exception`` instead of
            # a bare ``except`` so a kernel interrupt still stops a long run.
            continue

    cleaned = ' '.join(hindi_tokens)
    # \d is Unicode-aware for str patterns, so Devanagari digits are removed too.
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = re.sub(r'[^\u0900-\u097F\s]', '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

def preprocess_dataframe(csv_file, column_name='full_text'):
    """Clean one text column of a CSV file and save the result next to it.

    Args:
        csv_file: Path (``str`` or ``os.PathLike``) of the CSV file to read.
        column_name: Column whose values are passed through
            ``preprocess_hindi_text``.

    Returns:
        The DataFrame with ``column_name`` replaced by its cleaned text.

    Side effects:
        Writes ``<stem>_preprocessed<suffix>`` in the same directory as
        ``csv_file`` (e.g. ``news.csv`` -> ``news_preprocessed.csv``).
    """
    from pathlib import Path

    df = pd.read_csv(csv_file)
    df[column_name] = df[column_name].apply(preprocess_hindi_text)

    # Build the output name from the file's stem and suffix only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and, when the name has no ".csv" at all, would leave the path
    # unchanged and overwrite the input file.
    source_path = Path(csv_file)
    output_path = source_path.with_name(
        source_path.stem + '_preprocessed' + source_path.suffix
    )
    df.to_csv(output_path, index=False, encoding='utf-8')

    return df

In [25]:
# Parallelize the preprocessing with CUDA (via numba).
# langdetect cannot run on the GPU, so language detection stays on the CPU
# and the rest of the processing is offloaded to the GPU.

In [None]:
import pandas as pd
import numpy as np
from numba import cuda
import re
from langdetect import detect

@cuda.jit(device=True)
def _is_space_cp(cp):
    """Return True when code point ``cp`` is whitespace.

    The list is intended to mirror ``str.isspace``; device code cannot call
    string methods, so the ranges are spelled out.
    """
    return (
        cp == 0x20
        or (cp >= 0x09 and cp <= 0x0D)
        or (cp >= 0x1C and cp <= 0x1F)
        or cp == 0x85
        or cp == 0xA0
        or cp == 0x1680
        or (cp >= 0x2000 and cp <= 0x200A)
        or cp == 0x2028
        or cp == 0x2029
        or cp == 0x202F
        or cp == 0x205F
        or cp == 0x3000
    )


@cuda.jit
def kernel_process(text_array, output_array, text_lengths):
    """Clean one row of Unicode code points per thread.

    Python strings, ``re`` and generators cannot be compiled for the device,
    so the clean-up is a single pass over integer code points that:

      * removes ``( ... )`` spans that contain no nested parenthesis,
      * keeps only Devanagari characters (U+0900-U+097F) except the
        Devanagari digits U+0966-U+096F,
      * collapses whitespace runs to one space and trims both ends.

    Args:
        text_array: 2-D integer array; row ``r`` holds the code points of
            text ``r``. The element type must be wide enough for U+097F.
        output_array: 2-D integer array with the same number of rows; each
            row receives the cleaned code points followed by zeros.
        text_lengths: 1-D integer array with the number of valid code points
            in each row of ``text_array``.
    """
    idx = cuda.grid(1)
    if idx >= text_array.shape[0]:
        return

    width = output_array.shape[1]
    length = text_lengths[idx]
    if length > text_array.shape[1]:
        length = text_array.shape[1]

    out_pos = 0
    pending_space = False
    # Write position / space state at the most recent unmatched "(" so the
    # span can be rolled back once its ")" arrives; -1 means no open "(".
    mark_pos = -1
    mark_pending = False

    for i in range(length):
        cp = text_array[idx, i]
        if cp == 0x28:  # "(" -- the latest one wins, i.e. no nested parens
            mark_pos = out_pos
            mark_pending = pending_space
        elif cp == 0x29:  # ")"
            if mark_pos >= 0:
                out_pos = mark_pos
                pending_space = mark_pending
                mark_pos = -1
        elif _is_space_cp(cp):
            pending_space = True
        elif cp >= 0x0900 and cp <= 0x097F and not (cp >= 0x0966 and cp <= 0x096F):
            # Emit the collapsed space lazily so nothing leads or trails.
            if pending_space and out_pos > 0 and out_pos < width:
                output_array[idx, out_pos] = 0x20
                out_pos += 1
            pending_space = False
            if out_pos < width:
                output_array[idx, out_pos] = cp
                out_pos += 1
        # Every other code point (Latin, ASCII digits, quotes, ...) is dropped.

    # Clear the tail: it may hold rolled-back characters or, if the caller
    # allocated the buffer without initialising it, arbitrary memory.
    for j in range(out_pos, width):
        output_array[idx, j] = 0
    

In [None]:
def preprocess_hindi_text_gpu(texts):
    """Clean a batch of texts: language filter on the CPU, the rest on the GPU.

    Args:
        texts: Sequence of raw texts. Non-string or blank entries become ``""``.

    Returns:
        A list of cleaned strings, one per input entry and in the same order.
        ``None`` or an empty sequence yields ``[]``.
    """
    if texts is None or len(texts) == 0:
        return []

    # Step 1 (CPU): keep only tokens langdetect labels as Hindi.
    hindi_texts = []
    for text in texts:
        if not isinstance(text, str) or not text.strip():
            hindi_texts.append("")
            continue
        # Strip parenthesised spans before filtering; once one half of a
        # "( ... )" pair is dropped with its token, the pair cannot be
        # matched later and the enclosed words would survive.
        without_asides = re.sub(r'\([^()]*\)', '', text)
        kept_tokens = []
        for token in without_asides.split():
            try:
                if detect(token) == 'hi':
                    kept_tokens.append(token)
            except Exception:
                # Detection fails on tokens with no usable features (digits,
                # punctuation); skip them instead of aborting the batch.
                continue
        hindi_texts.append(' '.join(kept_tokens))

    # Step 2: encode each text as a row of Unicode code points. uint32 is
    # required because Devanagari starts at U+0900 and does not fit in uint8.
    text_lengths = np.array([len(t) for t in hindi_texts], dtype=np.int32)
    max_len = int(text_lengths.max()) + 1  # +1 keeps at least one column
    text_array = np.zeros((len(hindi_texts), max_len), dtype=np.uint32)
    for row, text in enumerate(hindi_texts):
        if text:
            text_array[row, :len(text)] = np.frombuffer(
                text.encode('utf-32-le', 'surrogatepass'), dtype='<u4'
            )

    # Step 3: copy to the device. The output starts zero-filled because
    # decoding below treats 0 as padding and cuda.device_array is uninitialised.
    d_text_array = cuda.to_device(text_array)
    d_out_array = cuda.to_device(np.zeros_like(text_array))
    d_text_lengths = cuda.to_device(text_lengths)

    # One thread per text.
    threads_per_block = 256
    blocks_per_grid = (len(hindi_texts) + threads_per_block - 1) // threads_per_block

    kernel_process[blocks_per_grid, threads_per_block](
        d_text_array, d_out_array, d_text_lengths
    )

    # Step 4: copy back and decode, skipping the zero padding.
    out_array = d_out_array.copy_to_host()
    return [''.join(map(chr, row[row != 0].tolist())) for row in out_array]

In [None]:
def preprocess_dataframe_gpu(csv_file, column_name='full_text'):
    """Clean one text column of a CSV file with the GPU pipeline and save it.

    Args:
        csv_file: Path (``str`` or ``os.PathLike``) of the CSV file to read.
        column_name: Column whose values are sent, as one list, through
            ``preprocess_hindi_text_gpu``.

    Returns:
        The DataFrame with ``column_name`` replaced by its cleaned text.

    Side effects:
        Writes ``<stem>_preprocessed_gpu<suffix>`` in the same directory as
        ``csv_file`` (e.g. ``news.csv`` -> ``news_preprocessed_gpu.csv``).
    """
    from pathlib import Path

    df = pd.read_csv(csv_file)

    # The GPU helper works on a whole batch, so hand it the column as a list.
    df[column_name] = preprocess_hindi_text_gpu(df[column_name].tolist())

    # Build the output name from the file's stem and suffix only. A plain
    # str.replace('.csv', ...) would also rewrite ".csv" inside directory
    # names and, when the name has no ".csv" at all, would leave the path
    # unchanged and overwrite the input file.
    source_path = Path(csv_file)
    output_path = source_path.with_name(
        source_path.stem + '_preprocessed_gpu' + source_path.suffix
    )
    df.to_csv(output_path, index=False, encoding='utf-8')

    return df