In [15]:
import string
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS

# Load SpaCy model (use 'en_core_web_sm' for English)
# Only use tokenizer, skip parser and NER for performance.
nlp = spacy.load('en_core_web_sm')  # Disable unnecessary components

# Preprocess text: Clean up using regex to remove unwanted tokens.
def clean_text(text):
    # Remove hashtags, mentions, digits, special characters except for spaces
    text = re.sub(r'[@#\d<>]', ' ', text)  # Remove @, #, digits, <>, etc.
    text = re.sub(r'[^\w\s]', '', text)    # Remove all punctuation except space
    print('after clean',text)
    return text

def pre_process_batch(texts):
    # Initialize set to track unique lemmatized words (case-insensitive)
    existed_tokens = set()
    filtered_tokens_list = []

    # Process each text (sentence/document)
    for txt in texts:
        print(f'txt is {txt}')
        # Clean the text before tokenization
        cleaned_txt = clean_text(txt)
        
        # Tokenize the cleaned text using SpaCy's tokenizer directly (skip everything else)
        doc = nlp.make_doc(cleaned_txt)
        
        filtered_tokens = []
        for token in doc:
            print(f"token is {token}")
            lemmatized_word = token.lemma_.lower()

            # Filter out stopwords, digits, punctuation, and ensure uniqueness
            if lemmatized_word not in existed_tokens and \
               token.text.lower() not in STOP_WORDS and \
               token.text.isalpha():  # No need to check isdigit() because we pre-cleaned digits
                print(f"token.text is {token.text}")
                filtered_tokens.append(token.text)
                existed_tokens.add(lemmatized_word)  # Track lemmatized word to ensure uniqueness
        
        filtered_tokens_list.append(filtered_tokens)

    return filtered_tokens_list

# Example: List of 4000 text entries
texts = [
    "If # everything Hello@ everyone . Bye 99 is 34working correctly, < this testing in hospital for patient > bye> EveryOne HhEllo"
   # "Add more sentences here..."
]

# Process the batch of texts
import time
start_time = time.time()
processed_texts = pre_process_batch(texts)
print("Processed texts:", processed_texts)  # Print only the first 5 results for inspection
print(f"Time taken: {time.time() - start_time:.2f} seconds")
print(STOP_WORDS)


txt is If # everything Hello@ everyone . Bye 99 is 34working correctly, < this testing in hospital for patient > bye> EveryOne HhEllo
after clean If   everything Hello  everyone  Bye    is   working correctly   this testing in hospital for patient   bye  EveryOne HhEllo
token is If
token is   
token is everything
token is Hello
token.text is Hello
token is  
token is everyone
token is  
token is Bye
token is    
token is is
token is   
token is working
token is correctly
token is   
token is this
token is testing
token is in
token is hospital
token is for
token is patient
token is   
token is bye
token is  
token is EveryOne
token is HhEllo
Processed texts: [['Hello']]
Time taken: 0.00 seconds
{'should', 'please', 'me', 'serious', 'otherwise', 'over', 'anyone', 'also', '’ve', 'whereupon', 'becomes', 'full', 'five', 'him', '’m', '’d', 'is', '’ll', 'everywhere', '‘m', 'whenever', 'seemed', 'else', 'amount', 'up', 'one', 'anyway', 'more', 'through', 'thereby', 'everything', 'off', 'say', 

In [18]:
import string
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS

# Load SpaCy model (use 'en_core_web_sm' for English)
# Only use tokenizer, skip parser and NER for performance.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'tagger'])  # Disable unnecessary components

# Preprocess text: Clean up using regex to remove unwanted tokens.
def clean_text(text):
    # Remove unwanted symbols: @, #, <>, and digits (keeping the spaces between words)
    text = re.sub(r'[@#<>]', ' ', text)  # Remove @, #, <, >
    text = re.sub(r'\d+', ' ', text)     # Remove digits (keep spaces between words)
    return text

def pre_process_batch(texts):
    # Initialize set to track unique lemmatized words (case-insensitive)
    existed_tokens = set()
    filtered_tokens_list = []

    # Process each text (sentence/document)
    for txt in texts:
        # Clean the text before tokenization
        cleaned_txt = clean_text(txt)
        
        # Tokenize the cleaned text using SpaCy's tokenizer directly (skip everything else)
        doc = nlp.make_doc(cleaned_txt)
        
        filtered_tokens = []
        for token in doc:
            lemmatized_word = token.lemma_.lower()

            # Filter out stopwords, ensure uniqueness, and retain only valid tokens
            if lemmatized_word not in existed_tokens and \
               token.text.lower() not in STOP_WORDS and \
               token.text.isalpha():  # Only retain alphabetic tokens
                filtered_tokens.append(token.text)
                existed_tokens.add(lemmatized_word)  # Track lemmatized word to ensure uniqueness
        
        filtered_tokens_list.append(filtered_tokens)

    return filtered_tokens_list

# Example: List of 4000 text entries
texts = [
    "If # everything Hello@ everyone . Bye 99 is 34working correctly, < this testing in hospital for patient > bye> EveryOne",
    # Add more sentences here...
]

# Process the batch of texts
import time
start_time = time.time()
processed_texts = pre_process_batch(texts)
print("Processed texts:", processed_texts[:5])  # Print only the first 5 results for inspection
print(f"Time taken: {time.time() - start_time:.2f} seconds")


Processed texts: [['Hello']]
Time taken: 0.00 seconds
