In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords and punkt data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def remove_stopwords(text, language):
    """
    Removes stopwords from the input text for the specified language.
    
    Args:
        text (str): The input text.
        language (str): The language of the text (e.g., 'english', 'spanish').
    
    Returns:
        str: The cleaned text with stopwords removed.
    """
    # Tokenize the input text into words
    words = word_tokenize(text)
    
    # Get the list of stopwords for the specified language
    stop_words = set(stopwords.words(language))
    
    # Remove stopwords from the list of words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join the filtered words to form the cleaned text
    cleaned_text = ' '.join(filtered_words)
    return cleaned_text

def add_stopwords(language, custom_stopwords):
    """
    Adds custom stopwords to the existing list of stopwords for the specified language.
    
    Args:
        language (str): The language for which to add stopwords (e.g., 'english', 'spanish').
        custom_stopwords (list): A list of custom stopwords to add.
    
    Returns:
        set: The updated set of stopwords.
    """
    # Get the existing stopwords for the specified language
    stop_words = set(stopwords.words(language))
    
    # Add custom stopwords
    stop_words.update(custom_stopwords)
    
    return stop_words

In [3]:
# Example text in different languages
english_text = "This is an example sentence in English."
spanish_text = "Este es un ejemplo de frase en español."
french_text = "Ceci est un exemple de phrase en français."
german_text = "Dies ist ein Beispiel für einen Satz in Deutsch."
italian_text = "Questo è un esempio di frase in italiano."

# Remove stopwords from the example texts
english_result = remove_stopwords(english_text, 'english')
spanish_result = remove_stopwords(spanish_text, 'spanish')
french_result = remove_stopwords(french_text, 'french')
german_result = remove_stopwords(german_text, 'german')
italian_result = remove_stopwords(italian_text, 'italian')

# Print results
print("English:", english_result)
print("Spanish:", spanish_result)
print("French:", french_result)
print("German:", german_result)
print("Italian:", italian_result)

# Example of adding custom stopwords
custom_stopwords = ['example', 'frase', 'phrase', 'satz', 'esempio']
updated_english_stopwords = add_stopwords('english', custom_stopwords)
updated_spanish_stopwords = add_stopwords('spanish', custom_stopwords)
updated_french_stopwords = add_stopwords('french', custom_stopwords)
updated_german_stopwords = add_stopwords('german', custom_stopwords)
updated_italian_stopwords = add_stopwords('italian', custom_stopwords)

# Remove stopwords with updated stopwords list
english_result_updated = remove_stopwords(english_text, 'english')
spanish_result_updated = remove_stopwords(spanish_text, 'spanish')
french_result_updated = remove_stopwords(french_text, 'french')
german_result_updated = remove_stopwords(german_text, 'german')
italian_result_updated = remove_stopwords(italian_text, 'italian')

# Print updated results
print("\nAfter adding custom stopwords:")
print("English:", english_result_updated)
print("Spanish:", spanish_result_updated)
print("French:", french_result_updated)
print("German:", german_result_updated)
print("Italian:", italian_result_updated)

English: example sentence English .
Spanish: ejemplo frase español .
French: Ceci exemple phrase français .
German: Beispiel Satz Deutsch .
Italian: esempio frase italiano .

After adding custom stopwords:
English: example sentence English .
Spanish: ejemplo frase español .
French: Ceci exemple phrase français .
German: Beispiel Satz Deutsch .
Italian: esempio frase italiano .


In [9]:
from nltk.tokenize import word_tokenize
manual_stopwords = {
    'english': {'this', 'is', 'an', 'in', 'the', 'a' , '.'},
    'spanish': {'este', 'es', 'un', 'de', 'en', 'la'},
    'french': {'ceci', 'est', 'un', 'de', 'en', 'la'},
    'german': {'dies', 'ist', 'ein', 'für', 'in', 'der'},
    'italian': {'questo', 'è', 'un', 'di', 'in', 'la'}
}

def remove_stopwords(text, language):
    words = word_tokenize(text)
    stop_words = manual_stopwords.get(language, set())
    filtered_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    return cleaned_text

def add_stopwords(language, custom_stopwords):
    stop_words = manual_stopwords.get(language, set())
    stop_words.update(custom_stopwords)
    manual_stopwords[language] = stop_words
    return stop_words

english_text = "This is an example sentence in English."
spanish_text = "Este es un ejemplo de frase en español."
french_text = "Ceci est un exemple de phrase en français."
german_text = "Dies ist ein Beispiel für einen Satz in Deutsch."
italian_text = "Questo è un esempio di frase in italiano."

print(f"Actual texts : \n1. ENGLISH : {english_text}\n2. SPANISH : {spanish_text}\n3. FRENCH : {french_text}\n4. GERMAN : {german_text}\n5. ITALIAN : {italian_text}\n")

# Remove stopwords from the example 
print("Removing stopwords from actual texts :\n")
english_result = remove_stopwords(english_text, 'english')
spanish_result = remove_stopwords(spanish_text, 'spanish')
french_result = remove_stopwords(french_text, 'french')
german_result = remove_stopwords(german_text, 'german')
italian_result = remove_stopwords(italian_text, 'italian')

print("English:", english_result)
print("Spanish:", spanish_result)
print("French:", french_result)
print("German:", german_result)
print("Italian:", italian_result)

# adding custom stopwords
custom_stopwords = ['example', 'frase', 'phrase', 'satz', 'esempio' , '.']

add_stopwords('english', custom_stopwords)
add_stopwords('spanish', custom_stopwords)
add_stopwords('french', custom_stopwords)
add_stopwords('german', custom_stopwords)
add_stopwords('italian', custom_stopwords)

english_result_updated = remove_stopwords(english_text, 'english')
spanish_result_updated = remove_stopwords(spanish_text, 'spanish')
french_result_updated = remove_stopwords(french_text, 'french')
german_result_updated = remove_stopwords(german_text, 'german')
italian_result_updated = remove_stopwords(italian_text, 'italian')

print("\nAfter adding custom stopwords:\n")
print("English:", english_result_updated)
print("Spanish:", spanish_result_updated)
print("French:", french_result_updated)
print("German:", german_result_updated)
print("Italian:", italian_result_updated)

Actual texts : 
1. ENGLISH : This is an example sentence in English.
2. SPANISH : Este es un ejemplo de frase en español.
3. FRENCH : Ceci est un exemple de phrase en français.
4. GERMAN : Dies ist ein Beispiel für einen Satz in Deutsch.
5. ITALIAN : Questo è un esempio di frase in italiano.

Removing stopwords from actual texts :

English: example sentence English
Spanish: ejemplo frase español .
French: exemple phrase français .
German: Beispiel einen Satz Deutsch .
Italian: esempio frase italiano .

After adding custom stopwords:

English: sentence English
Spanish: ejemplo español
French: exemple français
German: Beispiel einen Deutsch
Italian: italiano
