<a href="https://colab.research.google.com/github/Romainkul/Datathon_KUL/blob/main/Datathon_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import os
import polars as pl
import requests
from zipfile import ZipFile

# URL of the dataset archive and the local folders it is stored in.
url = "https://kuleuven-datathon-2025.s3.eu-central-1.amazonaws.com/Posters.zip"

# Root data folder. The default is unchanged; set the DATATHON_DIR environment
# variable to run the notebook on another machine without editing this cell.
data_root = os.environ.get("DATATHON_DIR", r"C:\Users\romai\Desktop\datathon")
base_dir = os.path.join(data_root, "Posters")
zip_path = os.path.join(data_root, "Posters.zip")

# Ensure the base directory exists.
os.makedirs(base_dir, exist_ok=True)

# Download the ZIP file only if it doesn't already exist.
if not os.path.exists(zip_path):
    print("Downloading dataset...")
    # Stream to a temporary ".part" file so the archive is never held fully in
    # memory, and an interrupted download is not mistaken for a complete one.
    partial_zip_path = zip_path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()  # Raise an error for a bad status.
        with open(partial_zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_zip_path, zip_path)
    print("Download complete.")
else:
    print("Zip file already exists. Skipping download.")

# Check if the dataset has been extracted by looking for the 'raw' folder.
raw_dir = os.path.join(base_dir, 'raw')
if not os.path.exists(raw_dir):
    print("Extracting dataset...")
    with ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(base_dir)
    print("Extraction complete.")
else:
    print("Dataset already extracted.")

# Define the paths for the processed images and texts.
processed_dir = os.path.join(base_dir, 'preprocessed')
texts_dir = os.path.join(base_dir, 'texts')

# Map each raw subfolder to the languages that are expected to be available.
folder_language_map = {
    'double_page_german_dutch': ['german', 'dutch'],
    'multipart_dutch': ['dutch'],
    'multipart_french': ['french'],
    'single_patch_dutch': ['dutch'],
    'triple_page_german_dutch_french': ['german', 'dutch', 'french'],
}

# List of valid image extensions.
image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')

# Languages that each get a processed-path column and a text column.
all_languages = ['german', 'dutch', 'french']

# Initialize lists for Polars DataFrame
raw_paths, german_processed, dutch_processed, french_processed = [], [], [], []
german_texts, dutch_texts, french_texts = [], [], []

# Loop through each subfolder in the raw directory.
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the row indices used later in the notebook) reproducible.
for folder in sorted(os.listdir(raw_dir)):
    folder_path = os.path.join(raw_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Determine which languages are expected for this folder
    # (folders missing from the map get no processed paths).
    languages = folder_language_map.get(folder, [])

    # Loop through each file in the raw subfolder.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_extensions):
            continue

        # Full path to the raw image file.
        raw_file_path = os.path.join(folder_path, filename)

        # Processed-image path per language; '' when the language is not expected here.
        processed_paths = {
            lang: os.path.join(processed_dir, lang, filename) if lang in languages else ''
            for lang in all_languages
        }

        base_name, _ = os.path.splitext(filename)

        # OCR text per language; '' when the file is missing or unreadable.
        texts = {}
        for lang in all_languages:
            # NOTE(review): transcripts are looked up as "<stem>.jpg.txt" whatever the
            # raw image's extension is — confirm this matches the dataset's naming.
            text_file_path = os.path.join(texts_dir, lang, base_name + '.jpg.txt')
            texts[lang] = ''
            if os.path.exists(text_file_path):
                try:
                    with open(text_file_path, 'r', encoding='utf-8') as tf:
                        texts[lang] = tf.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading {text_file_path}: {e}")

        # Append data to lists
        raw_paths.append(raw_file_path)
        german_processed.append(processed_paths['german'])
        dutch_processed.append(processed_paths['dutch'])
        french_processed.append(processed_paths['french'])
        german_texts.append(texts['german'])
        dutch_texts.append(texts['dutch'])
        french_texts.append(texts['french'])

# Assemble the collected lists into one Polars DataFrame (one row per raw image).
poster_columns = {
    "raw": raw_paths,
    "german_processed": german_processed,
    "dutch_processed": dutch_processed,
    "french_processed": french_processed,
    "german_text": german_texts,
    "dutch_text": dutch_texts,
    "french_text": french_texts,
}
df = pl.DataFrame(poster_columns)

# Show the first rows as a quick sanity check.
print(df.head())

# Persist the table as Parquet next to the extracted dataset.
posters_parquet_path = os.path.join(base_dir, "posters_data.parquet")
df.write_parquet(posters_parquet_path)
print("Data saved as Parquet file.")

Zip file already exists. Skipping download.
Dataset already extracted.
shape: (5, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ raw          ┆ german_proc ┆ dutch_proce ┆ french_proc ┆ german_text ┆ dutch_text  ┆ french_text │
│ ---          ┆ essed       ┆ ssed        ┆ essed       ┆ ---         ┆ ---         ┆ ---         │
│ str          ┆ ---         ┆ ---         ┆ ---         ┆ str         ┆ str         ┆ str         │
│              ┆ str         ┆ str         ┆ str         ┆             ┆             ┆             │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ C:\Users\rom ┆ C:\Users\ro ┆ C:\Users\ro ┆             ┆ pt a        ┆ oe : ____   ┆             │
│ ai\Desktop\d ┆ mai\Desktop ┆ mai\Desktop ┆             ┆             ┆ hebben de   ┆             │
│ atatho…      ┆ \datatho…   ┆ \datatho…   ┆             ┆ AM oe       ┆ ec ingen 2… ┆             │
│     

In [None]:
# Reload the poster table. The first cell writes it to
# os.path.join(base_dir, "posters_data.parquet"), so fall back to that location
# when no copy exists in the current working directory.
posters_parquet_name = "posters_data.parquet"
if os.path.exists(posters_parquet_name):
    posters_parquet_path = posters_parquet_name
else:
    posters_parquet_path = os.path.join(base_dir, posters_parquet_name)
df = pl.read_parquet(posters_parquet_path)

In [None]:
import os
import gc
import torch
import polars as pl
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np

# Switch off the Weights & Biases integration via its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# Shared Hugging Face pipelines, created once and reused by the helpers below.
lang_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)
ner_pipeline = pipeline(
    "token-classification",
    model="Babelscape/wikineural-multilingual-ner",
)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Multilingual sentence-embedding model.
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Language name used in column names -> code compared against the detector's label.
expected_language_codes = dict(german="de", dutch="nl", french="fr")

def get_sentiment_score(long_text, chunk_size=300):
    """Average the star rating predicted for consecutive word chunks of a text.

    The text is split on whitespace into chunks of ``chunk_size`` words; each
    chunk is scored with ``sentiment_pipeline`` and the leading integer of the
    returned label is taken as its rating. Chunks whose scoring raises are
    reported on stdout and skipped.

    Returns:
        The mean rating as a float, or None when no chunk produced a rating.
    """
    tokens = long_text.split()
    pieces = [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

    star_ratings = []
    for piece in pieces:
        try:
            prediction = sentiment_pipeline(piece, truncation=True, max_length=512)
            first_label = prediction[0]['label']
            star_ratings.append(int(first_label.split()[0]))
        except Exception as e:
            print(f"Sentiment analysis error: {e}")

    if not star_ratings:
        return None
    return sum(star_ratings) / len(star_ratings)

def extract_time_location(text):
    """Extract time and location entity words from ``text`` using ``ner_pipeline``.

    Args:
        text: Text to analyse. Blank or empty input yields two empty strings
            without calling the model.

    Returns:
        A ``(times, locations)`` tuple of space-joined, de-duplicated entity
        words in order of first appearance. (Joining a ``set`` gave an
        arbitrary, run-dependent order.)
    """
    if not text or not text.strip():
        return "", ""

    results = ner_pipeline(text)

    # Dicts serve as insertion-ordered sets.
    times, locations = {}, {}
    for ent in results:
        # Un-aggregated pipeline output uses the 'entity' key; output produced with an
        # aggregation strategy uses 'entity_group' instead. Accept both.
        label = (ent.get('entity_group') or ent.get('entity') or '').upper()
        # NOTE(review): confirm the configured NER model actually emits a TIME-like
        # label; if it does not, the first element is always "".
        if 'TIME' in label:
            times.setdefault(ent['word'])
        if 'LOC' in label:
            locations.setdefault(ent['word'])

    return " ".join(times), " ".join(locations)

def improve_text(text, prompt_template, imp_pipeline, max_new_tokens=256):
    """Ask a text-generation pipeline for a cleaned-up version of OCR text.

    Args:
        text: Raw OCR text.
        prompt_template: Template with a single ``{}`` placeholder for ``text``.
        imp_pipeline: Callable text-generation pipeline returning
            ``[{"generated_text": ...}]``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The generated continuation with surrounding whitespace stripped. Falls
        back to the original ``text`` when generation fails or yields nothing.
    """
    prompt = prompt_template.format(text)
    try:
        # return_full_text=False: by default the pipeline returns prompt + continuation,
        # which would store the whole instruction prompt as the "improved" text.
        result = imp_pipeline(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
        generated = result[0]['generated_text'].strip()
    except Exception as e:
        print(f"Text improvement error: {e}")
        return text
    # An empty generation carries no information; keep the source text instead.
    return generated if generated else text

def extract_topics(text, topic_template, imp_pipeline, max_new_tokens=128):
    """Ask a text-generation pipeline for the main topics of ``text``.

    Args:
        text: Text to analyse.
        topic_template: Template with a single ``{}`` placeholder for ``text``.
        imp_pipeline: Callable text-generation pipeline returning
            ``[{"generated_text": ...}]``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The generated continuation with surrounding whitespace stripped, or
        ``""`` when generation fails.
    """
    prompt = topic_template.format(text)
    try:
        # return_full_text=False keeps only the continuation; the default output also
        # repeats the prompt, which would pollute the stored topics.
        result = imp_pipeline(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
        return result[0]['generated_text'].strip()
    except Exception as e:
        print(f"Topic extraction error: {e}")
        return ""

def process_text(idx, lang, df_dict, imp_pipeline, prompt_template, topic_template):
    """Enrich one row of ``df_dict`` in place for a single language.

    Reads ``df_dict[idx][f"{lang}_text"]``; blank text leaves the row untouched.
    Otherwise the text is improved, its topics are extracted, the language of
    the original text is checked against the expected code, and sentiment,
    time/location entities and an embedding are computed from the improved
    text. All results are written back into the same row.
    """
    row = df_dict[idx]
    source_text = row[f"{lang}_text"]

    # Blank rows carry nothing to analyse.
    if not source_text.strip():
        return

    print(f"Processing row {idx}, language: {lang}")

    improved_text = improve_text(source_text, prompt_template, imp_pipeline)
    topics = extract_topics(improved_text, topic_template, imp_pipeline)

    # Language detection runs on the first 512 characters of the ORIGINAL text.
    detected_lang = ""
    try:
        detected_lang = lang_detector(source_text[:512])[0]['label'].lower()
    except Exception as e:
        print(f"Language detection error: {e}")

    if detected_lang == expected_language_codes[lang]:
        validation_msg = ""
    else:
        validation_msg = f"text in {lang} is not the right one"

    avg_sentiment = get_sentiment_score(improved_text)
    if avg_sentiment is None:
        sentiment_label = ""
    else:
        sentiment_label = f"{avg_sentiment:.2f} stars"

    time_extracted, location_extracted = extract_time_location(improved_text)

    # Store the embedding as a plain list; an empty list marks an encoding failure.
    embedding = []
    try:
        embedding = embedding_model.encode(improved_text).tolist()
    except Exception as e:
        print(f"Embedding error: {e}")

    row.update({
        f"improved_{lang}_text": improved_text,
        f"topics_{lang}_text": topics,
        f"{lang}_text_validation": validation_msg,
        f"sentiment_{lang}": sentiment_label,
        f"extracted_time_{lang}": time_extracted,
        f"extracted_location_{lang}": location_extracted,
        f"{lang}_embedding": embedding,
    })

def process_language(lang, df):
    """Run the per-row text processing for every non-empty text of one language.

    Args:
        lang: One of "french", "dutch" or "german".
        df: Polars DataFrame containing a ``f"{lang}_text"`` column.

    Returns:
        A Polars DataFrame. When rows were processed it is rebuilt from the
        updated row dicts (including the helper ``index`` column); when the
        language is unsupported or has no text, the frame is returned instead
        of ``None`` so callers can always reassign the result.
    """
    print(f"Processing language: {lang}")

    model_map = {
        "french": "dbddv01/gpt2-french-small",
        "dutch": "GroNLP/gpt2-small-dutch",
        "german": "dbmdz/german-gpt2"
    }

    if lang not in model_map:
        print(f"Unsupported language: {lang}")
        return df

    prompts = {
        "german": {
            "improvement": (
                "Benutzer: Der folgende Text stammt aus einem historischen Plakat der deutschen Regierung in Belgien während des Ersten Weltkriegs (1914–1918). "
                "Aufgrund von OCR-Verarbeitung kann der Text unvollständig, fehlerhaft oder schwer lesbar sein. "
                "Bitte korrigieren Sie orthografische Fehler, rekonstruieren Sie unvollständige Sätze und bewahren Sie den historischen und formellen Stil der damaligen Zeit. "
                "Achten Sie besonders auf:\n"
                "- Fraktur-Schriftfehler (z. B. „ſ” statt „s“, Buchstabenverwechslungen)\n"
                "- Offizielle Regierungs- und Militärsprache\n"
                "- Propagandabegriffe und Befehle\n\n"
                "Zu verbessernder Text:\n{}\n\n"
                "Assistent: Hier ist der überarbeitete Text mit besserer Lesbarkeit und historischer Genauigkeit:"
            ),
            "topics": (
                "Benutzer: Der folgende Text stammt aus einem Plakat der deutschen Regierung in Belgien (1914–1918). "
                "Es handelt sich wahrscheinlich um eine offizielle Bekanntmachung, ein Gesetz, eine Anordnung oder Propaganda. "
                "Analysieren Sie den Text und extrahieren Sie die wichtigsten Themen, wobei Sie darauf achten, OCR-Fehler zu berücksichtigen und zu korrigieren.\n"
                "Falls es sich um ein Regierungsdokument handelt, könnten relevante Themen sein:\n"
                "- Kriegsrecht und Besatzungspolitik\n"
                "- Rekrutierung und Wehrpflicht\n"
                "- Versorgungsengpässe und Rationierung\n"
                "- Zensur oder öffentliche Sicherheit\n\n"
                "Text:\n{}\n\n"
                "Assistent: Die Hauptthemen dieses Dokuments sind:"
            )
        },
        "dutch": {
            "improvement": (
                "Gebruiker: De volgende tekst is afkomstig van een Duitse regeringsposter uit de Eerste Wereldoorlog (1914–1918), die in België werd verspreid. "
                "De tekst kan OCR-fouten bevatten en onvolledig zijn. Gelieve de tekst te corrigeren en herstellen, terwijl de oorspronkelijke historische toon en formele stijl behouden blijven. "
                "Let vooral op:\n"
                "- Fouten veroorzaakt door gotisch schrift (Fraktur), zoals verwisselde letters\n"
                "- Strikte taal die past bij officiële bevelen of propagandateksten\n"
                "- Militaire en administratieve terminologie\n\n"
                "Te verbeteren tekst:\n{}\n\n"
                "Assistent: Hier is de gecorrigeerde en historisch accurate versie:"
            ),
            "topics": (
                "Gebruiker: De volgende tekst komt uit een Duitse regeringsaankondiging in België tijdens de Eerste Wereldoorlog (1914–1918). "
                "Dit document kan een officiële order, een wet, propaganda of een militaire verordening zijn. "
                "Haal de kernonderwerpen uit deze tekst en corrigeer OCR-gerelateerde fouten waar nodig.\n"
                "Mogelijke relevante thema’s kunnen zijn:\n"
                "- Bezettingspolitiek en militaire bevelen\n"
                "- Propaganda en informatiecontrole\n"
                "- Distributie en rantsoenering van goederen\n"
                "- Arbeidsinzet en verplichte tewerkstelling\n\n"
                "Tekst:\n{}\n\n"
                "Assistent: De belangrijkste thema’s in deze tekst zijn:"
            )
        },
        "french": {
            "improvement": (
                "Utilisateur: Ce texte provient d’une affiche officielle du gouvernement allemand en Belgique pendant la Première Guerre mondiale (1914–1918). "
                "Comme le texte a été extrait par OCR, il peut contenir des erreurs de reconnaissance et des phrases incomplètes. "
                "Corrigez et reformulez le texte en respectant son ton officiel et son contexte historique.\n"
                "Faites particulièrement attention à :\n"
                "- Corrections des erreurs de reconnaissance dues aux caractères gothiques (Fraktur)\n"
                "- Langage administratif et militaire\n"
                "- Tonalité propagandiste ou formelle\n\n"
                "Texte à améliorer :\n{}\n\n"
                "Assistant: Voici le texte révisé avec une meilleure lisibilité et fidélité historique :"
            ),
            "topics": (
                "Utilisateur: Ce document est une affiche du gouvernement allemand en Belgique (1914–1918). "
                "Il pourrait contenir des annonces officielles, des ordres militaires ou des éléments de propagande. "
                "Identifiez les thèmes principaux, tout en corrigeant d’éventuelles erreurs OCR.\n"
                "Thèmes possibles :\n"
                "- Ordres militaires et lois d’occupation\n"
                "- Contrôle des ressources et rationnement\n"
                "- Obligations de travail ou de service\n"
                "- Censure et propagande de guerre\n\n"
                "Texte :\n{}\n\n"
                "Assistant: Les principaux thèmes abordés sont :"
            )
        }
    }

    # Index by key. Tuple-unpacking the inner dict would yield its KEYS
    # ("improvement", "topics") rather than the prompt templates.
    prompt_template = prompts[lang]["improvement"]
    topic_template = prompts[lang]["topics"]

    df = df.with_columns(pl.col(f"{lang}_text").cast(pl.Utf8).fill_null(""))
    df = df.with_columns(pl.arange(0, df.height).alias("index"))

    if f"{lang}_embedding" not in df.columns:
        df = df.with_columns(pl.Series(name=f"{lang}_embedding", values=[None] * df.height))

    indices = df.filter(pl.col(f"{lang}_text") != "").get_column("index").to_list()

    if not indices:
        print(f"No texts to process for {lang}.")
        return df

    # Load the generation model only once we know there is work to do.
    imp_pipeline = pipeline("text-generation", model=model_map[lang])

    df_dict = df.to_dicts()

    for idx in indices:
        process_text(idx, lang, df_dict, imp_pipeline, prompt_template, topic_template)

        # Release memory between rows.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Scan every row when inferring the schema: a column may be null in all of the
    # first rows and only hold values (e.g. embedding lists) further down.
    df = pl.DataFrame(df_dict, infer_schema_length=None)

    del imp_pipeline
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Finished processing language: {lang}")

    return df

# Initialize new columns in the DataFrame.
for lang in ['german', 'dutch', 'french']:
    df = df.with_columns([
        pl.lit("").alias(f"{lang}_text_validation"),
        pl.lit("").alias(f"improved_{lang}_text"),
        pl.lit("").alias(f"sentiment_{lang}"),
        pl.lit("").alias(f"extracted_time_{lang}"),
        pl.lit("").alias(f"extracted_location_{lang}"),
        pl.lit("").alias(f"topics_{lang}_text"),
        # Typed null list column: unlike pl.Object it can be written to Parquet
        # and matches the float lists stored per row during processing.
        pl.lit(None, dtype=pl.List(pl.Float64)).alias(f"{lang}_embedding"),
    ])

# Process one language at a time and checkpoint to Parquet after each.
for language in ['french', 'dutch', 'german']:
    processed = process_language(language, df)
    # Keep the current frame if nothing came back for this language.
    if processed is not None:
        df = processed
    df.write_parquet("preprocessed_df.parquet")


Processing language: french
Processing row 82, language: french
Processing row 83, language: french
Processing row 84, language: french
Processing row 85, language: french
Processing row 86, language: french
Processing row 87, language: french
Processing row 88, language: french
Processing row 89, language: french
Processing row 90, language: french
Processing row 91, language: french
Processing row 92, language: french
Processing row 93, language: french
Processing row 94, language: french
Processing row 95, language: french
Processing row 96, language: french
Processing row 97, language: french
Processing row 98, language: french
Processing row 99, language: french
Processing row 100, language: french
Processing row 101, language: french
Processing row 102, language: french
Processing row 103, language: french
Processing row 104, language: french
Processing row 105, language: french
Processing row 106, language: french
Processing row 107, language: french
Processing row 108, language

In [None]:
# Reload the checkpoint written by the processing cell.
preprocessed_parquet_path = "preprocessed_df.parquet"
df = pl.read_parquet(preprocessed_parquet_path)

In [None]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# Collect one record per non-empty embedding, tagged with its language.
embedding_data = []

languages = ['french', 'dutch', 'german']

for lang in languages:
    emb_col = f"{lang}_embedding"

    # Read only the embedding column instead of materialising every row.
    for emb in df.get_column(emb_col).to_list():
        # Skip missing values AND empty lists (an empty list is stored when encoding
        # failed); a zero-length vector cannot be stacked with the others.
        if emb is None or len(emb) == 0:
            continue
        embedding_data.append({
            "embedding": list(emb),  # plain list -> Polars list column
            "language": lang
        })

if len(embedding_data) < 2:
    raise ValueError("Need at least two non-empty embeddings to compute a t-SNE projection.")

# Convert to Polars DataFrame
emb_df = pl.DataFrame(embedding_data)

# Stack the embeddings into a 2-D NumPy array for t-SNE.
X = np.vstack(emb_df["embedding"].to_list())

# Apply t-SNE. scikit-learn requires perplexity < n_samples, so cap its default
# of 30 for small datasets (no change when more than 30 samples are present).
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30.0, X.shape[0] - 1))
X_tsne = tsne.fit_transform(X)

# Add t-SNE dimensions back to Polars DataFrame
emb_df = emb_df.with_columns([
    pl.Series("tsne_1", X_tsne[:, 0]),
    pl.Series("tsne_2", X_tsne[:, 1])
])

# Convert only the plotted columns to Pandas for visualization
emb_df_pandas = emb_df.select(["language", "tsne_1", "tsne_2"]).to_pandas()

# Plot using Seaborn
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=emb_df_pandas,
    x="tsne_1",
    y="tsne_2",
    hue="language",
    palette="Set2",
    s=80,
    alpha=0.8
)
plt.title("2D Visualization of Embeddings by Language")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend(title="Language")
plt.show()

In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Function to extract a numeric sentiment from a label like "4.20 stars"
def extract_sentiment_numeric(s):
    """Return the first number found in ``s`` as a float.

    Args:
        s: Sentiment label such as "4.20 stars". Non-string values (e.g. None
            or NaN from missing cells) are treated as "no sentiment".

    Returns:
        The parsed float, or None when ``s`` is not a string or holds no number.
    """
    if not isinstance(s, str):
        return None
    m = re.search(r'(\d+(\.\d+)?)', s)
    return float(m.group(1)) if m else None

# Everything below uses the pandas API, but `df` was loaded with Polars earlier in
# the notebook. Convert it once; the hasattr guard keeps this cell re-runnable.
if hasattr(df, "to_pandas"):
    df = df.to_pandas()


def _split_topics(topics):
    """Return the stripped, non-empty comma-separated items of `topics` ([] for non-strings)."""
    if not isinstance(topics, str):
        return []
    return [item.strip() for item in topics.split(',') if item.strip()]


def _plot_bar(data, x, y, title, xlabel, ylabel, palette):
    """Draw one labelled bar chart with rotated x tick labels."""
    plt.figure(figsize=(10, 6))
    sns.barplot(data=data, x=x, y=y, palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Create numeric sentiment columns for each language (we use French as our main example here)
for _sentiment_lang in ('french', 'dutch', 'german'):
    # to_numeric(..., errors='coerce') keeps the column numeric even when every label
    # is empty, so the group means below do not fail on an object column.
    df[f'sentiment_{_sentiment_lang}_numeric'] = pd.to_numeric(
        df[f'sentiment_{_sentiment_lang}'].fillna("").apply(extract_sentiment_numeric),
        errors='coerce',
    )

### 1. Analyze Sentiment by Extracted Time (French)
# Group by the extracted_time_french column and compute the average sentiment.
# (Note: extracted_time_french is a string; in a real scenario, you might need to parse dates.)
time_group = df.groupby('extracted_time_french')['sentiment_french_numeric'].mean().reset_index()

_plot_bar(
    time_group, 'extracted_time_french', 'sentiment_french_numeric',
    "Average French Sentiment by Extracted Time",
    "Extracted Time", "Average Sentiment (stars)", "Blues_d",
)

### 2. Analyze Sentiment by Extracted Location (French)
loc_group = df.groupby('extracted_location_french')['sentiment_french_numeric'].mean().reset_index()

_plot_bar(
    loc_group, 'extracted_location_french', 'sentiment_french_numeric',
    "Average French Sentiment by Extracted Location",
    "Extracted Location", "Average Sentiment (stars)", "Greens_d",
)

### 3. Analyze Topics and Their Relation to Sentiment (French)
# Assume that topics_french_text is a comma-separated string of topics.
topic_list = []
for topics in df['topics_french_text']:
    topic_list.extend(_split_topics(topics))

# Count frequency of each topic.
topic_counts = Counter(topic_list)
topic_df = pd.DataFrame(list(topic_counts.items()), columns=['topic', 'count'])
# Take top 10 topics by frequency.
top_topics = topic_df.sort_values(by='count', ascending=False).head(10)

_plot_bar(
    top_topics, 'topic', 'count',
    "Top 10 French Topics by Frequency",
    "Topic", "Frequency", "Oranges_d",
)

# Now, compute average sentiment per topic (rows without a sentiment are skipped).
topic_sentiments = {}
for topics, sentiment in zip(df['topics_french_text'], df['sentiment_french_numeric']):
    if pd.isna(sentiment):
        continue
    for topic in _split_topics(topics):
        topic_sentiments.setdefault(topic, []).append(sentiment)

# Compute the average sentiment for each topic.
topic_avg_sentiment = {topic: sum(vals) / len(vals) for topic, vals in topic_sentiments.items()}
topic_sentiment_df = pd.DataFrame(list(topic_avg_sentiment.items()), columns=['topic', 'avg_sentiment'])

# Merge with frequency to filter for top topics by frequency.
topic_sentiment_df = topic_sentiment_df.merge(topic_df, on='topic')
# Filter to top 10 topics (by frequency)
top_topic_sentiment = topic_sentiment_df.sort_values(by='count', ascending=False).head(10)

_plot_bar(
    top_topic_sentiment, 'topic', 'avg_sentiment',
    "Average French Sentiment by Top 10 Topics",
    "Topic", "Average Sentiment (stars)", "Purples_d",
)

In [None]:
# Install the `wordcloud` package imported by the word-cloud cell further down.
# No version is pinned here, so the installed release may differ between runs.
%pip install wordcloud

In [None]:
%python -m spacy download fr_core_news_sm
%python -m spacy download nl_core_news_sm
%python -m spacy download de_core_news_sm

In [None]:
import spacy
import re
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Load the small spaCy pipelines, in order: French, Dutch, German.
nlp_fr, nlp_nl, nlp_de = (
    spacy.load(model_name)
    for model_name in ("fr_core_news_sm", "nl_core_news_sm", "de_core_news_sm")
)

def preprocess_text(text, language):
    """
    Normalise text for word-cloud generation with the matching spaCy model.

    Stop words and punctuation tokens are dropped; every remaining token is
    replaced by its lowercased, whitespace-stripped lemma, and the non-empty
    lemmas are joined with single spaces. For any language other than
    "french", "dutch" or "german" the text is only lowercased.
    """
    # Fallback: no spaCy model for this language.
    if language not in ("french", "dutch", "german"):
        return text.lower()

    # Pick the model lazily so only the requested one is looked up.
    nlp = nlp_fr if language == "french" else (nlp_nl if language == "dutch" else nlp_de)

    lemmas = (
        tok.lemma_.lower().strip()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )
    return " ".join(lemma for lemma in lemmas if lemma)

languages = ['french', 'dutch', 'german']

for lang in languages:
    # Collect the non-blank improved texts for the current language. Iterating the
    # column directly works whether `df` is a Polars or a pandas DataFrame.
    documents = [
        doc for doc in df[f"improved_{lang}_text"]
        if isinstance(doc, str) and doc.strip()
    ]

    # Preprocess each document separately: one concatenated string can exceed
    # spaCy's per-document length limit and needs far more memory to parse.
    processed_docs = (preprocess_text(doc, lang) for doc in documents)
    processed_text = " ".join(piece for piece in processed_docs if piece)

    # WordCloud.generate() raises when there are no words to draw.
    if not processed_text:
        print(f"No text available for {lang}; skipping word cloud.")
        continue

    # Generate the word cloud.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        collocations=False,
        stopwords=STOPWORDS  # additional stopwords can be added if needed
    ).generate(processed_text)

    # Plot the word cloud.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"WordCloud for {lang.capitalize()}")
    plt.show()