<a href="https://colab.research.google.com/github/Romainkul/Datathon_KUL/blob/main/Datathon_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import requests
from zipfile import ZipFile

# Remote location of the poster archive, and where it is stored and unpacked locally.
url = "https://kuleuven-datathon-2025.s3.eu-central-1.amazonaws.com/Posters.zip"
base_dir = "/Posters"
zip_path = "/Posters.zip"

# Ensure the base directory exists.
os.makedirs(base_dir, exist_ok=True)

# Download the ZIP file only if it doesn't already exist.
if not os.path.exists(zip_path):
    print("Downloading dataset...")
    # Stream into a temporary ".part" file and rename it only once the download
    # has finished. This keeps the archive out of memory and guarantees that an
    # interrupted download never leaves a truncated file at zip_path (which the
    # existence check above would otherwise accept on the next run).
    partial_path = zip_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Raise an error for a bad status.
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, zip_path)
    finally:
        # Only present here if the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete.")
else:
    print("Zip file already exists. Skipping download.")

# Check if the dataset has been extracted by looking for the 'raw' folder.
raw_dir = os.path.join(base_dir, 'raw')
if not os.path.exists(raw_dir):
    print("Extracting dataset...")
    with ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(base_dir)
    print("Extraction complete.")
else:
    print("Dataset already extracted.")

# Define the paths for the processed images and texts.
processed_dir = os.path.join(base_dir, 'preprocessed')
texts_dir = os.path.join(base_dir, 'texts')

# Map each raw subfolder to the languages that are expected to be available.
folder_language_map = {
    'double_page_german_dutch': ['german', 'dutch'],
    'multipart_dutch': ['dutch'],
    'multipart_french': ['french'],
    'single_patch_dutch': ['dutch'],
    'triple_page_german_dutch_french': ['german', 'dutch', 'french'],
}

# List of valid image extensions.
image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')

# Languages that get a "<lang>_processed" and "<lang>_text" column in every record.
record_languages = ('german', 'dutch', 'french')

# Fixed column layout, so the DataFrame has the same schema even when no image is found.
record_columns = (
    ['raw']
    + [f"{lang}_processed" for lang in record_languages]
    + [f"{lang}_text" for lang in record_languages]
)

data_records = []

# Walk the raw subfolders in sorted order: os.listdir() returns entries in an
# arbitrary order, which would make any later row subset (e.g. df.head(n))
# differ between machines and runs.
for folder in sorted(os.listdir(raw_dir)):
    folder_path = os.path.join(raw_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    # Languages expected for this folder; unknown folders get none.
    languages = folder_language_map.get(folder, [])

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_extensions):
            continue

        # Full path to the raw image file.
        raw_file_path = os.path.join(folder_path, filename)

        # Processed image path and transcript per language ('' when not applicable).
        processed_paths = {}
        texts = {}
        for lang in record_languages:
            if lang not in languages:
                # Language not present for this raw file.
                processed_paths[lang] = ''
                texts[lang] = ''
                continue

            # The processed image is assumed to keep the raw file's name.
            processed_paths[lang] = os.path.join(processed_dir, lang, filename)

            # Transcripts are looked up as "<base name>.jpg.txt".
            # NOTE(review): presumably this is the dataset's naming scheme, also
            # for raw images whose extension is not .jpg - confirm.
            base_name, _ = os.path.splitext(filename)
            text_file_path = os.path.join(texts_dir, lang, base_name + '.jpg.txt')
            try:
                with open(text_file_path, 'r', encoding='utf-8') as tf:
                    texts[lang] = tf.read()
            except FileNotFoundError:
                # A missing transcript is expected for some images; not an error.
                texts[lang] = ''
            except (OSError, UnicodeError) as e:
                print(f"Error reading {text_file_path}: {e}")
                texts[lang] = ''

        # Create a record (dictionary) for this raw file.
        record = {'raw': raw_file_path}
        for lang in record_languages:
            record[f"{lang}_processed"] = processed_paths[lang]
        for lang in record_languages:
            record[f"{lang}_text"] = texts[lang]
        data_records.append(record)

# Create the DataFrame from the collected records.
df = pd.DataFrame(data_records, columns=record_columns)

# Preview the DataFrame.
#print(df.head())


Downloading dataset...
Download complete.
Extracting dataset...
Extraction complete.


In [5]:
# Rebuild df from data_records; this discards any changes made to df since it was first created.
df = pd.DataFrame(data_records)

In [6]:
# Keep only the first five rows for a quick trial run. The explicit copy makes
# df an independent frame, so the columns added later are written to it rather
# than to a slice of the full DataFrame.
df = df.head(5).copy()

In [3]:
#df = df[df["french_text"].str.strip() != ''].head(5)

In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize pipelines.
lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
# aggregation_strategy="simple" makes the pipeline merge word pieces into whole
# entities and report their label under 'entity_group', which is the key the
# entity-extraction code in this notebook reads. Without it the results only
# carry per-token 'entity' labels.
ner_pipeline = pipeline(
    "token-classification",
    model="Babelscape/wikineural-multilingual-ner",
    aggregation_strategy="simple",
)

# One text-generation model per language, used for OCR clean-up and topic extraction.
improve_pipeline_fr = pipeline("text-generation", model="croissantllm/CroissantLLMChat-v0.1")
improve_pipeline_nl = pipeline("text-generation", model="BramVanroy/fietje-2")
# NOTE(review): the repository name suggests a 70B checkpoint in GGUF format;
# confirm that it can be loaded this way and fits on the target hardware.
improve_pipeline_de = pipeline("text-generation", model="TheBloke/KafkaLM-70B-German-V0.1-GGUF")

sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Initialize a multilingual embedding model.
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Language code the detector is expected to report for each language column.
expected_language_codes = {
    "german": "de",
    "dutch": "nl",
    "french": "fr"
}

def improve_text(text, language):
    """
    Improve OCR text using a language-specific generation model.

    Args:
        text: Raw OCR text.
        language: One of "french", "dutch" or "german".

    Returns:
        The generated clean-up of ``text``. The input is returned unchanged when
        it is blank, when the language is not supported, when generation fails,
        or when the model produces no output.
    """
    if not text.strip():
        return text

    if language == "french":
        label = "French"
        instruction = (
            "Améliorez le texte OCR suivant pour le rendre cohérent et continu, "
            "tout en préservant son sens original : "
        )
    elif language == "dutch":
        label = "Dutch"
        instruction = (
            "Verbeter de volgende OCR-tekst zodat deze coherent en doorlopend is, "
            "terwijl de oorspronkelijke betekenis behouden blijft : "
        )
    elif language == "german":
        label = "German"
        instruction = (
            "Verbessern Sie den folgenden OCR-Text, um ihn kohärent und fließend zu machen, "
            "während der ursprüngliche Sinn erhalten bleibt: "
        )
    else:
        return text

    try:
        if language == "french":
            generator = improve_pipeline_fr
        elif language == "dutch":
            generator = improve_pipeline_nl
        else:
            generator = improve_pipeline_de
        # return_full_text=False: by default the pipeline echoes the prompt in
        # 'generated_text', which would put the instruction sentence into the
        # "improved" text and into everything computed from it afterwards.
        result = generator(instruction + text, return_full_text=False)
        improved = result[0]['generated_text'].strip()
    except Exception as e:
        # Best effort: any failure falls back to the original text.
        print(f"Error improving {label} text: {e}")
        return text

    # Never replace the original text with an empty generation.
    return improved or text

def extract_time_location(text):
    """
    Run NER on the text and extract entities labeled as TIME/DATE and LOC/GPE.

    Args:
        text: Text to analyse.

    Returns:
        A ``(times, locations)`` tuple of space-joined entity strings. Duplicates
        are removed and the order of first appearance is kept, so the result is
        the same on every run.
    """
    ner_results = ner_pipeline(text)
    # Debug prints can be commented out.
    print(ner_results)

    times = []
    locations = []
    for entity in ner_results:
        # Aggregated results carry 'entity_group'; non-aggregated results carry a
        # per-token 'entity' such as "B-LOC". Accept both so the function works
        # however the pipeline was configured.
        label = entity.get('entity_group') or entity.get('entity') or ''
        if label[:2] in ('B-', 'I-'):
            label = label[2:]
        # NOTE(review): confirm that the configured NER model actually emits
        # TIME/DATE and GPE labels; if it does not, those results stay empty.
        if label in ('TIME', 'DATE'):
            times.append(entity['word'])
        if label in ('LOC', 'GPE'):
            locations.append(entity['word'])

    # dict.fromkeys de-duplicates while preserving order (a set would not).
    return " ".join(dict.fromkeys(times)), " ".join(dict.fromkeys(locations))

def get_sentiment_score(long_text, chunk_size=300):
    """
    Average the star rating predicted for consecutive word-based chunks of a text.

    The text is split on whitespace and regrouped into pieces of at most
    ``chunk_size`` words. Each piece is classified on its own; the leading
    number of the returned label (e.g. "4 stars") is taken as its score.
    Pieces whose classification fails are reported and left out.

    Returns the mean score, or None when no piece produced a score.
    """
    tokens = long_text.split()
    star_ratings = []

    for start in range(0, len(tokens), chunk_size):
        piece = " ".join(tokens[start:start + chunk_size])
        try:
            prediction = sentiment_pipeline(piece, truncation=True, max_length=512)
            star_ratings.append(int(prediction[0]['label'].split()[0]))
        except Exception as e:
            print("Error during sentiment analysis on chunk:", e)

    if not star_ratings:
        return None
    return sum(star_ratings) / len(star_ratings)

def extract_topics(text, language):
    """
    Use a language-specific text-generation prompt to extract the main topics.

    Args:
        text: Text to analyse.
        language: One of "french", "dutch" or "german".

    Returns:
        The generated topic description, or "" when the text is blank, the
        language is not supported, or generation fails.
    """
    if not text.strip():
        return ""

    if language == "french":
        label = "French"
        instruction = "Extrayez les principaux sujets abordés dans le texte suivant : "
    elif language == "dutch":
        label = "Dutch"
        instruction = "Extraheer de belangrijkste onderwerpen uit de volgende tekst: "
    elif language == "german":
        label = "German"
        instruction = "Extrahiere die Hauptthemen aus dem folgenden Text: "
    else:
        return ""

    try:
        if language == "french":
            generator = improve_pipeline_fr
        elif language == "dutch":
            generator = improve_pipeline_nl
        else:
            generator = improve_pipeline_de
        # return_full_text=False: by default the pipeline echoes the prompt in
        # 'generated_text', so the "topics" would start with the instruction
        # followed by the whole input text.
        result = generator(instruction + text, return_full_text=False)
        return result[0]['generated_text'].strip()
    except Exception as e:
        print(f"Error extracting topics for {label} text: {e}")
        return ""

# Initialize new columns in the DataFrame.
for lang in ['german', 'dutch', 'french']:
    df[f"{lang}_text_validation"] = ""
    df[f"improved_{lang}_text"] = ""
    df[f"sentiment_{lang}"] = ""
    df[f"extracted_time_{lang}"] = ""
    df[f"extracted_location_{lang}"] = ""
    df[f"topics_{lang}_text"] = ""
    # New column to store embeddings.
    df[f"{lang}_embedding"] = None

# Process each row in the DataFrame.
for idx, row in df.iterrows():
    for lang in ['german', 'dutch', 'french']:
        text = row[f"{lang}_text"]
        # Skip missing or blank transcripts. The isinstance check also covers
        # NaN cells, which are truthy but have no .strip().
        if not isinstance(text, str) or not text.strip():
            continue

        # Validate that the transcript is written in the language of its column.
        try:
            # Only the first 512 characters are inspected; truncation=True keeps
            # the tokenized input within the model's limit.
            detection = lang_detector(text[:512], truncation=True)
            detected_lang = detection[0]['label'].lower()
        except Exception as e:
            print(f"Error during language detection for row {idx}, {lang}: {e}")
            detected_lang = ""

        expected = expected_language_codes[lang]
        if detected_lang != expected:
            # Unvalidated text is kept as-is (no rewriting, no topic extraction).
            # A detector failure is reported separately from a real mismatch so
            # that the validation column does not claim a wrong language that
            # was never detected.
            if detected_lang:
                df.at[idx, f"{lang}_text_validation"] = f"text in {lang} is not the right one"
            else:
                df.at[idx, f"{lang}_text_validation"] = f"language detection failed for {lang} text"
            df.at[idx, f"improved_{lang}_text"] = text
            try:
                sentiment_result = sentiment_pipeline(text, truncation=True, max_length=512)
                sentiment_label = sentiment_result[0]['label']
            except Exception as e:
                print(f"Error during sentiment analysis for row {idx}, {lang}: {e}")
                sentiment_label = ""
            df.at[idx, f"sentiment_{lang}"] = sentiment_label
        else:
            df.at[idx, f"{lang}_text_validation"] = ""
            improved = improve_text(text, lang)
            df.at[idx, f"improved_{lang}_text"] = improved

            try:
                avg_sentiment = get_sentiment_score(improved)
                sentiment_label = f"{avg_sentiment:.2f} stars" if avg_sentiment is not None else ""
            except Exception as e:
                print(f"Error during sentiment analysis for row {idx}, {lang}: {e}")
                sentiment_label = ""
            df.at[idx, f"sentiment_{lang}"] = sentiment_label

            # Extract topics.
            topics = extract_topics(improved, lang)
            df.at[idx, f"topics_{lang}_text"] = topics

        # NER and embedding both work on whatever text ended up in the
        # improved column (rewritten or original).
        improved_text = df.at[idx, f"improved_{lang}_text"]
        if improved_text.strip():
            # Apply NER extraction for this language.
            time_extracted, location_extracted = extract_time_location(improved_text)
            df.at[idx, f"extracted_time_{lang}"] = time_extracted
            df.at[idx, f"extracted_location_{lang}"] = location_extracted

            # Compute a multilingual embedding for the improved text.
            try:
                embedding = embedding_model.encode(improved_text)
                df.at[idx, f"{lang}_embedding"] = embedding
            except Exception as e:
                print(f"Error computing embedding for row {idx}, {lang}: {e}")
                df.at[idx, f"{lang}_embedding"] = None

print("\nDataFrame after preprocessing enhancements (including topic extraction, NER, and embeddings for all languages):")
print(df.head())

model.safetensors:  30%|###       | 336M/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

In [None]:
# Collect every available embedding together with the language it belongs to.
embedding_data = []
languages = ['french', 'dutch', 'german']

for lang in languages:
    emb_col = f"{lang}_embedding"
    for emb in df[emb_col]:
        if emb is not None:  # Ensure we have an embedding.
            embedding_data.append({
                "embedding": np.array(emb),
                "language": lang
            })

# Explicit columns keep the frame usable even when no embedding was collected.
emb_df = pd.DataFrame(embedding_data, columns=["embedding", "language"])
n_samples = len(emb_df)

if n_samples < 2:
    # A 2D projection of fewer than two points carries no information.
    print("Not enough embeddings for a t-SNE projection; skipping plot.")
else:
    X = np.vstack(emb_df['embedding'].values)

    # t-SNE requires perplexity < n_samples. Keep the default of 30 for large
    # inputs and shrink it for small ones (e.g. a handful of trial rows).
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1))
    X_tsne = tsne.fit_transform(X)

    emb_df['tsne_1'] = X_tsne[:, 0]
    emb_df['tsne_2'] = X_tsne[:, 1]

    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        data=emb_df,
        x='tsne_1',
        y='tsne_2',
        hue='language',
        palette='Set2',
        s=80,
        alpha=0.8
    )
    plt.title("2D Visualization of Embeddings by Language")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.legend(title="Language")
    plt.show()


In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Function to extract a numeric sentiment from a label like "4.20 stars"
def extract_sentiment_numeric(s):
    """
    Return the first number found in a sentiment label as a float.

    Args:
        s: A label such as "4.20 stars" or "3 stars".

    Returns:
        The parsed number, or None when ``s`` is not a string (e.g. None or a
        NaN cell) or contains no number.
    """
    # re.search raises TypeError on non-strings, which would abort a whole
    # DataFrame.apply() on the first missing value.
    if not isinstance(s, str):
        return None
    m = re.search(r'\d+(?:\.\d+)?', s)
    return float(m.group()) if m else None

def _show_barplot(data, x, y, palette, title, xlabel, ylabel):
    """Draw one bar chart, or report that there is nothing to draw."""
    if data.empty:
        print(f"No data available for plot: {title}")
        return
    plt.figure(figsize=(10, 6))
    sns.barplot(data=data, x=x, y=y, palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def _split_topics(topics):
    """Split a comma-separated topic string into stripped, non-empty topics."""
    if not isinstance(topics, str):
        return []
    return [part.strip() for part in topics.split(',') if part.strip()]


def _mean_sentiment_by(frame, key_col, value_col):
    """Average value_col per key_col, ignoring rows whose key is blank."""
    has_key = frame[key_col].fillna('').str.strip().ne('')
    return frame[has_key].groupby(key_col)[value_col].mean().reset_index()


# Create numeric sentiment columns for each language (we use French as our main example here).
# pd.to_numeric forces a float column even when every label is empty; an
# object column of None values cannot be averaged.
for lang in ['french', 'dutch', 'german']:
    df[f'sentiment_{lang}_numeric'] = pd.to_numeric(
        df[f'sentiment_{lang}'].apply(extract_sentiment_numeric), errors='coerce'
    )

### 1. Analyze Sentiment by Extracted Time (French)
# Group by the extracted_time_french column and compute the average sentiment.
# Rows without an extracted time are left out instead of forming an unnamed "" group.
# (Note: extracted_time_french is a string; in a real scenario, you might need to parse dates.)
time_group = _mean_sentiment_by(df, 'extracted_time_french', 'sentiment_french_numeric')

_show_barplot(
    time_group, 'extracted_time_french', 'sentiment_french_numeric', "Blues_d",
    "Average French Sentiment by Extracted Time", "Extracted Time", "Average Sentiment (stars)",
)

### 2. Analyze Sentiment by Extracted Location (French)
loc_group = _mean_sentiment_by(df, 'extracted_location_french', 'sentiment_french_numeric')

_show_barplot(
    loc_group, 'extracted_location_french', 'sentiment_french_numeric', "Greens_d",
    "Average French Sentiment by Extracted Location", "Extracted Location", "Average Sentiment (stars)",
)

### 3. Analyze Topics and Their Relation to Sentiment (French)
# Assume that topics_french_text is a comma-separated string of topics.
topic_list = [
    topic
    for topics in df['topics_french_text']
    for topic in _split_topics(topics)
]

# Count frequency of each topic.
topic_counts = Counter(topic_list)
topic_df = pd.DataFrame(list(topic_counts.items()), columns=['topic', 'count'])
# Take top 10 topics by frequency.
top_topics = topic_df.sort_values(by='count', ascending=False).head(10)

_show_barplot(
    top_topics, 'topic', 'count', "Oranges_d",
    "Top 10 French Topics by Frequency", "Topic", "Frequency",
)

# Now, compute average sentiment per topic.
topic_sentiments = {}
for idx, row in df.iterrows():
    sentiment = extract_sentiment_numeric(row['sentiment_french'])
    if sentiment is None:
        continue
    # _split_topics applies the same string check as the frequency count above,
    # so a non-string cell is skipped instead of raising on .split().
    for topic in _split_topics(row['topics_french_text']):
        topic_sentiments.setdefault(topic, []).append(sentiment)

# Compute the average sentiment for each topic.
topic_avg_sentiment = {topic: sum(vals) / len(vals) for topic, vals in topic_sentiments.items()}
topic_sentiment_df = pd.DataFrame(list(topic_avg_sentiment.items()), columns=['topic', 'avg_sentiment'])

# Merge with frequency to filter for top topics by frequency.
topic_sentiment_df = topic_sentiment_df.merge(topic_df, on='topic')
# Filter to top 10 topics (by frequency)
top_topic_sentiment = topic_sentiment_df.sort_values(by='count', ascending=False).head(10)

_show_barplot(
    top_topic_sentiment, 'topic', 'avg_sentiment', "Purples_d",
    "Average French Sentiment by Top 10 Topics", "Topic", "Average Sentiment (stars)",
)

In [None]:
!pip install wordcloud

In [None]:
!python -m spacy download fr_core_news_sm
!python -m spacy download nl_core_news_sm
!python -m spacy download de_core_news_sm

In [None]:
import spacy
import re
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Load one small spaCy pipeline per poster language (French, Dutch, German).
nlp_fr, nlp_nl, nlp_de = (
    spacy.load(model_name)
    for model_name in ("fr_core_news_sm", "nl_core_news_sm", "de_core_news_sm")
)

def preprocess_text(text, language):
    """
    Normalize text for word-frequency work with the spaCy model of its language.

    Stop words and punctuation tokens are dropped; every remaining token is
    replaced by its lowercased, whitespace-stripped lemma, and tokens whose
    lemma ends up empty are discarded. The surviving lemmas are joined with
    single spaces.

    For a language other than "french", "dutch" or "german" the text is only
    lowercased.
    """
    if language not in ("french", "dutch", "german"):
        return text.lower()  # fallback

    models = {"french": nlp_fr, "dutch": nlp_nl, "german": nlp_de}
    parsed = models[language](text)

    lemmas = (
        token.lemma_.lower().strip()
        for token in parsed
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(lemma for lemma in lemmas if lemma)

languages = ['french', 'dutch', 'german']

for lang in languages:
    # Combine all improved texts for the current language (non-string cells are ignored).
    text_data = " ".join(
        value for value in df[f"improved_{lang}_text"].dropna() if isinstance(value, str)
    )

    # Preprocess the text thoroughly.
    processed_text = preprocess_text(text_data, lang)

    # A language may have no usable text at all (e.g. in a small trial subset).
    # WordCloud.generate() raises on empty input, so skip instead of aborting
    # the remaining languages.
    if not processed_text.strip():
        print(f"No text available for {lang}; skipping word cloud.")
        continue

    # Generate the word cloud.
    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color="white",
            collocations=False,
            stopwords=STOPWORDS  # additional stopwords can be added if needed
        ).generate(processed_text)
    except ValueError as e:
        # Raised when no word is left after WordCloud's own filtering.
        print(f"Could not build word cloud for {lang}: {e}")
        continue

    # Plot the word cloud.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"WordCloud for {lang.capitalize()}")
    plt.show()