In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the preprocessed CSV into a DataFrame
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Draw the boolean missing-value mask as a heatmap: one heatmap column per
# DataFrame column (labelled), one unlabelled heatmap row per record, and no
# colour bar.
missing_mask = data.isnull()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    missing_mask,
    cbar=False,
    cmap='viridis',
    yticklabels=False,
    xticklabels=True,
    ax=ax,
)
ax.set_title('Missing Data Heatmap')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
plt.show()

In [None]:

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'  # Update with the correct file path
data = pd.read_csv(file_path)


def _split_topics(topic_text):
    """Turn one comma-separated topic string into a list of clean topic names.

    Whitespace around each topic is stripped and empty fragments (for example
    from a trailing comma) are dropped, so "a, b" and "a,b" produce the same
    topics. Anything that is not a string (such as a missing value) becomes
    None so that a later ``dropna()`` removes it.
    """
    if not isinstance(topic_text, str):
        return None
    return [topic.strip() for topic in topic_text.split(',') if topic.strip()]


def _count_topics(split_column):
    """Flatten a column of topic lists and count how often each topic occurs."""
    flat_topics = [topic for topics in split_column.dropna() for topic in topics]
    # An explicit dtype keeps an empty result well-defined.
    return flat_topics, pd.Series(flat_topics, dtype='object').value_counts()


# Split topics by commas and clean up the data
data['dutch_topics_split'] = data['topics_dutch_text'].map(_split_topics)
data['german_topics_split'] = data['topics_german_text'].map(_split_topics)
data['french_topics_split'] = data['topics_french_text'].map(_split_topics)

# Flatten the lists into individual topics and count the occurrences of each
dutch_topics, dutch_topic_counts = _count_topics(data['dutch_topics_split'])
german_topics, german_topic_counts = _count_topics(data['german_topics_split'])
french_topics, french_topic_counts = _count_topics(data['french_topics_split'])

# Plot the ten most frequent topics of each language side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ('Top Dutch Topics', dutch_topic_counts, 'lightblue'),
    ('Top German Topics', german_topic_counts, 'lightgreen'),
    ('Top French Topics', french_topic_counts, 'salmon'),
]
for ax, (title, counts, color) in zip(axes, panels):
    top_counts = counts.head(10)
    if top_counts.empty:
        # A bar plot of an empty Series raises; show a placeholder instead.
        ax.text(0.5, 0.5, 'No topics found', ha='center', va='center', transform=ax.transAxes)
    else:
        top_counts.plot(kind='bar', color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Topic')
    ax.set_ylabel('Count')

# Adjust layout for better display
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

# Read the preprocessed CSV into a DataFrame.
# The path is written as a raw string so its backslashes are kept literally.
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function to perform sentiment analysis using TextBlob
def analyze_sentiment(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from the sign of TextBlob's polarity score:
    above zero is 'Positive', below zero is 'Negative', exactly zero is
    'Neutral'.

    Parameters
    ----------
    text : object
        The cell value to score. Only non-blank strings are passed to
        TextBlob.

    Returns
    -------
    str
        One of 'Positive', 'Negative' or 'Neutral'.

    NOTE(review): TextBlob's default sentiment analyzer is, as far as its
    documentation indicates, built for English text. Confirm the polarity is
    meaningful for the Dutch, German and French columns this is applied to.
    """
    # Missing values (NaN/None), non-string cells and blank strings have no
    # text to score. Report them as neutral instead of handing them to
    # TextBlob, which only accepts strings.
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Apply sentiment analysis to the texts in Dutch, German, and French
data['dutch_sentiment'] = data['dutch_text'].apply(analyze_sentiment)
data['german_sentiment'] = data['german_text'].apply(analyze_sentiment)
data['french_sentiment'] = data['french_text'].apply(analyze_sentiment)

# Count the occurrences of each sentiment in each language.
# value_counts() alone sorts by frequency and omits absent classes, so the
# bars would appear in a different order (or be missing) from panel to panel.
# Reindexing to one fixed class order keeps the three panels comparable.
sentiment_order = ['Positive', 'Neutral', 'Negative']
dutch_sentiment_counts = data['dutch_sentiment'].value_counts().reindex(sentiment_order, fill_value=0)
german_sentiment_counts = data['german_sentiment'].value_counts().reindex(sentiment_order, fill_value=0)
french_sentiment_counts = data['french_sentiment'].value_counts().reindex(sentiment_order, fill_value=0)

# Plot the distribution of sentiments for each language, one panel per language
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    ('Sentiment Distribution in Dutch', dutch_sentiment_counts, 'lightblue'),
    ('Sentiment Distribution in German', german_sentiment_counts, 'lightgreen'),
    ('Sentiment Distribution in French', french_sentiment_counts, 'salmon'),
]
for ax, (title, counts, color) in zip(axes, panels):
    counts.plot(kind='bar', color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')

# Adjust layout for better display
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA  # Make sure to import PCA
import umap

# Load the dataset
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Combine the non-missing Dutch, German and French texts into one list of documents
all_text = (
    data['dutch_text'].dropna().tolist()
    + data['german_text'].dropna().tolist()
    + data['french_text'].dropna().tolist()
)

# Vectorize the text using CountVectorizer (bag-of-words counts, at most 1000 terms)
# NOTE(review): stop_words='english' is applied to a corpus built from the
# Dutch, German and French columns - confirm this is the intended stop list.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(all_text)

# Apply LDA for topic modeling
n_topics = 5  # Number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Get the topic with the highest probability for each document
topic_assignments = lda.transform(X)
dominant_topics = topic_assignments.argmax(axis=1)

# Reduce the count vectors with PCA (up to 50 dimensions) before applying UMAP.
# The component count is capped by the matrix shape so a small corpus or
# vocabulary does not make PCA raise. The seed matches the LDA and UMAP steps
# so the whole pipeline is repeatable.
dense_counts = X.toarray()
n_pca_components = min(50, *dense_counts.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
reduced_embeddings = pca.fit_transform(dense_counts)

# Apply UMAP for 2D visualization
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_result = umap_model.fit_transform(reduced_embeddings)

# Plot the UMAP visualization with colors based on topic
plt.figure(figsize=(10, 8))

# One scatter series per topic; a boolean mask selects that topic's documents
for topic in range(n_topics):
    in_topic = dominant_topics == topic
    plt.scatter(umap_result[in_topic, 0], umap_result[in_topic, 1], label=f'Topic {topic+1}', alpha=0.5, s=50)

plt.title('UMAP Visualization of Text Embeddings by Topic')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.gensim  # Updated import for pyLDAvis

# Load the dataset
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Combine the non-missing Dutch, German and French texts into one list of documents
all_text = (
    data['dutch_text'].dropna().tolist()
    + data['german_text'].dropna().tolist()
    + data['french_text'].dropna().tolist()
)

# Vectorize the text using CountVectorizer (you can also use TfidfVectorizer for TF-IDF)
# NOTE(review): stop_words='english' is applied to a corpus built from the
# Dutch, German and French columns - confirm this is the intended stop list.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)  # Cap the vocabulary at 1000 terms
X = vectorizer.fit_transform(all_text)

# Apply LDA for topic modeling
n_topics = 5  # You can change the number of topics here
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Display the top words for each topic.
# The vocabulary is looked up once rather than once per printed word, and the
# indices are reversed so the highest-weighted word comes first.
feature_names = vectorizer.get_feature_names_out()
for index, topic in enumerate(lda.components_):
    print(f"Topic {index + 1}:")
    top_word_ids = topic.argsort()[::-1][:10]  # Top 10 words per topic, strongest first
    print([feature_names[i] for i in top_word_ids])
    print()

# Visualize the topics using pyLDAvis
# NOTE(review): the visible cell ends after this call; no pyLDAvis
# visualization is prepared or displayed in the code shown here.
pyLDAvis.enable_notebook()  # Enable pyLDAvis output inside the Jupyter notebook


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap

# Read the preprocessed CSV (path given as a raw string) into a DataFrame
file_path = r'C:\Users\alisa\Downloads\preprocessed_data - preprocessed_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function to parse embedding string into numpy array
def parse_embedding(embedding_str):
    """Parse a bracketed list of numbers into a 1-D float numpy array.

    Accepts values separated by whitespace (including newlines), by commas,
    or by both, e.g. ``"[0.1 0.2]"`` or ``"[0.1, 0.2]"``. Whitespace around
    the brackets is ignored.

    Parameters
    ----------
    embedding_str : str
        Text representation of the vector.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per number; empty for ``"[]"``.

    Raises
    ------
    ValueError
        If any token is not a valid number, so a malformed vector is
        reported instead of being returned partially parsed.
    """
    # Trim outer whitespace first; otherwise a trailing newline or space
    # would shield the closing bracket from the bracket strip.
    cleaned = embedding_str.strip().strip('[]')
    # Treat commas as separators too, then split on any run of whitespace.
    tokens = cleaned.replace(',', ' ').split()
    return np.array(tokens, dtype=float)

# Parse every non-missing embedding string of each language column into an array
dutch_embeddings = [parse_embedding(raw) for raw in data['dutch_embedding'].dropna()]
german_embeddings = [parse_embedding(raw) for raw in data['german_embedding'].dropna()]
french_embeddings = [parse_embedding(raw) for raw in data['french_embedding'].dropna()]

# The longest parsed embedding across all three languages sets the common length
embedding_length = max(map(len, dutch_embeddings + german_embeddings + french_embeddings))

# Pad or truncate embeddings to the same length
def adjust_embedding_length(embeddings, target_length):
    """Return a list with every embedding brought to ``target_length`` entries.

    Embeddings shorter than ``target_length`` are right-padded with constant
    (zero) values via ``np.pad``; all others are sliced to their first
    ``target_length`` entries. The input sequence itself is not modified.
    """
    adjusted = []
    for vector in embeddings:
        shortfall = target_length - len(vector)
        if shortfall > 0:
            # Too short: append `shortfall` padding values at the end.
            adjusted.append(np.pad(vector, (0, shortfall), mode='constant'))
        else:
            # Long enough: keep only the leading target_length entries.
            adjusted.append(vector[:target_length])
    return adjusted

# Bring every embedding of each language to the common length
dutch_embeddings_adjusted = adjust_embedding_length(dutch_embeddings, embedding_length)
german_embeddings_adjusted = adjust_embedding_length(german_embeddings, embedding_length)
french_embeddings_adjusted = adjust_embedding_length(french_embeddings, embedding_length)

# Combine all embeddings into one array (Dutch rows, then German, then French)
all_embeddings_adjusted = np.array(dutch_embeddings_adjusted + german_embeddings_adjusted + french_embeddings_adjusted)

# Reduce the embeddings with PCA (up to 50 dimensions) before applying UMAP.
# The component count is capped by the array shape so a small sample or short
# embeddings do not make PCA raise. The seed matches the UMAP step so the
# projection is repeatable.
from sklearn.decomposition import PCA
n_pca_components = min(50, *all_embeddings_adjusted.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
reduced_embeddings = pca.fit_transform(all_embeddings_adjusted)

# Apply UMAP for 2D visualization
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_result = umap_model.fit_transform(reduced_embeddings)

# Create a list of labels for each language, in the same order as the stacked rows
labels = ['Dutch'] * len(dutch_embeddings_adjusted) + ['German'] * len(german_embeddings_adjusted) + ['French'] * len(french_embeddings_adjusted)
label_array = np.array(labels)

# Plot the UMAP visualization with colors based on language
plt.figure(figsize=(10, 8))

# Fixed color per language
colors = {'Dutch': 'blue', 'German': 'green', 'French': 'red'}

# Iterate over the color map rather than set(labels), so the legend order and
# each language's color are the same on every run.
for language, color in colors.items():
    is_language = label_array == language
    if not is_language.any():
        # No embeddings for this language: skip it, leaving no legend entry.
        continue
    plt.scatter(umap_result[is_language, 0], umap_result[is_language, 1], label=language, color=color, alpha=0.5, s=50)

plt.title('UMAP Visualization of Text Embeddings by Language')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.legend()
plt.show()

