In [None]:
# Essential imports for data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import re

# NLTK for text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")

# sklearn for model evaluation
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# PyTorch for deep learning
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Custom utilities are assumed to live in '../src/utils.py'; put that folder on the import path.
# `sys` is imported here because the import block above never brings it into scope,
# so sys.path.append would raise NameError on a fresh kernel.
import sys

sys.path.append("../src")
from utils import *  # NOTE(review): wildcard import hides which helpers this notebook relies on

# Default figure size for seaborn/matplotlib plots
sns.set(rc={'figure.figsize':(15,12)})

# Display option: never truncate long cell contents when showing DataFrames
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the training data and harmonise the column names to Text / Tag / Word
data = (
    pd.read_csv('Data.csv', sep=',')
    .rename(columns={'text': 'Text', 'ner_tag': 'Tag', 'value': 'Word'})
)
# Preview the first rows, then report (rows, columns)
print(data.head(), data.shape, sep="\n")

In [None]:
# Read the test data and harmonise the column names to Text / Tag / Word
test = (
    pd.read_csv('dev.csv', sep=',')
    .rename(columns={'text': 'Text', 'ner_tag': 'Tag', 'value': 'Word'})
)
# Preview the first rows, then report (rows, columns)
print(test.head(), test.shape, sep="\n")

## Preprocessing

In [None]:
# Custom stopwords for traffic-related text in German.
# Every entry is lower-case on purpose: clean_text() compares each token via
# word.lower(), so a capitalised entry (e.g. 'Regen') would never match.
traffic_stopwords = {
    'zwischen', 'über', 'unter', 'bis', 'nach', 'vor', 'seit', 'um', 'während', 'zwölf', 'zwei', 'drei',
    'vier', 'fünf', 'sechs', 'sieben', 'acht', 'neun', 'zehn', 'elf', 'früher', 'später', 'jetzt', 'heute',
    'morgen', 'tag', 'nacht', 'uhr', 'halb', 'viertel', 'lang', 'lange', 'kurz', 'groß', 'klein', 'weit', 'oben',
    'unten', 'rechts', 'links', 'nord', 'süd', 'ost', 'west', 'zurück', 'immer', 'manchmal', 'oft', 'selten', 'wetter',
    'regen', 'schnee', 'eis', 'nebel', 'wind', 'sonne', 'warm', 'kalt', 'trocken', 'nass'
}

# Function to clean text data
def clean_text(text, custom_stopwords=frozenset()):
    """
    Clean a text string for analysis.

    Steps, in order:
      1. Drop URLs (tokens starting with ``http`` or ``www``).
      2. Drop @mentions entirely and strip the ``#`` sign from hashtags
         (the hashtag word itself is kept).
      3. Drop every character that is not a word character, whitespace or a
         period - this removes punctuation, emojis and emoticon symbols.
      4. Drop words found in ``custom_stopwords`` (case-insensitive match).

    Parameters
    ----------
    text : str
        Raw input text.
    custom_stopwords : iterable of str, optional
        Words to remove. Entries may use any casing; they are lower-cased
        before comparison. Defaults to no stopwords.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Normalise the stopwords so capitalised entries (e.g. German nouns) also
    # match the lower-cased tokens compared below.
    stopwords_lower = {stopword.lower() for stopword in custom_stopwords}

    text = re.sub(r"http\S+|www\S+", '', text)  # Remove URLs ('http\S+' already covers https)
    text = re.sub(r'@\w+|#', '', text)  # Remove @mentions and the '#' sign of hashtags
    text = re.sub(r'[^\w\s.]', '', text)  # Remove punctuation/emojis (periods are kept)

    kept_words = [word for word in text.split() if word.lower() not in stopwords_lower]
    return " ".join(kept_words)

# Run the cleaner over every string entry of 'Text'; non-string cells pass through untouched
data["Text"] = data["Text"].map(
    lambda cell: cell if not isinstance(cell, str) else clean_text(cell, traffic_stopwords)
)
print(data.head())


In [None]:
# Dataset dimensions as (rows, columns)
print("Dataset shape:", data.shape)

# How often each tag value occurs
print("Distribution of 'Tag' values:", data['Tag'].value_counts(), sep="\n")

# Count of missing values per column
print("Missing values in each column:", data.isnull().sum(), sep="\n")


In [None]:
# Persist the cleaned frame back to 'Data.csv'.
# index=False keeps the row index out of the file; otherwise every later
# read_csv('Data.csv') gains an extra 'Unnamed: 0' column and the notebook
# stops being re-runnable.
# NOTE(review): this overwrites the raw input with cleaned text - consider a separate output file.
data.to_csv('Data.csv', index=False)

In [None]:
# Headline totals: distinct sentence ids, and one row per word
print(
    "Total number of sentences in the dataset: {:,}".format(data["sentence_number"].nunique()),
    "Total words in the dataset: {:,}".format(len(data.index)),
    sep="\n",
)

In [None]:
# Bar chart of how often each tag occurs
data["Tag"].value_counts().plot.bar(figsize=(10, 5), color='teal');


## Words per sentence

In [None]:
# Non-null 'Word' entries per sentence id, shown as a histogram
word_counts = (
    data.groupby("sentence_number")["Word"]
    .agg(["count"])
    .rename(columns={"count": "Word count"})
)
word_counts.hist(bins=50, figsize=(8, 6));


In [None]:
# Tokenize each entry of 'Text'.
# language='german' selects NLTK's German Punkt model instead of the English
# default (the stopword list above indicates the text is German).
# Non-string cells (e.g. NaN) become an empty token list rather than the
# literal token 'nan' that str(x) would produce.
data['tokenized_text'] = data['Text'].apply(
    lambda x: word_tokenize(x, language='german') if isinstance(x, str) else []
)

# Number of tokens in each tokenized entry
data['sentence_length'] = data['tokenized_text'].apply(len)

# Largest token count found in any entry
longest_sentence_length = data['sentence_length'].max()
print("Longest sentence length:", longest_sentence_length)


In [None]:
# Perform descriptive analysis
unique_texts = data['Text'].nunique()

# Whitespace-separated word count per entry, computed once and reused for the
# mean and the histogram. Missing entries are dropped first so that NaN cells
# (e.g. empty strings read back from CSV) cannot raise on .split().
sentence_lengths = data['Text'].dropna().astype(str).str.split().str.len()
average_length = sentence_lengths.mean()

# Assess data quality
missing_values = data['Text'].isnull().sum()
duplicate_entries = data['Text'].duplicated().sum()

# Review content by displaying a few random text entries.
# The sample size is capped at the row count so tiny frames do not raise, and
# the seed is fixed so a re-run shows the same entries.
sample_texts = data['Text'].sample(min(5, len(data)), random_state=42).values

# Visualization with a histogram for sentence length
plt.figure(figsize=(12, 6))
sns.histplot(sentence_lengths, bins=30, kde=True)
plt.title("Distribution of Sentence Lengths")
plt.xlabel("Sentence Length (Number of Words)")
plt.ylabel("Frequency")
plt.show()

# Outputs
print(f"Unique texts: {unique_texts}")
print(f"Average sentence length: {average_length} words")
print(f"Missing values in 'text' column: {missing_values}")
print(f"Duplicate entries in 'text' column: {duplicate_entries}")
print(f"Random sample of text entries: {sample_texts}")

In [None]:
# Tokenization: split each entry into whitespace-separated words (unigrams).
# Non-string cells (e.g. NaN) are skipped because they have no .split().
words = [
    word
    for sentence in data['Text']
    if isinstance(sentence, str)
    for word in sentence.split()
]

# Counting the frequencies of each unigram
unigram_counts = Counter(words)

# Selecting the top 20 most common unigrams for plotting
most_common_unigrams = unigram_counts.most_common(20)
unigrams, counts = zip(*most_common_unigrams)

# Plotting the unigram frequencies
plt.figure(figsize=(12, 6))
sns.barplot(x=list(unigrams), y=list(counts), color= 'Green')
plt.xticks(rotation=45)
plt.xlabel('Unigrams')
plt.ylabel('Counts')
plt.title('Top 20 Unigrams in the Dataset')
plt.show()


In [None]:
import matplotlib.pyplot as plt

def summary(item_list, limit=None):
    """Print count statistics for ``item_list`` and draw a donut chart of item shares.

    Parameters
    ----------
    item_list : iterable of hashable
        Items to count (e.g. a column of tags or tokens).
    limit : int, optional
        Plot only the ``limit`` most frequent items; ``None`` plots every item.

    Returns
    -------
    None
        Results are printed and plotted. Empty input prints a zero count and
        draws nothing.

    Notes
    -----
    Each slice label shows the item's share of *all* items. When ``limit``
    hides part of the data, the labels therefore do not add up to 100%.
    """
    # (item, count) pairs, most frequent first; ties keep first-seen order
    count_items = Counter(item_list).most_common()
    print("Number of unique items: ", len(count_items))
    if not count_items:
        # Nothing to average or plot; avoids dividing by zero below
        return

    total_items = sum(count for _, count in count_items)
    print("Average count: ", round(total_items / len(count_items), 2),"\n")

    proportions = [(item, count / total_items * 100) for item, count in count_items]

    # Plot only up to 'limit' items if a limit is specified, else plot all
    if limit is not None:
        proportions = proportions[:limit]
    if not proportions:
        # e.g. limit=0: nothing left to draw
        return

    labels, sizes = zip(*proportions)

    # plt.pie rescales the slices to fill the circle, so the percentage it
    # hands to autopct is relative to the plotted items only. Scaling by the
    # plotted share converts it back to a share of the whole input.
    shown_share = sum(sizes)

    # Set up the pie chart as a donut chart
    plt.figure(figsize=(16, 8))  # Increase figure size
    plt.pie(
        sizes,
        labels=labels,
        autopct=lambda p: '{:.1f}%'.format(p * shown_share / 100),
        startangle=180,
    )

    # Draw a circle at the center of pie to make it a donut
    centre_circle = plt.Circle((0, 0), 0.70, color='black', fc='white', linewidth=0)
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)

    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.tight_layout()
    plt.show()

# Tag-level summary over all tags
summary(data['Tag'])

In [None]:

# Keep only the rows whose 'Tag' column equals 'TRIGGER'
trigger_df = data.loc[data['Tag'] == 'TRIGGER']

# Tally how often each trigger word or phrase occurs
trigger_counts = Counter(trigger_df['Word'])

# Ten most frequent triggers, unpacked into parallel label / count tuples
most_common_triggers = trigger_counts.most_common(10)
triggers, counts = zip(*most_common_triggers)

# Bar chart of the most frequent triggers
plt.figure(figsize=(10, 6))
sns.barplot(x=list(triggers), y=list(counts))
plt.xticks(rotation=45)
plt.gca().set(xlabel='Triggers', ylabel='Counts', title='Top Trigger Words in the Dataset')
plt.show()

In [None]:
# Build the word cloud from the trigger frequencies (word size follows count)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(trigger_counts)

# Render the cloud as an image without axes
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Token-level summary: donut chart restricted to the 20 most frequent words
summary(data['Word'], limit=20)