In [1]:
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense
from sklearn.metrics import classification_report

# Load your dataset
data_df = pd.read_csv('data-en-hi-de-fr.csv')
data_df.dropna(inplace=True)
data_df.drop_duplicates(inplace=True)
data_df.rename(columns={"Category": "labels", "Message": "text"}, inplace=True)

# Label encoding
le = LabelEncoder()
data_df['labels'] = le.fit_transform(data_df.labels)

# Reset indices after preprocessing to ensure alignment
data_df.reset_index(drop=True, inplace=True)



In [2]:
# Load spacy models for different languages
nlp_en = spacy.load('en_core_web_sm')
nlp_fr = spacy.load('fr_core_news_sm')
nlp_de = spacy.load('de_core_news_sm')

def preprocess_text(text, nlp):
    doc = nlp(text.lower().strip())
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop and not token.like_num])

# Apply preprocessing
data_df['processed_text_en'] = data_df['text'].apply(preprocess_text, nlp=nlp_en)
data_df['processed_text_fr'] = data_df['text_fr'].apply(preprocess_text, nlp=nlp_fr)
data_df['processed_text_de'] = data_df['text_de'].apply(preprocess_text, nlp=nlp_de)


In [3]:
tokenizer_en = Tokenizer(num_words=5000)
tokenizer_fr = Tokenizer(num_words=5000)
tokenizer_de = Tokenizer(num_words=5000)

tokenizer_en.fit_on_texts(data_df['processed_text_en'])
tokenizer_fr.fit_on_texts(data_df['processed_text_fr'])
tokenizer_de.fit_on_texts(data_df['processed_text_de'])

sequences_en = tokenizer_en.texts_to_sequences(data_df['processed_text_en'])
sequences_fr = tokenizer_fr.texts_to_sequences(data_df['processed_text_fr'])
sequences_de = tokenizer_de.texts_to_sequences(data_df['processed_text_de'])

max_sequence_len = 150
X_seq_en = pad_sequences(sequences_en, maxlen=max_sequence_len)
X_seq_fr = pad_sequences(sequences_fr, maxlen=max_sequence_len)
X_seq_de = pad_sequences(sequences_de, maxlen=max_sequence_len)


In [4]:
def load_embeddings(path):
    if 'glove' in path:
        return KeyedVectors.load_word2vec_format(path, binary=False, unicode_errors='ignore', no_header=True)
    return KeyedVectors.load_word2vec_format(path, binary=False, unicode_errors='ignore')

embeddings_en = load_embeddings('glove.6B.100d.txt')  # Assuming GloVe for English
embeddings_fr = load_embeddings('cc.fr.300.vec')      # Assuming fastText for French
embeddings_de = load_embeddings('cc.de.300.vec')      # Assuming fastText for German

In [5]:
def get_embedding_matrix(embeddings, tokenizer, embedding_dim):
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        try:
            embedding_vector = embeddings.get_vector(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix

embedding_matrix_en = get_embedding_matrix(embeddings_en, tokenizer_en, 100)  # GloVe dimensions
embedding_matrix_fr = get_embedding_matrix(embeddings_fr, tokenizer_fr, 300)  # fastText dimensions
embedding_matrix_de = get_embedding_matrix(embeddings_de, tokenizer_de, 300)  # fastText dimensions

In [7]:
def build_model(embedding_matrix, max_length, lstm_units):
    model = Sequential([
        Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1], input_length=max_length, trainable=False),
        Dropout(0.2),
        LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Split data
train_x, test_x, train_y, test_y = train_test_split(X_seq_en, data_df.labels, test_size=0.3, random_state=123)
train_x_fr, test_x_fr, train_y_fr, test_y_fr = train_test_split(X_seq_fr, data_df.labels, test_size=0.3, random_state=123)
train_x_de, test_x_de, train_y_de, test_y_de = train_test_split(X_seq_de, data_df.labels, test_size=0.3, random_state=123)

# Build models
model_en = build_model(embedding_matrix_en, max_sequence_len, 64)
model_fr = build_model(embedding_matrix_fr, max_sequence_len, 64)
model_de = build_model(embedding_matrix_de, max_sequence_len, 64)

# Train and evaluate
model_en.fit(train_x, train_y, epochs=5, validation_split=0.2)
model_fr.fit(train_x_fr, train_y_fr, epochs=5, validation_split=0.2)
model_de.fit(train_x_de, train_y_de, epochs=5, validation_split=0.2)

# Example for evaluation - can add detailed evaluation as necessary
print("Evaluation for English Model:")
model_en.evaluate(test_x, test_y)
print("Evaluation for French Model:")
model_fr.evaluate(test_x_fr, test_y_fr)
print("Evaluation for German Model:")
model_de.evaluate(test_x_de, test_y_de)


Epoch 1/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 133ms/step - accuracy: 0.8785 - loss: 0.4448 - val_accuracy: 0.8476 - val_loss: 0.3189
Epoch 2/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 115ms/step - accuracy: 0.8825 - loss: 0.2841 - val_accuracy: 0.8906 - val_loss: 0.2861
Epoch 3/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 167ms/step - accuracy: 0.8917 - loss: 0.2548 - val_accuracy: 0.8906 - val_loss: 0.2780
Epoch 4/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 164ms/step - accuracy: 0.8981 - loss: 0.2364 - val_accuracy: 0.8920 - val_loss: 0.2682
Epoch 5/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 150ms/step - accuracy: 0.9148 - loss: 0.2252 - val_accuracy: 0.8809 - val_loss: 0.2626
Epoch 1/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 167ms/step - accuracy: 0.8810 - loss: 0.4030 - val_accuracy: 0.8837 - val_loss: 0.2658
Epoch 2/5
[1m91/91[0m [32

[0.16990873217582703, 0.9392764568328857]