In [1]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, LSTM, Dropout



In [2]:
data_df = pd.read_csv('data-en-hi-de-fr.csv')
data_df.dropna(inplace=True)
data_df.drop_duplicates(inplace=True)
data_df.rename(columns={
    "Category": "labels",
    "Message": "text"
}, inplace=True)

data_df

Unnamed: 0,labels,text,text_hi,text_de,text_fr
0,ham,"Go until jurong point, crazy.. Available only ...","Dakag बिंदु तक जाओ, पागल. केवल Bag Non महान वि...","Gehen Sie bis jurong Punkt, verrückt.. Verfügb...","Allez jusqu'à Jurong point, fou.. Disponible s..."
1,ham,Ok lar... Joking wif u oni...,ओके लामर.... if if uue पर.,Ok Lar... joking wif u oni...,J'ai fait une blague sur le wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Fktatatat 21 मई को प्राप्त करने के लिए मुफ्त प...,Freier Eintritt in 2 a wkly comp zum Gewinn FA...,Entrée libre dans 2 a wkly comp pour gagner FA...
3,ham,U dun say so early hor... U c already then say...,Uden इतना जल्दी कहते हैं... तो पहले से ही यूसी...,U dun sagen so früh... U c schon dann sagen...,U dun dit si tôt hor... U c déjà dire alors...
4,ham,"Nah I don't think he goes to usf, he lives aro...","नहीं, मुझे नहीं लगता कि वह हमारे लिए चला जाता ...","Nein, ich glaube nicht, dass er zu unsf geht, ...","Non, je ne pense pas qu'il va à usf, il vit da..."
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,यह 2 सेकंड है जब हमने 2 संपर्क की कोशिश की है....,"Dies ist das zweite Mal, dass wir versucht hab...",C'est la 2ème fois que nous avons essayé 2 con...
5568,ham,Will ü b going to esplanade fr home?,क्या कलाई घर का पता लगाने के लिए जा रही होगी?,"Wird u b gehen, um esplanade fr home?",Est-ce que ü b ira à l'esplanade en maison?
5569,ham,"Pity, * was in mood for that. So...any other s...","तो फिर, दूसरे सुझाव क्या हैं?","Schade, * war in Stimmung dafür. Also... irgen...","Dommage, * était d'humeur pour ça. Donc... d'a..."
5570,ham,The guy did some bitching but I acted like i'd...,आदमी कुछ कुतियािंग किया लेकिन मैं मैं कुछ और ख...,"Der Typ hat ein bisschen rumgeschnüffelt, aber...",Le type a fait une saloperie mais j'ai agi com...


In [3]:
le = LabelEncoder()
le.fit(data_df.labels)
data_df["labels"] = le.transform(data_df.labels)
data_df

Unnamed: 0,labels,text,text_hi,text_de,text_fr
0,0,"Go until jurong point, crazy.. Available only ...","Dakag बिंदु तक जाओ, पागल. केवल Bag Non महान वि...","Gehen Sie bis jurong Punkt, verrückt.. Verfügb...","Allez jusqu'à Jurong point, fou.. Disponible s..."
1,0,Ok lar... Joking wif u oni...,ओके लामर.... if if uue पर.,Ok Lar... joking wif u oni...,J'ai fait une blague sur le wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Fktatatat 21 मई को प्राप्त करने के लिए मुफ्त प...,Freier Eintritt in 2 a wkly comp zum Gewinn FA...,Entrée libre dans 2 a wkly comp pour gagner FA...
3,0,U dun say so early hor... U c already then say...,Uden इतना जल्दी कहते हैं... तो पहले से ही यूसी...,U dun sagen so früh... U c schon dann sagen...,U dun dit si tôt hor... U c déjà dire alors...
4,0,"Nah I don't think he goes to usf, he lives aro...","नहीं, मुझे नहीं लगता कि वह हमारे लिए चला जाता ...","Nein, ich glaube nicht, dass er zu unsf geht, ...","Non, je ne pense pas qu'il va à usf, il vit da..."
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,यह 2 सेकंड है जब हमने 2 संपर्क की कोशिश की है....,"Dies ist das zweite Mal, dass wir versucht hab...",C'est la 2ème fois que nous avons essayé 2 con...
5568,0,Will ü b going to esplanade fr home?,क्या कलाई घर का पता लगाने के लिए जा रही होगी?,"Wird u b gehen, um esplanade fr home?",Est-ce que ü b ira à l'esplanade en maison?
5569,0,"Pity, * was in mood for that. So...any other s...","तो फिर, दूसरे सुझाव क्या हैं?","Schade, * war in Stimmung dafür. Also... irgen...","Dommage, * était d'humeur pour ça. Donc... d'a..."
5570,0,The guy did some bitching but I acted like i'd...,आदमी कुछ कुतियािंग किया लेकिन मैं मैं कुछ और ख...,"Der Typ hat ein bisschen rumgeschnüffelt, aber...",Le type a fait une saloperie mais j'ai agi com...


In [4]:
# Reset indices after preprocessing to ensure alignment
data_df.reset_index(drop=True, inplace=True)

train_x, test_x, train_y, test_y = train_test_split(data_df.text, data_df.labels, stratify=data_df.labels, test_size=0.3, random_state=123)
train_x_fr, test_x_fr, train_y_fr, test_y_fr = train_test_split(data_df.text_fr, data_df.labels, stratify=data_df.labels, test_size=0.3, random_state=123)
train_x_de, test_x_de, train_y_de, test_y_de = train_test_split(data_df.text_de, data_df.labels, stratify=data_df.labels, test_size=0.3, random_state=123)

In [5]:
# Load spacy models for French and German
nlp_fr = spacy.load('fr_core_news_sm')
nlp_de = spacy.load('de_core_news_sm')

def preprocess_text(text, nlp):
    doc = nlp(text.lower().strip())
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop and not token.like_num])

data_df['processed_text_fr'] = data_df['text_fr'].apply(preprocess_text, nlp=nlp_fr)
data_df['processed_text_de'] = data_df['text_de'].apply(preprocess_text, nlp=nlp_de)

data_df

In [6]:
# Tokenizer setup
tokenizer_fr = Tokenizer(num_words=5000)
tokenizer_de = Tokenizer(num_words=5000)

# Fit on the processed texts
tokenizer_fr.fit_on_texts(data_df['processed_text_fr'])
tokenizer_de.fit_on_texts(data_df['processed_text_de'])

# Convert texts to sequences
sequences_fr = tokenizer_fr.texts_to_sequences(data_df['processed_text_fr'])
sequences_de = tokenizer_de.texts_to_sequences(data_df['processed_text_de'])

# Padding sequences to the same length
max_sequence_len = 150
X_seq_fr = pad_sequences(sequences_fr, maxlen=max_sequence_len)
X_seq_de = pad_sequences(sequences_de, maxlen=max_sequence_len)

# Splitting the data using the already defined splits
X_train_seq_fr = X_seq_fr[train_x_fr.index]
X_test_seq_fr = X_seq_fr[test_x_fr.index]
X_train_seq_de = X_seq_de[train_x_de.index]
X_test_seq_de = X_seq_de[test_x_de.index]

In [7]:
# Vectorizer for French and German
vectorizer_fr = CountVectorizer(max_features=5000)
vectorizer_de = CountVectorizer(max_features=5000)

# Fit and transform the data
X_vect_fr = vectorizer_fr.fit_transform(data_df['processed_text_fr'])
X_vect_de = vectorizer_de.fit_transform(data_df['processed_text_de'])

# Use indices to get train and test sets
X_train_vect_fr = X_vect_fr[train_x_fr.index]
X_test_vect_fr = X_vect_fr[test_x_fr.index]
X_train_vect_de = X_vect_de[train_x_de.index]
X_test_vect_de = X_vect_de[test_x_de.index]

# Initialize and train Naive Bayes for French
nb_model_fr = MultinomialNB()
nb_model_fr.fit(X_train_vect_fr, train_y_fr)
y_pred_fr = nb_model_fr.predict(X_test_vect_fr)
print("French Naive Bayes Classification Report:")
print(classification_report(test_y_fr, y_pred_fr))

# Initialize and train Naive Bayes for German
nb_model_de = MultinomialNB()
nb_model_de.fit(X_train_vect_de, train_y_de)
y_pred_de = nb_model_de.predict(X_test_vect_de)
print("German Naive Bayes Classification Report:")
print(classification_report(test_y_de, y_pred_de))

French Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1356
           1       0.87      0.89      0.88       192

    accuracy                           0.97      1548
   macro avg       0.93      0.94      0.93      1548
weighted avg       0.97      0.97      0.97      1548

German Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1356
           1       0.89      0.90      0.89       192

    accuracy                           0.97      1548
   macro avg       0.94      0.94      0.94      1548
weighted avg       0.97      0.97      0.97      1548



In [8]:
def build_and_train_rnn(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=64, input_length=150))
    model.add(Dropout(0.2))  # Dropout for input layer
    model.add(SimpleRNN(64, dropout=0.2))  # Applying dropout to RNN
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {accuracy*100:.2f}%")
    
# Train and evaluate for French
print("Training and evaluating RNN for French:")
build_and_train_rnn(X_train_seq_fr, train_y_fr, X_test_seq_fr, test_y_fr)

# Train and evaluate for German
print("Training and evaluating RNN for German:")
build_and_train_rnn(X_train_seq_de, train_y_de, X_test_seq_de, test_y_de)

Training and evaluating RNN for French:
Epoch 1/5




[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.8738 - loss: 0.4073 - val_accuracy: 0.9474 - val_loss: 0.1934
Epoch 2/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9651 - loss: 0.1336 - val_accuracy: 0.9751 - val_loss: 0.0965
Epoch 3/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9835 - loss: 0.0636 - val_accuracy: 0.9806 - val_loss: 0.0713
Epoch 4/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9917 - loss: 0.0412 - val_accuracy: 0.9820 - val_loss: 0.0605
Epoch 5/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9954 - loss: 0.0196 - val_accuracy: 0.9778 - val_loss: 0.0737
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9725 - loss: 0.0776
Test Accuracy: 97.42%
Training and evaluating RNN for German:
Epoch 1/5
[1m46/46[0m [32m━━━━━━━━━━━━

In [9]:
def build_and_train_lstm(X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=64, input_length=150))
    model.add(Dropout(0.2))  # Dropout on input layer
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))  # Dropout within LSTM layer
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {accuracy*100:.2f}%")

# Train and evaluate for French
build_and_train_lstm(X_train_seq_fr, train_y_fr, X_test_seq_fr, test_y_fr)

# Train and evaluate for German
build_and_train_lstm(X_train_seq_de, train_y_de, X_test_seq_de, test_y_de)

Epoch 1/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - accuracy: 0.8572 - loss: 0.4227 - val_accuracy: 0.9446 - val_loss: 0.2065
Epoch 2/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 87ms/step - accuracy: 0.9518 - loss: 0.1721 - val_accuracy: 0.9737 - val_loss: 0.1204
Epoch 3/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 89ms/step - accuracy: 0.9794 - loss: 0.0806 - val_accuracy: 0.9848 - val_loss: 0.0674
Epoch 4/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 93ms/step - accuracy: 0.9928 - loss: 0.0370 - val_accuracy: 0.9834 - val_loss: 0.0507
Epoch 5/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 89ms/step - accuracy: 0.9932 - loss: 0.0246 - val_accuracy: 0.9875 - val_loss: 0.0441
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9824 - loss: 0.0679
Test Accuracy: 98.00%
Epoch 1/5
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s