In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, LSTM, Dropout

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load dataset
# Read the CSV, discard rows with missing values and exact duplicate rows,
# then rename the category/message columns to the names used downstream.
data_df = (
    pd.read_csv('data-en-hi-de-fr.csv')
    .dropna()
    .drop_duplicates()
    .rename(columns={"Category": "labels", "Message": "text"})
)

data_df

In [None]:
# Replace the label column with integer codes; `le` stays fitted so the
# original class names can be recovered later via `le.classes_`.
le = LabelEncoder()
data_df["labels"] = le.fit_transform(data_df["labels"])
data_df

In [None]:
# Stratified 70/30 splits. Every call uses the same labels, test size and
# random_state; only the text column differs.
# NOTE(review): assumes the CSV provides `text_fr` and `text_de` columns; they
# are not created anywhere in the visible code - confirm.
train_x, test_x, train_y, test_y = train_test_split(
    data_df["text"],
    data_df["labels"],
    stratify=data_df["labels"],
    test_size=0.3,
    random_state=123,
)
train_x_fr, test_x_fr, train_y_fr, test_y_fr = train_test_split(
    data_df["text_fr"],
    data_df["labels"],
    stratify=data_df["labels"],
    test_size=0.3,
    random_state=123,
)
train_x_de, test_x_de, train_y_de, test_y_de = train_test_split(
    data_df["text_de"],
    data_df["labels"],
    stratify=data_df["labels"],
    test_size=0.3,
    random_state=123,
)

In [None]:
# Tokenize text for RNN/LSTM
# The vocabulary is learned from the training texts only, so nothing from the
# test split leaks into the tokenizer.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_x)

# Training split: texts -> integer sequences -> fixed length of 100.
X_train_seq = tokenizer.texts_to_sequences(train_x)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)

# Test split: same tokenizer, same fixed length.
X_test_seq = tokenizer.texts_to_sequences(test_x)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

In [None]:
# NAIVE BAYES

# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
nb_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

# 10-fold cross-validation on the training split. cross_val_score fits clones
# of the pipeline, so `nb_pipeline` itself is still unfitted afterwards.
scores = cross_val_score(nb_pipeline, train_x, train_y, cv=10)
print(f'Accuracy: {scores.mean()}')

# Fit the pipeline on the full training split so that later
# `nb_pipeline.predict(...)` calls do not raise NotFittedError.
# The trailing semicolon suppresses the estimator repr in the cell output.
nb_pipeline.fit(train_x, train_y);

In [None]:
# RNN
# Embedding -> SimpleRNN -> single sigmoid unit for binary classification.
model_rnn = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    SimpleRNN(units=64),
    Dense(units=1, activation='sigmoid'),
])

model_rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# 20% of the training data is held out for validation during fitting.
model_rnn.fit(
    X_train_pad,
    train_y,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# LSTM
# Embedding -> LSTM (with input and recurrent dropout) -> single sigmoid unit.
model_lstm = Sequential([
    Embedding(input_dim=5000, output_dim=64),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])

model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# 20% of the training data is held out for validation during fitting.
model_lstm.fit(
    X_train_pad,
    train_y,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# EVALUATION

# Predictions on the held-out test split.
# NOTE(review): nb_pipeline must already be fitted at this point;
# cross_val_score on its own does not fit the estimator passed to it.
y_pred_nb = nb_pipeline.predict(test_x)

# Both networks end in a single sigmoid unit, so scores above 0.5 become
# class 1 and everything else class 0.
rnn_scores = model_rnn.predict(X_test_pad)
y_pred_rnn = (rnn_scores > 0.5).astype("int32")

lstm_scores = model_lstm.predict(X_test_pad)
y_pred_lstm = (lstm_scores > 0.5).astype("int32")

# Confusion matrix and classification report
def evaluate_model(y_true, y_pred, model_name):
    """Print a header, the confusion matrix and the classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same samples.
        model_name: Name shown in the header line.
    """
    print(f"Results for {model_name}:")
    # Both metrics take (y_true, y_pred); print them in this fixed order.
    for metric in (confusion_matrix, classification_report):
        print(metric(y_true, y_pred))

# Report each model against the same held-out labels.
evaluate_model(y_true=test_y, y_pred=y_pred_nb, model_name='Naive Bayes')
evaluate_model(y_true=test_y, y_pred=y_pred_rnn, model_name='RNN')
evaluate_model(y_true=test_y, y_pred=y_pred_lstm, model_name='LSTM')