In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [4]:
data = pd.read_csv('combined_data.csv')
data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [7]:
print(data['label'].value_counts())

label
1    43910
0    39538
Name: count, dtype: int64


In [8]:
# Tải stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Hàm tiền xử lý văn bản
def preprocess_text(text):
    # Chuyển đổi văn bản thành chữ thường
    text = text.lower()
    # Loại bỏ dấu câu
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Loại bỏ stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [10]:
# Áp dụng tiền xử lý cho cột 'text'
data['text'] = data['text'].apply(preprocess_text)

In [11]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

In [12]:
#Chuyển đổi văn bản thành vector sử dụng TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:
#Xây dựng và đánh giá mô hình Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)

print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf, target_names=['not spam', 'spam']))

Random Forest Results:
Accuracy: 0.985380467345716
Classification Report:
               precision    recall  f1-score   support

    not spam       0.99      0.98      0.98      7938
        spam       0.98      0.99      0.99      8752

    accuracy                           0.99     16690
   macro avg       0.99      0.99      0.99     16690
weighted avg       0.99      0.99      0.99     16690



In [19]:
#Xây dựng và đánh giá mô hình Logistic Regression
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, target_names=['not spam', 'spam']))

Logistic Regression Results:
Accuracy: 0.9834631515877771
Classification Report:
               precision    recall  f1-score   support

    not spam       0.99      0.98      0.98      7938
        spam       0.98      0.99      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690



In [25]:
#Tokenization và Padding cho LSTM
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Chuyển đổi văn bản thành chuỗi số
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding để đảm bảo tất cả các chuỗi có cùng độ dài
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

In [26]:
#Xây dựng và huấn luyện mô hình LSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_length))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Early stopping để tránh overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Huấn luyện mô hình
history = model.fit(
    X_train_padded, y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping]
)



Epoch 1/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 91ms/step - accuracy: 0.8440 - loss: 0.3827 - val_accuracy: 0.9250 - val_loss: 0.2353
Epoch 2/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 84ms/step - accuracy: 0.9276 - loss: 0.2294 - val_accuracy: 0.9624 - val_loss: 0.1193
Epoch 3/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 90ms/step - accuracy: 0.9594 - loss: 0.1348 - val_accuracy: 0.9797 - val_loss: 0.0614
Epoch 4/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 84ms/step - accuracy: 0.9827 - loss: 0.0594 - val_accuracy: 0.9747 - val_loss: 0.1027
Epoch 5/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 88ms/step - accuracy: 0.9811 - loss: 0.0799 - val_accuracy: 0.9798 - val_loss: 0.0786
Epoch 6/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 84ms/step - accuracy: 0.9809 - loss: 0.0755 - val_accuracy: 0.9759 - val_loss: 0.0906


In [27]:
#Đánh giá mô hình LSTM
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Dự đoán trên tập kiểm tra
y_pred_lstm = (model.predict(X_test_padded) > 0.5).astype(int)

# Hiển thị báo cáo phân loại
print("LSTM Classification Report:\n", classification_report(y_test, y_pred_lstm, target_names=['not spam', 'spam']))

[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.9786 - loss: 0.0665
Test Accuracy: 0.9797
[1m522/522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step
LSTM Classification Report:
               precision    recall  f1-score   support

    not spam       0.97      0.98      0.98      7938
        spam       0.98      0.98      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690



In [28]:
#So sánh kết quả của ba mô hình
results = {
    'Model': ['Random Forest', 'Logistic Regression', 'LSTM'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test, y_pred_lstm)
    ],
    'Precision': [
        precision_score(y_test, y_pred_rf),
        precision_score(y_test, y_pred_lr),
        precision_score(y_test, y_pred_lstm)
    ],
    'Recall': [
        recall_score(y_test, y_pred_rf),
        recall_score(y_test, y_pred_lr),
        recall_score(y_test, y_pred_lstm)
    ],
    'F1-Score': [
        f1_score(y_test, y_pred_rf),
        f1_score(y_test, y_pred_lr),
        f1_score(y_test, y_pred_lstm)
    ]
}

# Hiển thị bảng so sánh
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score
0        Random Forest  0.985380   0.983959  0.988231  0.986091
1  Logistic Regression  0.983463   0.980281  0.988346  0.984297
2                 LSTM  0.979688   0.984229  0.976920  0.980561


In [29]:
# Lưu các mô hình và vectorizer
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(lr_model, 'logistic_regression_model.pkl')
joblib.dump(model, 'lstm_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(tokenizer, 'tokenizer.pkl')

['tokenizer.pkl']

In [30]:
# Dự đoán
emails = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",  # Spam
    "Hi John, I hope you are doing well. Let's schedule a meeting for next week to discuss the project."  # Not spam
]

# Tiền xử lý các email
processed_emails = [preprocess_text(email) for email in emails]

# Dự đoán bằng Random Forest
email_tfidf = vectorizer.transform(processed_emails)
rf_predictions = rf_model.predict(email_tfidf)
rf_prediction_labels = ["spam" if pred == 1 else "not spam" for pred in rf_predictions]

# Dự đoán bằng Logistic Regression
lr_predictions = lr_model.predict(email_tfidf)
lr_prediction_labels = ["spam" if pred == 1 else "not spam" for pred in lr_predictions]

# Dự đoán bằng LSTM
email_sequences = tokenizer.texts_to_sequences(processed_emails)
email_padded = pad_sequences(email_sequences, maxlen=max_length, padding='post', truncating='post')
lstm_predictions = (model.predict(email_padded) > 0.5).astype(int)
lstm_prediction_labels = ["spam" if pred[0] == 1 else "not spam" for pred in lstm_predictions]

# Tổng hợp kết quả dự đoán
print("Final Predictions for the Emails:")
for i, email in enumerate(emails):
    print(f"\nEmail {i + 1}:")
    print("Content:", email)
    print("Random Forest Prediction:", rf_prediction_labels[i])
    print("Logistic Regression Prediction:", lr_prediction_labels[i])
    print("LSTM Prediction:", lstm_prediction_labels[i])
    print("-" * 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Final Predictions for the Emails:

Email 1:
Content: Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.
Random Forest Prediction: spam
Logistic Regression Prediction: spam
LSTM Prediction: spam
--------------------------------------------------

Email 2:
Content: Hi John, I hope you are doing well. Let's schedule a meeting for next week to discuss the project.
Random Forest Prediction: not spam
Logistic Regression Prediction: not spam
LSTM Prediction: not spam
--------------------------------------------------
