Installing Libraries

In [1]:
!pip install fasttext
!pip install numpy==1.24.4

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 73.4/73.4 kB 2.7 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... done
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313503 sha256=908df98189c5a37d3319bd0610045ed025707f52e27d24bd09e7707268ae1421
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

Importing Libraries

In [1]:
import fasttext
import numpy as np
import pandas as pd

Training model

In [2]:
# Fit the supervised fastText classifier on the labelled training file.
# Hyperparameters are gathered in one place so they are easy to spot and tune.
supervised_params = dict(epoch=25, lr=1, wordNgrams=2)
fasttext_model = fasttext.train_supervised('fasttext_data.txt', **supervised_params)

# Persist the trained classifier to disk.
fasttext_model.save_model("spam_fasttext_model.bin")

# Evaluate on the held-out file. The first two entries of the returned result
# are reported below as the sample count and the precision/recall figure.
result = fasttext_model.test('fasttext_test_data.txt')
print(f"Test samples: {result[0]}")
print(f"Precision = Recall = Accuracy: {result[1]:.4f}")

Test samples: 190
Precision = Recall = Accuracy: 0.8579


In [3]:
# Learn unsupervised skip-gram embeddings from the raw corpus. This model is
# what get_sentence_vector() queries to turn a message into a feature vector.
ft_model = fasttext.train_unsupervised(
    "corpus.txt",
    model='skipgram',
)

In [4]:
def get_sentence_vector(text):
    """Return the fastText sentence embedding of ``text`` from ``ft_model``.

    The input is normalised before it is handed to fastText:

    * ``None`` and float NaN (what pandas yields for an empty CSV cell) are
      treated as an empty message instead of raising.
    * Any other non-string value is converted with ``str()``.
    * Newline characters are replaced by spaces, because fastText's
      ``get_sentence_vector`` rejects multi-line input with a ``ValueError``.

    Parameters
    ----------
    text : str or None or float
        The message to embed.

    Returns
    -------
    Whatever ``ft_model.get_sentence_vector`` returns for the cleaned text.
    """
    # `text != text` is only true for NaN, so this catches missing CSV cells.
    is_missing = text is None or (isinstance(text, float) and text != text)
    cleaned = "" if is_missing else str(text)
    # fastText processes one line at a time; fold multi-line messages into one.
    cleaned = cleaned.replace("\n", " ")
    return ft_model.get_sentence_vector(cleaned)

In [5]:
# Load the labelled dataset, keeping only the message text and its binary label.
df = pd.read_csv('spam_data.csv').loc[:, ['text', 'BinaryLabel']]

# Embed every message; the rows of X follow the row order of df.
X = np.array(list(map(get_sentence_vector, df['text'])))
y = df['BinaryLabel'].values

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [7]:
# Toggle for how the evaluation set is obtained:
#   True  -> hold out 20% of (X, y) with a fixed seed;
#   False -> train on all of (X, y) and evaluate on spam_test_data.csv.
train_on_custom_data = False
if train_on_custom_data:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1
    )
else:
    X_train, y_train = X, y
    # The external test file is embedded the same way as the training data.
    df_test = pd.read_csv('spam_test_data.csv')
    X_test = np.array(list(map(get_sentence_vector, df_test['text'])))
    y_test = df_test['BinaryLabel'].values


In [8]:
# Logistic Regression on the sentence embeddings
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train, y_train)

# Score the test set and print the per-class precision/recall/F1 table.
y_pred_lr = lr_model.predict(X_test)
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        95
           1       1.00      0.91      0.95        95

    accuracy                           0.95       190
   macro avg       0.96      0.95      0.95       190
weighted avg       0.96      0.95      0.95       190



In [9]:
# Random Forest on the sentence embeddings
from sklearn.ensemble import RandomForestClassifier

# 15 trees with a fixed seed; fit() returns the estimator, so the calls are chained.
rf_model = RandomForestClassifier(n_estimators=15, random_state=1).fit(X_train, y_train)

# Score the test set and print the per-class precision/recall/F1 table.
y_pred_rf = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        95
           1       1.00      0.95      0.97        95

    accuracy                           0.97       190
   macro avg       0.97      0.97      0.97       190
weighted avg       0.97      0.97      0.97       190



In [10]:
# Support Vector Machine on the sentence embeddings
from sklearn.svm import SVC

# probability=True enables predict_proba(), which a later cell relies on.
# The probability estimates are fitted with an internal, shuffled
# cross-validation, so a fixed random_state is needed for them to be
# reproducible across runs. The seed matches the one used for the train/test
# split and the random forest. Hard predictions from predict() do not depend
# on this seed.
svm_model = SVC(probability=True, random_state=1)
svm_model.fit(X_train, y_train)

# Score the test set and print the per-class precision/recall/F1 table.
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        95
           1       1.00      0.94      0.97        95

    accuracy                           0.97       190
   macro avg       0.97      0.97      0.97       190
weighted avg       0.97      0.97      0.97       190



In [11]:
# Gradient-boosted trees (XGBoost) on the sentence embeddings
from xgboost import XGBClassifier

# Library defaults throughout; fit() returns the estimator, so the calls are chained.
xgb_model = XGBClassifier().fit(X_train, y_train)

# Score the test set and print the per-class precision/recall/F1 table.
y_pred_xgb = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        95
           1       1.00      0.98      0.99        95

    accuracy                           0.99       190
   macro avg       0.99      0.99      0.99       190
weighted avg       0.99      0.99      0.99       190



Saving other models

In [12]:
import joblib

# The four classifiers below were trained on vectors produced by the
# unsupervised fastText model `ft_model`. Without that exact model the saved
# classifiers cannot be fed compatible features in a new session, so it is
# persisted alongside them.
# NOTE(review): the output file name is a new choice — align it with whatever
# the loading code expects.
ft_model.save_model("ft_embedding_model.bin")

# Persist each scikit-learn / XGBoost classifier with joblib.
joblib.dump(lr_model, 'lr_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(svm_model, 'svm_model.pkl')
# Kept as the final expression so the cell still displays the written file list.
joblib.dump(xgb_model, 'xgb_model.pkl')

['xgb_model.pkl']

Testing all models together

In [None]:
# Score one unseen message with every trained model.
msg = "Hi! We noticed you haven’t used your rewards points in a while. Redeem them now and enjoy exclusive member benefits. No obligations. Click here to view your balance."

# The supervised fastText model takes the raw text directly; the first entry
# of each returned sequence is the top label and its confidence.
label, confidence = fasttext_model.predict(msg)
print(f"Label: {label[0]}, Confidence: {confidence[0]:.4f}")

# The remaining classifiers consume the sentence embedding. Build the
# single-row batch once and reuse it; the print order matches the models below.
msg_features = [get_sentence_vector(msg)]
for clf in (lr_model, rf_model, svm_model, xgb_model):
    print(clf.predict_proba(msg_features))


Label: __label__ham, Confidence: 0.7553
[[0.76170882 0.23829118]]
[[0.73333333 0.26666667]]
[[0.7405523 0.2594477]]
[[0.9935634  0.00643658]]
