In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the three CSV inputs from the local data/ directory, in this order:
# the base file, the augmented file, and the "v6 streaming" augmented file.
df_orig, df_spam, df_spam2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/spam_utf8.csv",
        "data/spam_augmented.csv",
        "data/spam_augmented_v6_streaming.csv",
    )
)

In [None]:
# Display the column labels of df_orig (read from data/spam_utf8.csv).
df_orig.columns

In [None]:
# Display the column labels of df_spam (read from data/spam_augmented.csv).
df_spam.columns

In [None]:
# Display the column labels of df_spam2
# (read from data/spam_augmented_v6_streaming.csv).
df_spam2.columns

In [None]:
# Reduce df_orig to a two-column (text, target) layout in one chain:
#   1. select the 'v1' and 'v2' columns,
#   2. rename 'v2' to 'text',
#   3. derive a numeric 'target' from 'v1' ('ham' -> 0, 'spam' -> 1),
#   4. keep only 'text' and 'target'.
# Any 'v1' value other than 'ham' / 'spam' ends up as NaN in 'target'.
df_orig = (
    df_orig.loc[:, ['v1', 'v2']]
    .rename(columns={'v2': 'text'})
    .assign(target=lambda frame: frame['v1'].map({'ham': 0, 'spam': 1}))
    .loc[:, ['text', 'target']]
)


In [None]:
# Count the rows per distinct value of the 'target' column in df_spam2.
df_spam2['target'].value_counts()


In [None]:
# Stack the three frames row-wise into a single table. ignore_index=True
# discards each frame's own row labels and renumbers the result from 0.
df = pd.concat(
    objs=[df_orig, df_spam, df_spam2],
    ignore_index=True,
)

In [None]:
# Discard rows whose 'text' is missing, then coerce the remaining 'text'
# values to Python strings. Built as one chain so the result is a fresh frame.
df = (
    df.dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].astype(str))
)


In [None]:
# Count the rows per distinct value of the 'target' column in the combined frame.
df['target'].value_counts()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with English stop words removed and the
# vocabulary capped at 12000 features.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=12000,
)

# Labels and feature matrix for the combined data. The vectoriser is fitted
# here on every row of `df`, i.e. before any train/test split happens.
y = df['target']
X = vectorizer.fit_transform(df['text'])

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw messages rather than an already-vectorised matrix, so that the
# TF-IDF vocabulary and IDF weights can be learned from the training part only.
# Learning them from the whole corpus would leak test-set statistics into the
# features and make the evaluation below optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# (Re)fit the vectoriser on training messages only, then project the held-out
# messages with that fitted vocabulary. `vectorizer` therefore ends up matching
# the feature space the model is trained on, which is also what gets saved later.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier with default hyper-parameters on
# the training split. fit() returns the estimator itself, so it can be chained.
model = MultinomialNB().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# The three scalar scores share the same call shape, so print them in a loop.
for caption, scorer in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
):
    print(caption, scorer(y_test, y_pred))

# The confusion matrix is printed on its own line below its caption.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
import os
import joblib

# Make sure the output directory exists; do nothing if it is already there.
os.makedirs("models_v3", exist_ok=True)

# Persist the classifier first, then the vectoriser. Both files are needed
# together to score new text.
joblib.dump(value=model, filename="models_v3/mnb.pkl")
joblib.dump(value=vectorizer, filename="models_v3/vectorizer.pkl")