In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [92]:
# Read the spam dataset from disk, decoding the file as Latin-1.
data = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="latin-1",
)
# Print a summary of the frame: index range, columns, non-null counts, dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [93]:
# Keep only the label and text columns, under descriptive names.
# Renaming first and selecting the new names afterwards keeps this cell safe
# to re-run: rename() silently skips keys that are no longer present, whereas
# selecting ['v1', 'v2'] a second time would raise KeyError once the columns
# have been renamed. The explicit copy() makes the result an independent
# DataFrame, so later column assignments do not act on a slice of the raw frame.
data = (
    data
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .loc[:, ['label', 'message']]
    .copy()
)

# Preview the first rows and the class balance of the labels.
print(data.head())
print(data['label'].value_counts())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


In [94]:
# Encode the string labels numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_codes)

# Features are the raw message texts; the target is the numeric label.
X, y = data['message'], data['label_num']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Turn the texts into token-count vectors with English stop words removed.
# The vocabulary is learned from the training texts only and then reused,
# unchanged, to encode the test texts.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [95]:
# Fit a logistic regression classifier on the vectorized training texts.
# class_weight='balanced' weights classes inversely to their frequency;
# max_iter=1000 sets the solver's iteration limit.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_vec, y_train)

# Predict labels for the held-out test set.
y_pred = model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9826555023923444
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1453
           1       0.99      0.88      0.93       219

    accuracy                           0.98      1672
   macro avg       0.99      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [96]:
# Two hand-written messages to classify with the trained model.
new_emails = [
    "Hi, I have received your email. I will send my assignment on time",
    "Valid 12 hours only.",
]

# Encode the messages with the already-fitted vectorizer (no refitting),
# then predict their classes.
new_vec = vectorizer.transform(new_emails)
pred = model.predict(new_vec)

# Print each message with its predicted class (1 -> spam, anything else -> ham).
# The output text is Vietnamese: "Dự đoán" means "Prediction".
for text, predicted in zip(new_emails, pred):
    if predicted == 1:
        verdict = 'spam'
    else:
        verdict = 'ham'
    print(f"Email: {text}\n→ Dự đoán: {verdict}\n")


Email: Hi, I have received your email. I will send my assignment on time
→ Dự đoán: ham

Email: Valid 12 hours only.
→ Dự đoán: ham

