In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Load ---------------------------------------------------------------
# Single place to change the dataset location when running somewhere that
# does not provide this path.
DATA_PATH = '/content/spam.csv'

# The file is decoded as latin-1 rather than pandas' UTF-8 default.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Remove leading/trailing whitespace from header names before the 'v1'/'v2'
# column lookups below.
df.columns = df.columns.str.strip()

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() emits an extra "None" line.
df.info()
print()

# --- Prepare labels -----------------------------------------------------
# Keep only the class column (v1) and the text column (v2) and give them
# descriptive names. rename() returns a new frame, so the assignment to
# df['label'] below operates on a fresh object rather than on a slice.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Encode the string classes as integers. `le` is kept so predictions can be
# mapped back to the original strings with le.inverse_transform().
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# --- Split --------------------------------------------------------------
# 80/20 hold-out split with a fixed seed for repeatable results.
# NOTE(review): the split is not stratified although the classes are
# imbalanced; consider stratify=df['label'] — this would change the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# --- Vectorize ----------------------------------------------------------
# The vocabulary / IDF weights are fitted on the training messages only and
# then re-used unchanged on the test messages.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
x_train_tfidf = vectorizer.fit_transform(X_train)
x_test_tfidf = vectorizer.transform(X_test)

# --- Train and evaluate -------------------------------------------------
# `LE` is the fitted classifier (distinct from `le`, the label encoder).
LE = LogisticRegression()
LE.fit(x_train_tfidf, y_train)
y_pred = LE.predict(x_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy:{accuracy*100:.2f}%')

# Derive the display names from the encoder so each name is guaranteed to
# line up with its integer code instead of relying on a hand-written order.
class_names = [str(name).capitalize() for name in le.classes_]
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=class_names))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None

Accuracy:96.41%

Classification Report:
               precision    recall  f1-score   support

         Ham       0.96      1.00      0.98       965
        Spam       0.97      0.75      0.85       150

    accuracy                           0.96      1115
   macro avg       0.97      0.88      0.91      1115
weighted avg       0.96      0.96      0.96      1115


Confusion Matrix:
 [[962   3]
 [ 37 113]]


Sample Data Testing

In [5]:
# Spot-check the fitted pipeline on one hand-written message: vectorize it
# with the already-fitted TF-IDF vectorizer, classify it, then map the
# numeric class back to its original string label via the label encoder.
sample_data = [
    "Congratulations! You've won a free iPhone. Click here to claim now!",
]
sample_tfidf = vectorizer.transform(sample_data)
pred = LE.predict(sample_tfidf)
pred_label = le.inverse_transform(pred)[0]

# Emit the message and its predicted label, one "caption value" line each.
for caption, value in (
    ("Message:", sample_data[0]),
    ("Predicted Label:", pred_label),
):
    print(caption, value)

Message: Congratulations! You've won a free iPhone. Click here to claim now!
Predicted Label: spam
