In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the SMS dataset from a CSV in the working directory into a DataFrame.
data = pd.read_csv('spam.csv')

In [3]:
# Preview the first rows to see the available columns and a few sample messages.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarize row count, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Features are the raw message text; targets are the 'Category' labels.
X, y = data['Message'], data['Category']

In [7]:
# Display the feature Series (the 'Message' column).
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [8]:
# Display the target Series (the 'Category' column).
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Shape of the training portion of the split.
X_train.shape

(4457,)

In [11]:
# Shape of the held-out test portion of the split.
X_test.shape

(1115,)

In [12]:
# Pipeline: TF-IDF vectorization of the message text followed by logistic regression.
# Both steps use their library defaults.
logistic_regression_model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(),
)
# Fit on the training split; the fitted pipeline is the cell's displayed value.
logistic_regression_model.fit(X_train, y_train)

In [13]:
# Predict labels for the held-out messages with the fitted logistic-regression pipeline.
y_pred_lr = logistic_regression_model.predict(X=X_test)

In [14]:
# Inspect the predicted labels for the test split.
y_pred_lr

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [15]:
# Compare logistic-regression predictions with the true test labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Accuracy for Logistic Regression:", accuracy_lr)
print(
    "Classification Report for Logistic Regression:",
    classification_report(y_true=y_test, y_pred=y_pred_lr),
    sep="\n",
)

Accuracy for Logistic Regression: 0.9748878923766816
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [16]:
# Pipeline: TF-IDF vectorization of the message text followed by a random forest.
# A random forest is a stochastic learner, so without a fixed seed the metrics
# reported below change on every Restart & Run All. random_state=42 pins the
# forest and reuses the seed already chosen for the train/test split.
random_forest_model = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=42),
)
# Fit on the training split; the fitted pipeline is the cell's displayed value.
random_forest_model.fit(X_train, y_train)



In [17]:
# Predict labels for the held-out messages with the fitted random-forest pipeline.
y_pred_rf = random_forest_model.predict(X=X_test)

In [18]:
# Compare random-forest predictions with the true test labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Accuracy for Random Forest Classifier:", accuracy_rf)
print(
    "Classification Report for Random Forest Classifier:",
    classification_report(y_true=y_test, y_pred=y_pred_rf),
    sep="\n",
)

Accuracy for Random Forest Classifier: 0.9838565022421525
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.88      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [19]:
# Pipeline: TF-IDF vectorization of the message text followed by AdaBoost.
# AdaBoost forwards a seed to each boosted base estimator; leaving it unset
# means the reported metrics are not guaranteed to reproduce across runs.
# random_state=42 pins it and reuses the seed chosen for the train/test split.
adaboost_model = make_pipeline(
    TfidfVectorizer(),
    AdaBoostClassifier(random_state=42),
)
# Fit on the training split; the fitted pipeline is the cell's displayed value.
adaboost_model.fit(X_train, y_train)

In [20]:

# Predict labels for the held-out messages with the fitted AdaBoost pipeline.
y_pred_ab = adaboost_model.predict(X=X_test)

In [21]:
# Compare AdaBoost predictions with the true test labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
accuracy_ab = accuracy_score(y_true=y_test, y_pred=y_pred_ab)
print("Accuracy for AdaBoost Classifier:", accuracy_ab)
print(
    "Classification Report for AdaBoost Classifier:",
    classification_report(y_true=y_test, y_pred=y_pred_ab),
    sep="\n",
)

Accuracy for AdaBoost Classifier: 0.979372197309417
Classification Report for AdaBoost Classifier:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.87      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [22]:
# Pipeline: TF-IDF vectorization of the message text followed by k-nearest neighbours.
# Both steps use their library defaults.
knn_model = make_pipeline(
    TfidfVectorizer(),
    KNeighborsClassifier(),
)
# Fit on the training split; the fitted pipeline is the cell's displayed value.
knn_model.fit(X_train, y_train)


In [23]:
# Predict labels for the held-out messages with the fitted KNN pipeline.
y_pred_knn = knn_model.predict(X=X_test)

In [24]:
# Compare KNN predictions with the true test labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print("Accuracy for KNN Classifier:", accuracy_knn)
print(
    "Classification Report for KNN Classifier:",
    classification_report(y_true=y_test, y_pred=y_pred_knn),
    sep="\n",
)

Accuracy for KNN Classifier: 0.9192825112107623
Classification Report for KNN Classifier:
              precision    recall  f1-score   support

         ham       0.91      1.00      0.96       966
        spam       1.00      0.40      0.57       149

    accuracy                           0.92      1115
   macro avg       0.96      0.70      0.76      1115
weighted avg       0.93      0.92      0.90      1115

