In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Loading the dataset
# The path is written as a raw string. The previous literal mixed escaped
# backslashes (`\\`) with unescaped ones (`\D`, `\s`); the unescaped ones are
# invalid escape sequences, which Python warns about and plans to reject.
# The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider a location
# relative to the notebook so the cell runs on other machines.
DATA_PATH = r"C:\Users\gsury\Downloads\spam.csv"
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Keep only the two columns used below and give them descriptive names.
df = df[['v1', 'v2']]
df.columns = ['label', 'text']



In [2]:
# Preview the first rows of the label/text frame.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summarize the frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Count the missing values in each column.
df.isna().sum()

label    0
text     0
dtype: int64

In [8]:
# Encode the target as integers: 1 for spam, 0 for everything else.
# Matching on both 'spam' and 1 keeps this cell idempotent. The previous
# `1 if x == 'spam' else 0` turned every label into 0 when the cell was run a
# second time, because the already-encoded 1s no longer equal 'spam'.
df['label'] = df['label'].isin(['spam', 1]).astype('int64')

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Turn the raw text into token-count features. The vectorizer is fitted on the
# training text only and then reused, unchanged, to transform the test text.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
# Print the vectorized training data; each output line is a (row, column)
# position followed by the count stored there.
print(X_train_vectorized)

  (0, 4773)	1
  (0, 3637)	1
  (0, 6786)	1
  (0, 5872)	1
  (0, 1415)	1
  (0, 6455)	1
  (0, 3416)	1
  (0, 1105)	1
  (0, 4646)	1
  (0, 4549)	1
  (0, 1758)	1
  (0, 4390)	1
  (0, 4988)	1
  (0, 4912)	1
  (0, 7674)	1
  (0, 3308)	1
  (0, 4661)	1
  (1, 1218)	1
  (1, 4869)	1
  (1, 3237)	1
  (1, 3749)	1
  (1, 6494)	1
  (1, 1580)	1
  (1, 2741)	1
  (1, 5178)	1
  :	:
  (4452, 1833)	1
  (4452, 1628)	1
  (4452, 3532)	1
  (4452, 3509)	1
  (4453, 1608)	1
  (4453, 4147)	1
  (4453, 4019)	1
  (4453, 6299)	1
  (4454, 3383)	1
  (4454, 5335)	1
  (4454, 6306)	1
  (4454, 5590)	1
  (4454, 3028)	1
  (4454, 6305)	1
  (4455, 3841)	1
  (4455, 4827)	1
  (4455, 6002)	1
  (4455, 6922)	1
  (4456, 3637)	2
  (4456, 4823)	1
  (4456, 3182)	1
  (4456, 3361)	1
  (4456, 7558)	1
  (4456, 2836)	1
  (4456, 4464)	1


In [11]:
# Print the vectorized test data; each output line is a (row, column)
# position followed by the count stored there.
print(X_test_vectorized)

  (0, 1808)	1
  (0, 2759)	1
  (0, 3067)	1
  (0, 3337)	1
  (0, 3567)	1
  (0, 3873)	1
  (0, 4685)	1
  (0, 4773)	1
  (0, 4776)	1
  (0, 7040)	1
  (0, 7482)	1
  (1, 923)	1
  (1, 955)	1
  (1, 1036)	1
  (1, 1987)	1
  (1, 2023)	1
  (1, 2375)	2
  (1, 2731)	1
  (1, 2836)	1
  (1, 2954)	1
  (1, 3298)	1
  (1, 3415)	1
  (1, 3600)	1
  (1, 3739)	1
  (1, 3837)	1
  :	:
  (1111, 7152)	1
  (1111, 7386)	1
  (1111, 7425)	1
  (1112, 3361)	1
  (1112, 4495)	1
  (1112, 4614)	1
  (1112, 6251)	2
  (1112, 6763)	1
  (1112, 6781)	1
  (1112, 7475)	1
  (1112, 7662)	1
  (1112, 7679)	1
  (1113, 2422)	1
  (1113, 3129)	1
  (1113, 3416)	1
  (1113, 6786)	1
  (1114, 305)	1
  (1114, 1292)	1
  (1114, 1353)	1
  (1114, 1454)	1
  (1114, 3361)	1
  (1114, 4235)	1
  (1114, 4723)	1
  (1114, 5247)	1
  (1114, 7679)	1


In [7]:
# Build and train a multinomial Naive Bayes classifier on the training counts.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict a label for every row of the vectorized test set.
y_pred = model.predict(X_test_vectorized)

In [8]:
# Show the predicted test labels (the printed output is abbreviated with "...").
print(y_pred)

[0 0 1 ... 0 0 1]


In [9]:
# Score the predictions against the held-out labels.
# accuracy_score, confusion_matrix and classification_report are already
# imported in the notebook's first cell, so the duplicate import that used to
# sit here has been dropped to keep all imports in one place.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


In [12]:
# Report the accuracy formatted to two decimal places.
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.98


In [13]:
# Print the heading and the confusion matrix on separate lines.
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[963   2]
 [ 16 134]]


In [14]:
# Print the heading and the classification report on separate lines.
print("Classification Report:", classification_rep, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

