In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spam-sms-detection/spam.csv


In [2]:
import pandas as pd

In [3]:
# Read the CSV file with the specified parameters
df = pd.read_csv("/kaggle/input/spam-sms-detection/spam.csv", sep='\t', encoding='ISO-8859-1')

# Split the 'v1,v2,,,' column into 'label' and 'message' columns based on the comma separator
df[['label', 'message']] = df['v1,v2,,,'].str.split(',', n=1, expand=True)

# Drop the original 'v1,v2,,,' column if needed
df.drop('v1,v2,,,', axis=1, inplace=True)

# Print the first 10 rows to verify the result
df.head(10)

Unnamed: 0,label,message
0,ham,"""Go until jurong point, crazy.. Available only..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"""Nah I don't think he goes to usf, he lives ar..."
5,spam,"""FreeMsg Hey there darling it's been 3 week's ..."
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
bag_of_words = cv.fit_transform(corpus).toarray()

In [8]:
spam_or_not = pd.get_dummies(df['label'])

# Select the values of the second column after one-hot encoding
spam_or_not = spam_or_not.iloc[:, 1].values

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bag_of_words, spam_or_not, test_size = 0.20, random_state = 0)

In [10]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

spam_pred=spam_detect_model.predict(X_test)
spam_pred

array([False, False, False, ..., False, False, False])

In [11]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test,spam_pred)
confusion_m 

array([[1112,    3],
       [   0,    0]])

In [12]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,spam_pred)
accuracy

0.9973094170403587