In [558]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [559]:
# Load the raw SMS spam collection; the file is decoded with the 'latin' codec.
dataset = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin',
)


In [560]:
# Preview the first five rows of the raw frame.
print(dataset.head(n=5))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [561]:
# Keep only the label and text columns, under descriptive names.
#
# The rename happens first and the selection uses the *new* names so that this
# cell is safe to re-run: `rename` ignores keys that no longer exist, whereas
# rebuilding the frame with `columns=['v1', 'v2']` after the rename has already
# happened would silently produce two all-NaN columns.
dataset = dataset.rename(columns={'v1': 'Identifier', 'v2': 'Message'})[['Identifier', 'Message']]
print(dataset.columns)
print(dataset.head())

Index(['Identifier', 'Message'], dtype='object')
  Identifier                                            Message
0        ham  Go until jurong point, crazy.. Available only ...
1        ham                      Ok lar... Joking wif u oni...
2       spam  Free entry in 2 a wkly comp to win FA Cup fina...
3        ham  U dun say so early hor... U c already then say...
4        ham  Nah I don't think he goes to usf, he lives aro...


In [562]:
# Read the frame's dimensions from its two axes
rows = len(dataset.index)
columns = len(dataset.columns)

# Report both counts
print(f"Number of rows: {rows}")
print(f"Number of columns: {columns}")

Number of rows: 5572
Number of columns: 2


In [563]:
# Encode the text labels numerically: spam -> 0, ham -> 1.
spam_rows = dataset['Identifier'] == 'spam'
dataset.loc[spam_rows, 'Identifier'] = 0
ham_rows = dataset['Identifier'] == 'ham'
dataset.loc[ham_rows, 'Identifier'] = 1

In [564]:
# Show the frame after label encoding (Identifier: 0 = spam, 1 = ham).
print(dataset)

     Identifier                                            Message
0             1  Go until jurong point, crazy.. Available only ...
1             1                      Ok lar... Joking wif u oni...
2             0  Free entry in 2 a wkly comp to win FA Cup fina...
3             1  U dun say so early hor... U c already then say...
4             1  Nah I don't think he goes to usf, he lives aro...
...         ...                                                ...
5567          0  This is the 2nd time we have tried 2 contact u...
5568          1              Will Ì_ b going to esplanade fr home?
5569          1  Pity, * was in mood for that. So...any other s...
5570          1  The guy did some bitching but I acted like i'd...
5571          1                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [565]:
# Separate the data into message texts (x) and labels (y).
#
# Every character that is not a letter, digit or whitespace is removed with a
# single vectorised regex replace. Compared with looping over `range(5572)`
# and assigning `x[i]` one element at a time, this:
#   * works for any number of rows (no hard-coded length),
#   * does not assume the index is exactly 0..n-1,
#   * builds a new Series instead of writing into a column of `dataset`,
#     so `dataset['Message']` keeps the raw text and re-running is harmless.
x = dataset['Message'].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
y = dataset['Identifier']
x

0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in 2 a wkly comp to win FA Cup fina...
3             U dun say so early hor U c already then say
4       Nah I dont think he goes to usf he lives aroun...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                   Will  b going to esplanade fr home
5569    Pity  was in mood for that Soany other suggest...
5570    The guy did some bitching but I acted like id ...
5571                            Rofl Its true to its name
Name: Message, Length: 5572, dtype: object

In [566]:
# Show the message texts after punctuation removal.
print(x)

0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in 2 a wkly comp to win FA Cup fina...
3             U dun say so early hor U c already then say
4       Nah I dont think he goes to usf he lives aroun...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                   Will  b going to esplanade fr home
5569    Pity  was in mood for that Soany other suggest...
5570    The guy did some bitching but I acted like id ...
5571                            Rofl Its true to its name
Name: Message, Length: 5572, dtype: object


In [567]:
# Hold out 10% of the messages for testing; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=69,
)

In [568]:
# Vectorisation: turn the message texts into TF-IDF feature matrices.
# The vocabulary is learned from the training messages only and then reused,
# unchanged, to transform the test messages.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Cast the labels to an integer dtype before they are given to the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')


In [569]:
# Show the training portion of the message texts.
print(x_train)

4096                           i am going to bed now prin
866     Same here but I consider walls and bunkers and...
1732                            K can that happen tonight
1260    Yo Im at my parents gettin cash Good news we p...
5415    You should get more chicken broth if you want ...
                              ...                        
3633    Its a big difference  ltgt  versus  ltgt  ever...
439         But i have to I like to have love and arrange
1626                              Dear how you Are you ok
2667                         Why de You looking good only
4041                               What time do u get out
Name: Message, Length: 5014, dtype: object


In [570]:
# Show the matrix produced by `feature_extraction.fit_transform(x_train)`.
print(x_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 38315 stored elements and shape (5014, 8524)>
  Coords	Values
  (0, 5956)	0.7546490176243329
  (0, 1436)	0.5321376348594697
  (0, 3470)	0.3838416310990759
  (1, 4834)	0.32546807303051384
  (1, 2677)	0.16282219001942044
  (1, 3756)	0.3030757152570289
  (1, 5761)	0.22207249533721837
  (1, 3584)	0.23754875426732644
  (1, 5659)	0.33697932777034834
  (1, 5775)	0.24103041065715688
  (1, 4256)	0.14889521662262864
  (1, 3995)	0.2572546129780365
  (1, 6681)	0.24103041065715688
  (1, 1724)	0.35320353009122785
  (1, 8064)	0.33697932777034834
  (1, 2192)	0.33697932777034834
  (2, 7636)	0.6520306535036521
  (2, 3643)	0.7581926054055133
  (3, 2700)	0.45569586891273756
  (3, 5732)	0.37804789878642286
  (3, 5237)	0.35167653461397286
  (3, 3484)	0.2182128084691677
  (3, 1862)	0.27759032030928643
  (3, 3412)	0.37260866395061815
  (3, 5605)	0.3590641132424909
  :	:
  (5008, 8356)	0.332011238362873
  (5008, 6268)	0.332011238362873
  (5008, 7634

In [571]:
# Training the model: a logistic-regression classifier created with default arguments
model = LogisticRegression()

In [572]:
# Fit the classifier on the TF-IDF training features and integer labels.
model.fit(X=x_train_features, y=y_train)

In [573]:
# Evaluation on the data the model was fitted on
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)


In [574]:
# Report the fraction of training messages classified correctly.
print(
    'Accuracy on training data: ',
    accuracy_on_training_data,
)

Accuracy on training data:  0.9680893498205025


In [575]:
# Predicting on the held-out test data
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [576]:
# Report the fraction of held-out messages classified correctly.
print(
    'Accuracy on test data: ',
    accuracy_on_test_data,
)

Accuracy on test data:  0.96415770609319


In [None]:
def classify_sms(message):
    """Classify one SMS text with the fitted vectorizer and model.

    The message is first stripped of every character that is not a letter,
    digit or whitespace -- the same cleaning that was applied to the training
    messages -- so that it is tokenised the same way the training text was
    (e.g. "don't" becomes "dont" in both places).

    Parameters
    ----------
    message : str
        Raw message text as typed by the user.

    Returns
    -------
    str
        "Ham" when the model predicts label 1, otherwise "Spam".
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", message)
    input_feature = feature_extraction.transform([cleaned])
    prediction = model.predict(input_feature)
    return "Ham" if prediction[0] == 1 else "Spam"


# Ask how many messages to classify; a non-numeric answer is reported and
# treated as zero instead of aborting the cell with a ValueError.
try:
    a = int(input("Enter the no of sms :"))
except ValueError:
    print("Please enter a whole number.")
    a = 0

for i in range(a):
    input_data = input("Enter the string :")
    print(classify_sms(input_data))