# **1. Business & Data Understanding :-**

**Source** - [SMS Spam Collection Dataset (Kaggle)](https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset)

# **2. Data Preparation :-**

In [None]:
# importing the dependencies

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw SMS spam CSV into a DataFrame (the file is decoded as latin-1).

DATA_PATH = "/content/spam.csv"
data = pd.read_csv(DATA_PATH, encoding="latin-1")

# Report (rows, columns) of the freshly loaded frame.
print(f"\nDimension of dataset = {data.shape} \n")

# Preview the first five rows as the cell's displayed result.
data.head()


Dimension of dataset = (5572, 5) 



Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Check the column data types and whether any values are missing.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.

data.info()

# Count of missing values per column (displayed as the cell's result).

print("\nCount of missing values (if any) = \n")
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None

Count of missing values (if any) = 



v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
# 'v1' and 'v2' have no missing values, whereas the three 'Unnamed' columns
# are almost entirely empty, so only those three columns are discarded.

sparse_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
clean_data = data.drop(columns=sparse_columns)

# Report (rows, columns) of the reduced frame.
print(f"\nDimension of new dataset = {clean_data.shape} \n")


Dimension of new dataset = (5572, 2) 



In [None]:
# Give the two remaining columns descriptive names:
#   'v1' --> 'category' (the ham / spam label)
#   'v2' --> 'message'  (the SMS text)

column_names = {'v1': 'category', 'v2': 'message'}
clean_data = clean_data.rename(columns=column_names)

clean_data.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Before encoding the labels as numbers, list the distinct values
# that occur in the 'category' column.

category_values = clean_data['category'].unique()

print("\nUnique values in 'category' column = \n")
category_values


Unique values in 'category' column = 



array(['ham', 'spam'], dtype=object)

In [None]:
# Label-encode the 'category' column:
#   0 --> ham
#   1 --> spam
#
# The column is cast to int64 explicitly instead of relying on replace() to
# downcast the object column on its own: recent pandas releases deprecate that
# implicit downcasting. The explicit cast also raises immediately if a label
# other than 'ham' / 'spam' is present.
# Re-running the cell is safe: already-encoded 0/1 values pass through unchanged.

label_codes = {'ham': 0, 'spam': 1}

clean_data = (
    clean_data
    .replace({'category': label_codes})
    .astype({'category': 'int64'})
)

clean_data.head()

Unnamed: 0,category,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# **3. Modelling :-**

In [None]:
# Split the frame into the input text (X) and the encoded label (Y).

X, Y = clean_data['message'], clean_data['category']

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split the same on every run.

split_parts = train_test_split(X, Y, random_state=5, test_size=0.2)
X_train, X_test, Y_train, Y_test = split_parts

# Show the sizes of the full set and of each partition.
print(f"\nX shape = {X.shape}\n")
print(f"\nX_train shape = {X_train.shape}\n")
print(f"\nX_test shape = {X_test.shape}\n")


X shape = (5572,)


X_train shape = (4457,)


X_test shape = (1115,)



In [None]:
# Turn the text of the 'message' column into TF-IDF feature vectors.
# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer.

feat_extract = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

X_train_vector = feat_extract.fit_transform(X_train)
X_test_vector = feat_extract.transform(X_test)

In [None]:
# Create a logistic-regression classifier with its default settings.

model = LogisticRegression()

# Fit the classifier on the TF-IDF training vectors and their labels.

model.fit(X=X_train_vector, y=Y_train)

# **4. Evaluation :-**

In [None]:
# Evaluate with accuracy_score() -- on the training data.
# accuracy_score expects (y_true, y_pred): the true labels go first and the
# model's predictions second.

X_train_prediction = model.predict(X_train_vector)

training_accuracy = accuracy_score(Y_train, X_train_prediction)

print(f"\nAccuracy score on training dataset = {training_accuracy}")


Accuracy score on training dataset = 0.9685887368184878


In [None]:
# Evaluate with accuracy_score() -- on the held-out testing data.
# accuracy_score expects (y_true, y_pred): the true labels go first and the
# model's predictions second.

X_test_prediction = model.predict(X_test_vector)

testing_accuracy = accuracy_score(Y_test, X_test_prediction)

print(f"\nAccuracy score on testing dataset = {testing_accuracy}")


Accuracy score on testing dataset = 0.9659192825112107


# **5. Deployment :-**

In [None]:
# Read one message from the user.
sms = input("\nEnter your message = ")

# Wrap the single message in a list and convert it into a feature vector
# with the vectorizer that was fitted on the training messages.
sms_list = [sms]
sms_vector = feat_extract.transform(sms_list)

# predict() returns an array of labels; the first entry belongs to the
# message entered above (0 = ham, anything else is reported as spam).
prediction = model.predict(sms_vector)
is_ham = prediction[0] == 0

print("\nOutput : Not Spam\n" if is_ham else "\nOutput : Spam\n")


Enter your message = call here for discount

Output : Not Spam



# **6. Maintenance :-**