In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("mail_data.csv")

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
df.loc[df["Category"] == "ham" , "Label",] = 0
df.loc[df["Category"] == "spam" , "Label",] = 1

In [7]:
df.head()

Unnamed: 0,Category,Message,Label
0,ham,"Go until jurong point, crazy.. Available only ...",0.0
1,ham,Ok lar... Joking wif u oni...,0.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1.0
3,ham,U dun say so early hor... U c already then say...,0.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0.0


In [8]:
x = df["Message"]
y = df["Label"].convert_dtypes(int)

In [9]:
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Label, Length: 5572, dtype: Int64


In [10]:
df["Message"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [11]:
df["Message"] = df["Message"].apply(lambda x : x.lower())

In [12]:
df["Message"][0]

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

## split train and test data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size= 0.2 , random_state= 42)

## vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
vector = TfidfVectorizer()

In [17]:
x_train = vector.fit_transform(x_train)
x_test = vector.transform(x_test)

## model

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
model = LogisticRegression()

In [20]:
model.fit(x_train , y_train)

## evaluating the model

In [21]:
from sklearn.metrics import accuracy_score , mean_squared_error
import numpy as np

In [22]:
# accuracy on test data
y_pred = model.predict(x_test)

In [23]:
accuracy_score(y_test , y_pred)

0.9748878923766816

In [24]:
mse = mean_squared_error(y_test , y_pred)

In [25]:
rmse = np.sqrt(mse)
rmse

0.15846800189097604

## save the model

In [26]:
from joblib import dump , load

In [27]:
dump(model , "EMail_Spam_Detection.joblib")

['EMail_Spam_Detection.joblib']

## test the model

In [28]:
user = pd.DataFrame({
    "text" : [input("Your email: ")]
})

Your email: free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's


In [29]:
user

Unnamed: 0,text
0,free entry in 2 a wkly comp to win fa cup fina...


In [30]:
user_data_feature = vector.transform(user["text"])

## use the model

In [31]:
my_model = load("EMail_Spam_Detection.joblib")

In [32]:
user["predicted"] = my_model.predict(user_data_feature)
user["predicted"] = user["predicted"].apply(lambda x : "Spam" if x == 1 else "Not Spam")

In [33]:
user

Unnamed: 0,text,predicted
0,free entry in 2 a wkly comp to win fa cup fina...,Spam
