# SPAM MAIL DETECTION 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the mail dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('mail_data.csv')

In [3]:
# Preview the first rows to confirm the two columns: Category (label) and Message (text).
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# DATA CLEANING

In [4]:
# Count missing values per column before using the data for modelling.
data.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Summary of both text columns: row count, number of distinct values, most frequent value and its frequency.
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# LABEL ENCODING

In [7]:
# Replace the text labels with numeric codes in place: spam -> 0, ham -> 1.
# The column keeps its object dtype; it is cast to int later, before model fitting.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code


We encode the labels numerically: spam becomes 0 and ham becomes 1.

# SPLITTING THE DATA

In [8]:
# x holds the raw message text (model input); y holds the encoded label (0 = spam, 1 = ham).
x, y = data['Message'], data['Category']

In [9]:
# Inspect the message texts that will be vectorized.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [10]:
# Inspect the encoded labels (0 = spam, 1 = ham); the Series is still object dtype at this point.
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score computed from it)
# is identical on each Restart & Run All.
# stratify=y keeps the spam/ham ratio the same in both parts; the dataset is
# imbalanced (4825 ham out of 5572 messages), so an unstratified split can
# leave the test set with a noticeably different share of spam.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Sanity check: the training features and training labels must have the same number of rows.
print(x_train.shape , y_train.shape)

(4457,) (4457,)


# FEATURE EXTRACTION
Transform the raw text (object dtype) messages into numeric TF-IDF feature vectors that logistic regression can use.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
# TF-IDF vectorizer used for both training and prediction:
#   lowercase=True       - fold case before tokenizing
#   stop_words='english' - drop common English words using scikit-learn's built-in list
#   min_df=1             - keep every term that occurs in at least one document
fea_ext = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [16]:
# Learn the vocabulary and IDF weights from the training messages only, then vectorize them.
x_train_feat=fea_ext.fit_transform(x_train)
# Reuse the fitted vectorizer on the test messages (transform only, no refit) so that
# nothing from the held-out data influences the learned features.
x_test_feat=fea_ext.transform(x_test)

In [17]:
# The label Series are object dtype after encoding; cast both to integers
# so the classifier and the metrics receive numeric targets.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [18]:
# Print the sparse TF-IDF matrix; each entry is shown as (row, column) followed by its weight.
print(x_train_feat)

  (0, 4814)	0.5314159740771697
  (0, 1049)	0.4466950165631421
  (0, 1824)	0.36320812805612684
  (0, 4397)	0.45762271456933135
  (0, 3266)	0.42038307708259076
  (1, 4045)	0.19228025306881263
  (1, 3314)	0.19794043647383822
  (1, 6435)	0.2527892910607774
  (1, 3405)	0.31963523988193576
  (1, 1543)	0.2842397217019245
  (1, 6234)	0.2068487996738421
  (1, 7256)	0.22253477206479502
  (1, 5929)	0.3019374807919301
  (1, 1559)	0.33219199978791253
  (1, 6495)	0.3057353572485187
  (1, 4376)	0.34014997921580803
  (1, 4957)	0.20643895586902122
  (1, 6796)	0.36244651878389494
  (2, 1440)	0.4152964942359197
  (2, 2301)	0.2457625279087674
  (2, 1138)	0.29883510342625486
  (2, 4402)	0.3491921417300882
  (2, 633)	0.4152964942359197
  (2, 716)	0.4152964942359197
  (2, 1295)	0.39596221056073766
  :	:
  (4454, 2355)	0.2727244455430103
  (4454, 3975)	0.16232770622093343
  (4454, 2475)	0.27591707200219445
  (4454, 1121)	0.24996738666483556
  (4454, 3360)	0.2343770829958558
  (4454, 3927)	0.16812453587013215


# MODEL TRAINING

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Logistic regression classifier with scikit-learn's default settings.
logstr = LogisticRegression()

In [21]:
# Fit the classifier on the TF-IDF training features and the integer labels (0 = spam, 1 = ham).
logstr.fit(x_train_feat,y_train)

In [22]:
# Predict labels for the held-out test messages.
# NOTE(review): the variable name is misspelled ("detecftion"); it is left unchanged
# because the following cell displays it under this exact name.
mail_detecftion = logstr.predict(x_test_feat)

In [23]:
# Show the predicted test labels (0 = spam, 1 = ham).
mail_detecftion

array([1, 1, 0, ..., 0, 1, 1])

# BUILDING DETECTION SYSTEM 

In [35]:
# Classify a single message typed by the user with the fitted vectorizer and model.
# The vectorizer expects an iterable of documents, hence the one-element list.
enter_your_mail = [input("Enter your email: ")]

# Transform the email text using the same vectorizer that was fitted on the training data
mail_feature = fea_ext.transform(enter_your_mail)

if mail_feature.nnz == 0:
    # No stored entries means the message is blank or consists only of stop words /
    # words that were never seen during training. The feature vector is then all
    # zeros and the model would answer from its intercept alone, so report that
    # the message cannot be classified instead of printing a misleading label.
    print('Cannot classify: the message is empty or contains no words seen during training')
else:
    # Make the prediction using the trained model (logstr)
    mail_detection = logstr.predict(mail_feature)

    # Labels were encoded as spam = 0 and ham = 1
    if mail_detection[0] == 1:
        print('Ham mail')  # Non-spam (ham) email
    else:
        print('Spam mail')



Enter your email: 07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow
Spam mail


# EVALUATING THE MODEL 


In [36]:
from sklearn.metrics import accuracy_score

In [38]:
# Accuracy on the data the model was fitted on; compare with the test accuracy
# to judge how much the model overfits.
pred_on_train = logstr.predict(x_train_feat)
accuracy_score_on_train = accuracy_score(y_true=y_train, y_pred=pred_on_train)
accuracy_score_on_train

0.9674669059905766

In [39]:
# Accuracy on the held-out messages. For context: ham makes up 4825 of the 5572
# messages (about 86.6%), so always predicting ham would already reach roughly that score.
pred_on_test = logstr.predict(x_test_feat)
accuracy_score_on_test = accuracy_score(y_true=y_test, y_pred=pred_on_test)
accuracy_score_on_test

0.9605381165919282