**Import Dependencies**

In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer

**Import Dataset**

In [24]:
# Read the raw mail dataset into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute ('/content/...'); presumably a Colab
# upload location -- confirm before running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
# Keep every non-null cell of `df` as-is and replace null cells with an
# empty string; the result is a new frame, `df` itself is not modified.
null_data = df.where(cond=pd.notnull(df), other='')

In [26]:
# Preview the first rows of the null-filled frame.
null_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# (rows, columns) of the null-filled frame.
null_data.shape

(5572, 2)

**Label Encoding**

In [28]:
# Encode the target column in place: spam -> 0, ham -> 1.
null_data.loc[null_data['Category'] == 'spam', 'Category'] = 0
null_data.loc[null_data['Category'] == 'ham', 'Category'] = 1

# Fail fast if any label was neither 'spam' nor 'ham' (for example the ''
# written by the null fill). Such a value would otherwise pass through
# untouched and only fail later, at the integer cast, with a less helpful
# message. Re-running this cell is safe: already-encoded 0/1 values pass.
unexpected_labels = set(null_data['Category'].unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(
        f"Unexpected Category values after encoding: {sorted(map(str, unexpected_labels))}"
    )

Spam --- 0

Ham / Not Spam --- 1

**Define Target & Features**

In [29]:
# Features are the raw message texts; the target is the Category column
# (encoded above as 0 = spam, 1 = ham).
X, y = null_data['Message'], null_data['Category']

**Train Test Split**

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so
# the split is reproducible across runs.
# NOTE(review): the split is not stratified on y -- consider `stratify=y`
# if the class balance matters; confirm against the label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [33]:
# Shapes of the full feature series, then the training and test partitions,
# one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


**Feature Extraction**

In [35]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and
# keeps every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert y_train and y_test to integer numpy arrays; the encoded labels
# are stored in an object-typed column, so cast them explicitly.
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)

In [36]:
# Show the raw training messages (a pandas Series; the printout is abbreviated).
print(X_train)

15      XXXMobileMovieClub: To use your credit, click ...
1301    Those cocksuckers. If it makes you feel better...
3059    You are now unsubscribed all services. Get ton...
3123    Free entry in 2 a weekly comp for a chance to ...
1884    Come to me, slave. Your doing it again ... Goi...
                              ...                        
4722           Yup. Anything lor, if u dun wan it's ok...
3340    Babe !!!! I LOVE YOU !!!! *covers your face in...
5426        Oh yeah! And my diet just flew out the window
3064    Hi babe its Jordan, how r u? Im home from abro...
3398                         Heehee that was so funny tho
Name: Message, Length: 4457, dtype: object


In [37]:
# Show the TF-IDF training matrix; each printed entry is a
# (row, column) index pair followed by its weight.
print(X_train_features)

  (0, 5386)	0.2648741730301932
  (0, 1861)	0.16043711716232983
  (0, 3451)	0.19484618699592876
  (0, 4367)	0.1523073351138082
  (0, 6906)	0.13677171397395496
  (0, 4046)	0.21913101623812667
  (0, 7178)	0.4535483672374851
  (0, 1799)	0.4629246649360679
  (0, 2030)	0.21302806494999985
  (0, 7015)	0.17376566266367682
  (0, 7436)	0.5297483460603865
  (1, 7176)	0.2947088421443434
  (1, 1174)	0.22187863850377643
  (1, 3673)	0.3052852825204281
  (1, 4739)	0.3201919379958315
  (1, 3005)	0.3052852825204281
  (1, 7384)	0.3201919379958315
  (1, 3649)	0.3201919379958315
  (1, 1304)	0.2119415940005754
  (1, 2744)	0.4051896894228766
  (1, 4238)	0.24151382647434155
  (1, 1839)	0.3201919379958315
  (2, 6401)	0.336441667110054
  (2, 6932)	0.20570224914890112
  (2, 3135)	0.336441667110054
  :	:
  (4454, 2259)	0.4846951990418813
  (4454, 7302)	0.502089824313186
  (4454, 7459)	0.3133856557898305
  (4454, 4806)	0.2944546383615612
  (4454, 3785)	0.22527363139095744
  (4455, 102)	0.30455521325651647
  (4455,

**Model Building**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [38]:
# Logistic-regression classifier with default hyperparameters, trained on
# the TF-IDF training features and the integer labels.
model = LogisticRegression()
model.fit(X=X_train_features, y=y_train)

**Model Evaluation on Training Data**

In [46]:
# Predict on the same data the model was trained on and report the
# fraction of labels it reproduces.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
accuracy_on_training_data


0.9694862014808167

**Model Evaluation on Test Data**

In [49]:
# Predict on the held-out test features and report accuracy against the
# true test labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
accuracy_on_test_data

0.9641255605381166

**Build a Predictive System**

In [50]:
# A single example message, wrapped in a list because the vectorizer
# expects an iterable of documents.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no re-fit).
input_data_features = feature_extraction.transform(input_mail)

# Classify the message and show the raw predicted label array.
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 means ham; any other label is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail
