# Step 1 : Business Problem Understanding
**Create an ML model that can predict whether a given mail is spam or ham.**

# Step 2 : Data Understanding

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the mail dataset from a CSV file in the current working directory.
df = pd.read_csv('mail_data.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Show column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [6]:
# Encode the target column in place: spam -> 0, ham -> 1.
is_spam = df['Category'] == 'spam'
is_ham = df['Category'] == 'ham'
df.loc[is_spam, 'Category'] = 0
df.loc[is_ham, 'Category'] = 1

In [7]:
# Features are the raw message text; the target is the Category column.
x, y = df['Message'], df['Category']

In [8]:
# Text dump of the message column.
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [9]:
# Replace missing values with empty strings.
df = df.where(pd.notnull(df), '')

# `df.where` returns a NEW frame, so the x / y Series sliced earlier still
# point at the uncleaned data. Re-derive them here so the cleanup actually
# reaches the train/test split below.
x = df['Message']
y = df['Category']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [11]:
# Shapes of the full, training and testing feature series, in that order.
for messages in (x, x_train, x_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


In [12]:
# Shapes of the full, training and testing label series, in that order.
for labels in (y, y_train, y_test):
    print(labels.shape)

(5572,)
(4457,)
(1115,)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lower-cased text with English stop words removed.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary on the training messages only, then reuse it for the test messages.
X_train_features = feature_extraction.fit_transform(x_train)
X_test_features = feature_extraction.transform(x_test)

# Cast the labels to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [14]:
# Text dump of the training messages.
print(x_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [15]:
from sklearn.preprocessing import LabelEncoder

# NOTE(review): none of the later cells shown in this notebook read the
# variables produced here; the models are fitted on y_train / y_test directly.
label_encoder = LabelEncoder()

# String views of the labels.
y_train_str, y_test_str = y_train.astype(str), y_test.astype(str)

# Fit the encoder on the training labels, then apply the same mapping to the test labels.
y_train_encoded = label_encoder.fit_transform(y_train_str)
y_test_encoded = label_encoder.transform(y_test_str)

# Integer-typed copies of the encoded labels.
y_train_int, y_test_int = y_train_encoded.astype(int), y_test_encoded.astype(int)


In [16]:
# Text dump of the TF-IDF training matrix.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# Model 1 - Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X=X_train_features, y=y_train)

In [18]:
from sklearn.metrics import accuracy_score

# Score the logistic-regression model on the data it was trained on.
prediction_on_training_data = lr.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [19]:
# Report the training accuracy computed in the previous cell.
print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  0.9676912721561588


In [20]:
# Score the logistic-regression model on the held-out test messages.
prediction_on_testing_data = lr.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [21]:
# Report the test accuracy computed in the previous cell.
print("Accuracy on testing data : ", accuracy_on_testing_data)

Accuracy on testing data :  0.9668161434977578


# Predicting with New Data

In [22]:
# Classify one hand-written message with the logistic-regression model.
input_your_mail = ["hi how r u"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = lr.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [23]:
# Classify a promotional-style message with the logistic-regression model.
input_your_mail = ["Dear Learner, Are you ready to *supercharge* your AI abilities? Join our exclusive Mastering Generative AI course and become a master of cutting-edge techniques."]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = lr.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [24]:
# Classify a prize-notification message with the logistic-regression model.
input_your_mail = ["This is the 2nd time we have tried to contact you. u have won prize $10000"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = lr.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


# Model 2 - Decision Tree Classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the TF-IDF features.
# random_state is pinned (same value as the train/test split seed) because the
# tree permutes candidate features at every split; without a seed, repeated
# runs can produce different trees and therefore different predictions.
model_Dt = DecisionTreeClassifier(random_state=3)
model_Dt.fit(X_train_features, y_train)

In [26]:
from sklearn.metrics import accuracy_score

# Score the decision tree on the data it was trained on.
prediction_on_training_data = model_Dt.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [27]:
# Report the training accuracy computed in the previous cell.
print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  1.0


In [28]:
# Classify a prize-notification message with the decision-tree model.
input_your_mail = ["you have won prize of $10000"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model_Dt.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


In [29]:
# Classify a second prize-notification message with the decision-tree model.
input_your_mail = ["This is the second time we have tried to contact you. u have won prize $10000"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model_Dt.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


In [30]:
# Classify a conversational message with the decision-tree model.
input_your_mail = ["I have been calling to you since hour, where r u"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model_Dt.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [31]:
# Classify a multi-line contest/offer message with the decision-tree model.
input_your_mail = ["""Hello Friends!

Choose the correct answer for the given question and win big prize of trip to visit paris...

Q) Virat kohli plays for which team in IPL?

A. Royal challengers Banglore
B) Chennai super kings
C) Sunrises Hyderabad

if you submitted answer today before 6:00 pm
Bonus Special Offer also applied!

Coupan of worth Rupees 3000 will be receive =
20% Off & FREE SHIPPING!

Hurry up!!!"""]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model_Dt.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


# Model 3 - Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the TF-IDF features.
# random_state is pinned (same value as the train/test split seed) because the
# forest bootstraps rows and samples features at random; without a seed the
# fitted model, its accuracy and the pickled artefact change from run to run.
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_train_features, y_train)

In [33]:
from sklearn.metrics import accuracy_score

# Score the random forest on the data it was trained on.
prediction_on_training_data = rf_model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [34]:
# Report the training accuracy computed in the previous cell.
print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  1.0


In [35]:
# Score the random forest on the held-out test messages.
prediction_on_testing_data = rf_model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [36]:
# Report the test accuracy computed in the previous cell.
print("Accuracy on testing data : ", accuracy_on_testing_data)

Accuracy on testing data :  0.9757847533632287


In [37]:
# Classify a prize-notification message with the random-forest model.
input_your_mail = ["This is the second time we have tried to contact you. u have won prize $10000"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = rf_model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


In [38]:
# Classify a bank-details prize message with the random-forest model.
input_your_mail = ["Congratulations! you have won the prize coupan of rupees 1 Crore. Submit your Bank details given below link"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = rf_model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


In [39]:
# Classify a recruitment-style message with the random-forest model.
input_your_mail = ["Your profile has been shortlisted for ABC company. Our HR will contact you soon please check out mails and if you have any querries please ask"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = rf_model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [40]:
# Classify a data-usage notification with the random-forest model.
input_your_mail = ["90% of you data used on 1-1-2023. jio number 9876543210"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = rf_model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


# Model 4 : Support vector machine
## Support vector classifier

In [41]:
from sklearn.svm import SVC

# Fit a support-vector classifier with default settings on the TF-IDF features.
svm = SVC()
svm.fit(X=X_train_features, y=y_train)

In [42]:
# Predict labels for the held-out test messages with the SVC and display them.
y_pred = svm.predict(X=X_test_features)
y_pred

array([0, 1, 0, ..., 1, 1, 1])

In [43]:
from sklearn.metrics import accuracy_score

# Score the SVC on the data it was trained on.
prediction_on_training_data = svm.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [44]:
# Report the training accuracy computed in the previous cell.
print("Accuracy on Training data : ", accuracy_on_training_data)

Accuracy on Training data :  0.99798070450976


In [45]:
# Score the SVC on the held-out test messages.
prediction_on_testing_data = svm.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [46]:
# Report the test accuracy computed in the previous cell.
print("Accuracy on Testing data : ", accuracy_on_testing_data)

Accuracy on Testing data :  0.979372197309417


In [47]:
# Classify a prize-notification message with the SVC model.
input_your_mail = ["This is the second time we have tried to contact you. u have won prize $10000"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = svm.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[0]
spam


In [48]:
# Classify a recruitment-style message with the SVC model.
input_your_mail = ["Your profile has been shortlisted for MNC company. Register your details with the link provided in the message and wait for Our HR will contact you soon and if you have any querries please write us"]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = svm.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


# Model 5 : KNN - K nearest neighbour

In [49]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default settings on the TF-IDF features.
knn = KNeighborsClassifier()
knn.fit(X=X_train_features, y=y_train)

In [50]:
# Predict labels for the held-out test messages with KNN and display them.
y_pred = knn.predict(X=X_test_features)
y_pred

[WinError 2] The system cannot find the file specified
  File "C:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\ProgramData\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


array([0, 1, 0, ..., 1, 1, 1])

In [51]:
from sklearn.metrics import accuracy_score

# Score the KNN model on the data it was trained on.
prediction_on_training_data = knn.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [52]:
# Report the training accuracy computed in the previous cell.
print("Accuracy on Training data : ", accuracy_on_training_data)

Accuracy on Training data :  0.9201256450527261


In [53]:
# Score the KNN model on the held-out test messages.
prediction_on_testing_data = knn.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [54]:
# Report the test accuracy computed in the previous cell.
print("Accuracy on Testing data : ", accuracy_on_testing_data)

Accuracy on Testing data :  0.9094170403587444


In [55]:
# Classify a multi-line contest/offer message with the KNN model.
input_your_mail = ["""Hello Friends!

Choose the correct answer for the given question and win big prize of trip to visit paris...

Q) Virat kohli plays for which team in IPL?

A. Royal challengers Banglore
B) Chennai super kings
C) Sunrises Hyderabad

if you submitted answer today before 6:00 pm
Bonus Special Offer also applied!

Coupan of worth Rupees 3000 will be receive =
20% Off & FREE SHIPPING!

Hurry up!!!"""]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = knn.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [56]:
# Classify a short service notification with the KNN model.
input_your_mail = ["""your recharge is finished"""]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = knn.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


In [57]:
# Classify a lottery-prize message with the KNN model.
input_your_mail = ["""you won the lottery prize of $ 1,00,000. Please signin below provided link and fill bank details"""]
input_data_features = feature_extraction.transform(input_your_mail)
prediction = knn.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else as spam.
print('ham' if prediction[0] == 1 else 'spam')

[1]
ham


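# The models are predicting well
## Every algorithm works well, but the Random Forest and Decision Tree models score highest, with 100% accuracy on the training data (on the test data, Random Forest reaches about 97.6% and SVM about 97.9%)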

# Saving the trained model

In [58]:
import pickle

In [60]:
# Persist the fitted random-forest model to disk.
# The context manager closes the file once the dump finishes; the bare
# open(...) used previously never closed the handle explicitly.
filename = 'rf_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [61]:
# loading the saved model
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load files that this notebook (or another trusted source) produced.
# The context manager closes the file handle after reading.
with open('rf_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [65]:
# Persist the fitted TF-IDF vectorizer so saved models can be fed the same features.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(obj=feature_extraction, file=vectorizer_file)

In [62]:
# Persist the fitted decision-tree model.
# Fix: this cell previously pickled rf_model under the decision-tree
# filename, so dt_model.sav held a random forest. The context manager also
# closes the file handle.
filename = 'dt_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model_Dt, model_file)

In [63]:
# Persist the fitted k-nearest-neighbours model.
# Fix: this cell previously pickled rf_model under the KNN filename, so
# knn_model.sav held a random forest. The context manager also closes the
# file handle.
filename = 'knn_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(knn, model_file)

In [64]:
# Persist the fitted support-vector classifier.
# Fix: this cell previously pickled rf_model under the SVM filename, so
# svm_model.sav held a random forest. The context manager also closes the
# file handle.
filename = 'svm_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm, model_file)