In [40]:
#import numpy library
import numpy as np

#import pandas library
import pandas as pd

#for splitting the dataset, import train_test_split from the scikit-learn library
from sklearn.model_selection import train_test_split

#import the TfidfVectorizer class from scikit-learn for text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#import the logistic regression model used in this project
from sklearn.linear_model import LogisticRegression

#import accuracy_score for evaluating the accuracy of the machine learning model
from sklearn.metrics import accuracy_score

In [41]:
# Load the labelled messages used to train and test the model.
# The path is relative, so 'mail_data.csv' must be in the notebook's working directory.
df = pd.read_csv('mail_data.csv')

In [42]:
# Display the raw data.
# In the Category column, 'ham' marks a normal message and 'spam' marks a spam message.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [43]:
# Build a cleaned copy of the raw frame: every cell that is missing (NaN)
# is replaced by an empty string, all other cells are kept as they are.
# `df` itself is left unchanged.
data = df.where(df.notna(), other='')

In [44]:
# Preview the first rows of the cleaned frame
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [45]:
# Summarise the cleaned frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [46]:
# (number of rows, number of columns) of the cleaned frame
data.shape

(5572, 2)

In [47]:
# Encode the text labels in the 'Category' column as numbers: spam -> 0, ham -> 1.
#
# The mapping is applied in one pass and the column is replaced as a whole,
# instead of writing integers into selected rows of the text column with .loc
# (which leaves the column with the generic 'object' dtype).
#
# Any value that is not 'spam' or 'ham' is passed through unchanged, so:
#   - re-running this cell when the labels are already 0/1 changes nothing;
#   - unexpected labels stay visible in the column rather than being dropped.
label_codes = {'spam': 0, 'ham': 1}
data['Category'] = data['Category'].map(lambda label: label_codes.get(label, label))

In [48]:
# Select the model input and the target:
#   x - independent variable: the message text
#   y - dependent variable:   the category label that is predicted from the message
x, y = data['Message'], data['Category']

In [49]:
# Inspect the message texts (the model input)
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [50]:
# Inspect the encoded category labels (the model target)
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [51]:
# Split messages and labels into a training part and a test part.
# test_size=0.2 holds out 20% of the rows for testing;
# a fixed random_state makes the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [52]:
# Sanity check of the split sizes for the messages: full set, training part, test part
for messages in (x, x_train, x_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


In [53]:
# Sanity check of the split sizes for the labels: full set, training part, test part
for labels in (y, y_train, y_test):
    print(labels.shape)

(5572,)
(4457,)
(1115,)


In [54]:
# Turn the message text into TF-IDF features.
#   min_df=1             -> keep every term that occurs in at least one document
#   stop_words='english' -> drop very common English words ("and", "the", "in", ...)
#   lowercase=True       -> convert the text to lowercase before vectorization
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages and encode them in one step
x_train_features = feature_extraction.fit_transform(x_train)

# Encode the test messages with the same, already fitted vectorizer
x_test_features = feature_extraction.transform(x_test)

# Cast the labels of both splits to an integer dtype
# (the label Series is stored with the generic 'object' dtype up to this point)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [55]:
# Inspect the raw training messages (row labels are the original row numbers, shuffled by the split)
print(x_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [56]:
# Inspect the TF-IDF training matrix.
# Each printed line is a (row, column) index pair followed by the weight stored at that position.
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [57]:
# Create the logistic regression classifier (no arguments passed, so library defaults apply)
model = LogisticRegression()

In [58]:
# Train the classifier on the TF-IDF features of the training messages and their integer labels
model.fit(x_train_features, y_train)

In [59]:
# Predict the labels of the training messages with the fitted model
# and measure how many of them match the true training labels.
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [60]:
# Fraction of training messages that the model classifies correctly
print(accuracy_on_training_data)

0.9670181736594121


In [61]:
# Predict the labels of the held-out test messages
# and measure how many of them match the true test labels.
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [62]:
# Fraction of held-out test messages that the model classifies correctly
print(accuracy_on_test_data)

0.9659192825112107


In [63]:
# Classify a single message: put any message text in this one-element list
input_data = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Encode the message with the already fitted vectorizer (transform only, no re-fitting)
input_data_features = feature_extraction.transform(input_data)

# The model returns one predicted label per input message
prediction = model.predict(input_data_features)

# A predicted label of 1 is reported as 'ham', any other label as 'spam'
print('ham' if prediction[0] == 1 else 'spam')

spam
