In [None]:
##import numpy as np: Imports NumPy for numerical operations (like handling arrays).
##import pandas as pd: Imports Pandas for data manipulation (like working with tables/dataframes).
##from sklearn.model_selection import train_test_split: Imports a function to split your data into training and testing sets.
##from sklearn.feature_extraction.text import TfidfVectorizer: Imports a tool to convert text into numerical features using TF-IDF.
##from sklearn.linear_model import LogisticRegression: Imports the Logistic Regression model for classification.
##from sklearn.metrics import accuracy_score: Imports a function to calculate the accuracy of your model.

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [10]:
# Load the labelled message dataset into a DataFrame; the path is relative to
# the notebook's working directory.
df=pd.read_csv('mail_data.csv')

In [None]:
##This line reads your email dataset from a file named "mail_data.csv" and stores it in a Pandas DataFrame called df

In [11]:
# Plain-text dump of the raw frame (the recorded output shows pandas eliding
# the middle rows and truncating long messages).
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [None]:
##This line displays the contents of your email dataset (the df DataFrame) in your console. You'll see the rows and columns of your data,
##showing the emails and their labels (spam or ham)

In [12]:
# Build `data` as a new frame from `df` in which every missing cell becomes an
# empty string; cells that hold a value are passed through unchanged.
data = df.where(df.notna(), other='')

In [None]:
##This line handles missing values in your dataset. It replaces any "null" (empty) cells in your DataFrame df with empty strings (''). 
##This ensures your data is clean and consistent for further processing.

In [16]:
# Preview the first ten rows of the cleaned frame.
data.head(n=10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [None]:
##This displays the first 10 rows of your cleaned email data (data). 
##It's a quick way to inspect the data and make sure it looks as expected after handling missing values
## To show a different number of rows, change the number in the brackets. With nothing in the brackets, head() shows only the first 5 rows (not the whole dataset). As an example, the first 10 rows are shown here.

In [17]:
# Summarise the cleaned frame: column names, non-null counts, dtypes and
# memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
##This line provides a summary of your DataFrame data. It shows:
        ##The number of rows and columns.
        ##The data types of each column (e.g., text, numbers).
        ##The number of non-null values in each column.
##It's a quick way to get an overview of your data's structure and any potential issues.

In [19]:
# (rows, columns) of the cleaned frame.
data.shape

(5572, 2)

In [None]:
##This line tells you the dimensions of your dataset (the data DataFrame).
##It outputs a tuple: (number of rows, number of columns). This helps you quickly understand the size of your data.

In [20]:
# Encode the text labels in place on `data`: spam -> 0, ham -> 1.
# Rows whose Category is neither value are left untouched.
data.loc[data['Category'].eq('spam'), 'Category'] = 0
data.loc[data['Category'].eq('ham'), 'Category'] = 1

In [None]:
##These two lines convert the email category labels ("spam" and "ham") into numerical values:
        ##"spam" is replaced with 0.
        ##"ham" is replaced with 1.
##This is necessary because machine learning models work with numbers, not text.

In [21]:
# Features (message text) and targets (encoded category) as two Series taken
# from the same frame, so they share one index.
X, Y = data['Message'], data['Category']

In [None]:
##These lines separate your data into features (X) and labels (Y):
       ##X = data['Message']: X now contains the email messages (the text we'll use to predict spam).
       ##Y = data['Category']: Y now contains the corresponding labels (0 for spam, 1 for ham).

In [22]:
# Show the message Series that will be fed to the vectorizer.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
##This line prints the email messages (the X variable) to your console.
##You'll see the raw text of the emails, which will be used as input for your spam detection model.

In [23]:
# Show the encoded labels (0 = spam, 1 = ham); the recorded output reports
# the Series as object dtype at this point.
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [None]:
##This line prints the email category labels (the Y variable) to your console. 
##You'll see a series of 0s and 1s, representing "spam" and "ham" respectively, corresponding to each email in your dataset.

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
##This line splits your data into training and testing sets:
        ##X_train, Y_train: Used to train your model.
        ##X_test, Y_test: Used to evaluate how well your model performs on unseen data.
        ##test_size=0.2: 20% of your data is used for testing, 80% for training.
        ##random_state=3: Ensures the split is the same every time you run the code, for reproducibility.

In [25]:
# Sanity-check the split: full, training and test feature shapes, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [None]:
##These lines print the shapes (dimensions) of your data:
         ##print(X.shape): Shows the shape of the original email messages (all data).
         ##print(X_train.shape): Shows the shape of the training email messages.
         ##print(X_test.shape): Shows the shape of the testing email messages.
##This helps you verify that the data has been split correctly and see the sizes of your training and testing sets.

In [26]:
# Same check for the labels: full, training and test label shapes, one per line.
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [None]:
##These lines print the shapes (dimensions) of your email category labels:
          ##print(Y.shape): Shows the shape of the original labels (all data).
          ##print(Y_train.shape): Shows the shape of the training labels.
          ##print(Y_test.shape): Shows the shape of the testing labels.
##This ensures that the labels have been split consistently with the email messages.

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: lower-cases the text and drops English stop words;
# min_df=1 keeps every term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then reuse that same
# fitted vectorizer to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the labels to an integer dtype before they are handed to the model.
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)

In [None]:
##This code converts the email text into numerical features using TF-IDF and ensures the labels are integers:
      ##TfidfVectorizer(...):
            ##Creates a tool to convert text to numbers using TF-IDF.
            ##min_df=1: Keeps every term that appears in at least 1 document, so no terms are dropped (raise this value to ignore rare terms).
            ##stop_words='english': Removes common English words ("the," "a," etc.).
            ##lowercase=True: Converts all text to lowercase.
      ##feature_extraction.fit_transform(X_train):
            ##Learns the vocabulary and TF-IDF weights from the training emails (X_train) and transforms them into numerical features (X_train_features).
      ##feature_extraction.transform(X_test):
            ##Uses the already learned vocabulary and weights to transform the testing emails (X_test) into numerical features (X_test_features). 
            ##This is very important. You don't want to refit the vectorizer on test data.
      ##Y_train = Y_train.astype('int') & Y_test = Y_test.astype('int'):
            ##Converts the email category labels (Y_train, Y_test) to integers (0 or 1), which is necessary for the machine learning model.

In [34]:
# Text form of the training feature matrix: the recorded output lists
# (row, column) index pairs, each followed by its stored TF-IDF value.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [None]:
##This line prints the numerical representation of your training email messages (X_train_features).
##You'll see a sparse matrix, where each row represents an email, and each column represents a word. 
##The values in the matrix are the TF-IDF scores, showing how important each word is to each email.

In [35]:
# Unfitted logistic-regression classifier; no arguments are passed, so the
# library's default hyperparameters apply.
model = LogisticRegression()

In [None]:
##This line creates an instance of the Logistic Regression model, ready to be trained on your data.

In [36]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(X=X_train_features, y=Y_train)

In [None]:
##This line trains your Logistic Regression model:
          ##It uses the numerical features of the training emails (X_train_features) and 
          ##their corresponding labels (Y_train) to learn the patterns that distinguish spam from ham.

In [37]:
# Predict on the same rows the model was fitted on and score those predictions
# against the training labels.
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
##These lines evaluate the model's performance on the training data:
           ##prediction_on_training_data = model.predict(X_train_features): 
                       ##The model predicts the labels for the training emails.
           ##accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data):  
                       ##It calculates how often the model's predictions match the actual training labels, giving you the training accuracy.

In [38]:
# Report the accuracy computed on the training split. The label names the
# metric being printed ("Accuracy"); re-run the cell to refresh its output.
print("Accuracy on training data :", accuracy_on_training_data)

Account on training data : 0.9670181736594121


In [None]:
##This line prints the accuracy of your model on the training data.
##It tells you how well the model learned to classify spam and ham emails from the data it was trained on.

In [39]:
# Predict on the held-out test features and score those predictions against
# the test labels.
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
##These lines evaluate the model's performance on the unseen test data:
            ##prediction_on_test_data = model.predict(X_test_features): 
                     ##The model predicts the labels for the test emails.
            ##accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data): 
                     ##It calculates how often the model's predictions match the actual test labels, giving you the test accuracy. 
                     ##This is the crucial metric for evaluating how well your model generalizes.

In [40]:
# Report the accuracy computed on the held-out test split. The label names the
# metric being printed ("Accuracy"); re-run the cell to refresh its output.
print("Accuracy on test data :", accuracy_on_test_data)

Account on test data : 0.9659192825112107


In [None]:
##This line prints the accuracy of your model on the test data. This is the most important metric.
##It shows how well your spam detection model performs on emails it hasn't seen before, giving you a realistic idea of its real-world effectiveness.

In [45]:
# Classify one sample message (noted by the author as taken from
# mail_data.csv). transform() expects an iterable of documents, so the single
# message is wrapped in a list.
input_mail = ["U dun say so early hor... U c already then say..."]

# Reuse the already-fitted vectorizer and model; nothing is re-fitted here.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = ham, anything else is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")


[1]
Ham mail


In [48]:
# Classify a second sample message (noted by the author as taken from
# mail_data.csv), again wrapped in a list because transform() expects an
# iterable of documents.
input_mail = [
    "URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"
]

# Reuse the already-fitted vectorizer and model; nothing is re-fitted here.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = ham, anything else is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

[0]
Spam mail


In [None]:
##This code snippet takes a new email, converts it to numerical features, uses your trained model to predict if it's spam or ham, 
##and then prints the result.

##Breakdown:

##input_mail = ["U dun say so early hor... U c already then say..."]:
         ##Creates a list containing the email you want to classify.
##input_data_features = feature_extraction.transform(input_mail):
         ##Uses the same TF-IDF vectorizer you trained earlier to convert the input email into numerical features.
##prediction = model.predict(input_data_features):
         ##Uses your trained Logistic Regression model to predict whether the email is spam (0) or ham (1).
##print(prediction):
         ##Prints the prediction (0 or 1).
##if (prediction[0] == 1): ... else: ...:
    ##Checks the prediction:
         ##If it's 1, it prints "Ham mail."
         ##If it's 0, it prints "Spam mail."