# Importing Library 

In [3]:
import numpy as np  # Importing NumPy for numerical operations
import pandas as pd  # Importing Pandas for handling datasets
from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF Vectorizer for text feature extraction
from sklearn.linear_model import LogisticRegression  # Logistic Regression model for classification
from sklearn.metrics import accuracy_score  # Function to evaluate model performance

# Data Collection & Pre-Processing

In [5]:
# Loading the data from CSV file into a Pandas DataFrame
mail_df = pd.read_csv('mail_data.csv')  # Reading dataset from CSV file
mail_df  # Displaying the DataFrame

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Replacing null values with an empty string
mail_data = mail_df.where((pd.notnull(mail_df)), '')  # Handling missing values
# Display the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5572, 2)

# Start Label Encoding

In [9]:
# Label spam mail as 0 and ham mail as 1
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0  # Assigning 0 to spam emails
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1  # Assigning 1 to ham emails

### spam - 0
### ham - 1

In [11]:
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


# Separating the data into texts and labels

In [13]:
X = mail_data['Message']  # Feature: Email message content
y = mail_data['Category']  # Target: Spam (0) or Ham (1) labels

In [14]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [15]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

# Splitting the data into training data & test data

In [17]:
# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=3)  # Ensuring reproducibility

# Displaying the shape of the dataset and splits
X.shape, X_train.shape, X_test.shape  # Checking dimensions of original and split data

((5572,), (4457,), (1115,))

In [18]:
X_train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [19]:
y_test

2632    0
454     1
983     0
1282    1
4610    1
       ..
4827    1
5291    1
3325    1
3561    1
1136    1
Name: Category, Length: 1115, dtype: object

# Feature Extraction

In [21]:
# Transform the text data into feature vectors for model input
f_extra = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)  # TF-IDF Vectorizer with stop words removed
f_extra

In [22]:
X_train_features = f_extra.fit_transform(X_train)  # Fit and transform training data
X_test_features = f_extra.transform(X_test)  # Transform test data

In [23]:
X_train_features

<4457x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 34775 stored elements in Compressed Sparse Row format>

In [24]:
X_test_features

<1115x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 7687 stored elements in Compressed Sparse Row format>

In [25]:
# Convert target labels to integer type
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [26]:
y_train

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int32

In [27]:
y_test 

2632    0
454     1
983     0
1282    1
4610    1
       ..
4827    1
5291    1
3325    1
3561    1
1136    1
Name: Category, Length: 1115, dtype: int32

# Training the Model

In [29]:
# Initializing the Logistic Regression model
LR = LogisticRegression()
LR  # Displaying the model instance

In [30]:
# Training the Logistic Regression model with the training data
LR.fit(X_train_features, y_train)  # Fitting the model to the training data

# Evaluate the trained model.

In [32]:
# Prediction on training data
pred_train_data = LR.predict(X_train_features)  # Predicting labels for training data
accu_train_data = accuracy_score(y_train, pred_train_data)  # Calculating accuracy on training data

In [33]:
pred_train_data

array([1, 1, 1, ..., 1, 1, 0])

In [34]:
print('Accuracy on training data : ', accu_train_data)

Accuracy on training data :  0.9676912721561588


In [35]:
# Prediction on test data
pred_test_data = LR.predict(X_test_features)  # Predicting labels for test data
accu_test_data = accuracy_score(y_test, pred_test_data)  # Calculating accuracy on test data

In [36]:
pred_test_data

array([0, 1, 1, ..., 1, 1, 1])

In [37]:
print('Accuracy on test data : ', accu_test_data)  # Displaying test accuracy

Accuracy on test data :  0.9668161434977578


# Creat a Predictive System

In [39]:
# Making a prediction on a sample input email
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Convert text to feature vectors
input_features = f_extra.transform(input_mail)  # Transforming input email using trained vectorizer

# Making prediction
predict = LR.predict(input_features)  # Predicting whether the email is spam or ham
print(predict)  # Displaying the raw prediction result

# Interpreting and printing the result
if (predict[0] == 1):
    print('Ham mail ✅')  # Output if the email is classified as ham
else:
    print('Spam mail ❌')  # Output if the email is classified as spam

[1]
Ham mail ✅


In [40]:
# Making predictions on sample input emails
input_mail = ["Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence"]

# Convert text to feature vectors
input_features = f_extra.transform(input_mail)  # Transforming input emails using trained vectorizer

# Making predictions
predict = LR.predict(input_features)  # Predicting whether the emails are spam or ham
print(predict)  # Displaying the raw prediction results

# Interpreting and printing the result
if (predict[0] == 1):
    print('Ham mail ✅')  # Output if the email is classified as ham
else:
    print('Spam mail ❌')  # Output if the email is classified as spam

[0]
Spam mail ❌


In [41]:
# Save