# Loading the Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Collection and Preprocessing

### Loading the Data 

In [2]:
data=pd.read_csv("mail_data.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


### Checking the shape

In [4]:
data.shape

(5572, 2)

In [5]:
data.columns

Index(['Category', 'Message'], dtype='object')

### Checking the missiing values

In [6]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
data.isnull().sum().sum()

0

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


**Insights**
- There is no missing data in the dataset

### Label Encoding 
ham  → 1

spam →0


In [24]:
data.replace({'Category':{'ham':1, 'spam':0,}}, inplace = True)

  data.replace({'Category':{'ham':1, 'spam':0,}}, inplace = True)


In [25]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Split the data into target and input 

In [26]:
X=data["Message"]
y=data["Category"]

In [27]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [29]:
y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: int64

In [43]:
y.dtype

dtype('int64')

### Split the data into training and Testing dataset

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [38]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


In [39]:
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(5572,)
(4457,)
(1115,)


## Feature Extraction

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [44]:
X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)

In [45]:
print(X_train)

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 4457, dtype: object


In [46]:
print(X_train_features)

  (0, 5512)	0.1898892037332199
  (0, 7222)	0.2173884735352799
  (0, 258)	0.2379428657041507
  (0, 7162)	0.2550284465664535
  (0, 354)	0.3544175987866074
  (0, 2724)	0.3544175987866074
  (0, 7300)	0.24288153842988894
  (0, 2049)	0.3034375179183143
  (0, 3262)	0.33791755486732394
  (0, 5800)	0.17558937755823417
  (0, 6264)	0.1898892037332199
  (0, 694)	0.3171299579602537
  (0, 2497)	0.2442158912653505
  (0, 5818)	0.22682143517864364
  (1, 3267)	0.26787130770292167
  (1, 6109)	0.32397626344658004
  (1, 6738)	0.28986069568917994
  (1, 2335)	0.21623212751660786
  (1, 5650)	0.3604441444703179
  (1, 3333)	0.20665394084233094
  (1, 2440)	0.3387054464839871
  (1, 4509)	0.40282459910606705
  (1, 3932)	0.24325511357721422
  (1, 3804)	0.19029023465152678
  (1, 2555)	0.3840709491751003
  :	:
  (4452, 3084)	0.22948428918295163
  (4452, 3290)	0.26370969643076225
  (4452, 3978)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 2438)	0.4574160733416501
  (4452, 7264)	0.479751063610169
  (44

# Training the Model
### Logistic regreesion

In [48]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

In [49]:
model.fit(X_train_features,y_train)

In [59]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train metrics
X_train_pred = model.predict(X_train_features)

print("Train Accuracy :", accuracy_score(y_train, X_train_pred))
print("Train Precision:", precision_score(y_train, X_train_pred))
print("Train Recall   :", recall_score(y_train, X_train_pred))
print("Train F1 Score :", f1_score(y_train, X_train_pred))


# Test metrics
X_test_pred = model.predict(X_test_features)

print("Test Accuracy  :", accuracy_score(y_test, X_test_pred))
print("Test Precision :", precision_score(y_test, X_test_pred))
print("Test Recall    :", recall_score(y_test, X_test_pred))
print("Test F1 Score  :", f1_score(y_test, X_test_pred))


Train Accuracy : 0.9670181736594121
Train Precision: 0.964232116058029
Train Recall   : 0.9989634620367971
Train F1 Score : 0.9812905689194349
Test Accuracy  : 0.967713004484305
Test Precision : 0.9640718562874252
Test Recall    : 1.0
Test F1 Score  : 0.9817073170731707


# Model Performance Summary

| Metric    | Train Data | Test Data |
| --------- | ---------- | --------- |
| Accuracy  | 0.9670     | 0.9677    |
| Precision | 0.9642     | 0.9641    |
| Recall    | 0.9990     | 1.0000    |
| F1 Score  | 0.9813     | 0.9817    |


# Hyperparameter tuning

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define model
lr = LogisticRegression(max_iter=1000)

# Hyperparameters to tune
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}

# Grid Search
grid = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

# Fit on training data
grid.fit(X_train_features, y_train)

# Best model
best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)

Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train performance
train_pred = best_model.predict(X_train_features)
print("Train Accuracy :", accuracy_score(y_train, train_pred))
print("Train Precision:", precision_score(y_train, train_pred))
print("Train Recall   :", recall_score(y_train, train_pred))
print("Train F1 Score :", f1_score(y_train, train_pred))

# Test performance
test_pred = best_model.predict(X_test_features)
print("Test Accuracy  :", accuracy_score(y_test, test_pred))
print("Test Precision :", precision_score(y_test, test_pred))
print("Test Recall    :", recall_score(y_test, test_pred))
print("Test F1 Score  :", f1_score(y_test, test_pred))

Train Accuracy : 1.0
Train Precision: 1.0
Train Recall   : 1.0
Train F1 Score : 1.0
Test Accuracy  : 0.9901345291479821
Test Precision : 0.9897435897435898
Test Recall    : 0.9989648033126294
Test F1 Score  : 0.994332818134982


# Model Performance after hyperparameter tuning

| Metric    | Train Data | Test Data  |
| --------- | ---------- | ---------- |
| Accuracy  | **1.0000** | **0.9901** |
| Precision | **1.0000** | **0.9897** |
| Recall    | **1.0000** | **0.9990** |
| F1 Score  | **1.0000** | **0.9943** |


### Testing the model

In [69]:
def predict_email(email, vectorizer, model):
    email_features = vectorizer.transform([email])
    
    prediction = model.predict(email_features)[0]
    
    
    if prediction == 1:
        result = "Ham"
          # spam = 0
    else:
        result = "Spam"
        
    
    return result

In [71]:
email = "Congratulations! You have won a free lottery prize"

result= predict_email(email, feature_extraction, model)

print("Prediction:", result)


Prediction: Spam


In [72]:
email = "Yup next stop."

result = predict_email(email, feature_extraction, model)

print("Prediction:", result)

Prediction: Ham
