In [79]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn

In [81]:
# Record the library versions this notebook was run with, one per line.
print(
    *(
        f"{label} {library.__version__}"
        for label, library in (
            ("pandas:", pd),
            ("numpy:", np),
            ("sklearn:", sklearn),
            ("matplotlib:", matplotlib),
        )
    ),
    sep="\n",
)

pandas: 2.2.2
numpy: 1.26.4
sklearn: 1.4.2
matplotlib: 3.8.4


In [2]:
# Load the labelled messages from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Pre-Processing

In [5]:
# Check the Category and Message columns for null values

In [6]:
# True if at least one entry in the Category column is missing.
is_Null1 = dataset['Category'].isna().any()

In [7]:
# True if at least one entry in the Message column is missing.
is_Null2 = dataset['Message'].isna().any()

In [8]:
# Show both null-check flags side by side (Category first, then Message).
print(f"{is_Null1} {is_Null2}")

False False


In [9]:
# Shape of the dataset as (rows, columns)

In [10]:
# (number of rows, number of columns) of the loaded frame.
(len(dataset.index), len(dataset.columns))

(5572, 2)

## Label Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Encode the text labels in 'Category' as integers (ham -> 0, spam -> 1).
#
# The column is overwritten in place, so this cell must not be allowed to
# refit on its own output: re-running it would fit the encoder on the
# already-encoded integers and replace the learned 'ham'/'spam' classes with
# 0/1 in the encoder object that is pickled later in the notebook.
# The encoder is therefore (re)created only when it does not exist yet or
# when the column still holds the raw, non-numeric labels.
if 'le' not in globals() or not pd.api.types.is_numeric_dtype(dataset['Category']):
    le=LabelEncoder()
    dataset['Category']=le.fit_transform(dataset['Category'])

In [14]:
# Preview the first five rows again to confirm the labels are now integers.
dataset.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Label encoding: spam = 1, ham = 0

In [16]:
# Separate the raw message text (model input) from the encoded label (target).
input_data, output_data = dataset['Message'], dataset['Category']

## Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Sizes of the training and test partitions, in that order.
print(f"{x_train.shape} {x_test.shape}")

(4457,) (1115,)


# Feature Extraction

In [22]:
# map the text data to feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [24]:
# Fit the vectorizer on the training messages only, then vectorize them.
x_train_feature = feature_extraction.fit_transform(x_train)

# Vectorize the test messages with the already-fitted vectorizer (no refit).
x_test_feature = feature_extraction.transform(x_test)

In [25]:
# Column index of the TF-IDF feature whose term we want to inspect.
TERM_INDEX = 5818

# Get the vocabulary dictionary (term to index mapping)
vocab = feature_extraction.vocabulary_

# get_feature_names_out() returns the terms ordered by their column index, so
# the term can be read directly by position instead of scanning every
# (term, index) pair of the vocabulary dictionary.
feature_names = feature_extraction.get_feature_names_out()

# Fail with a descriptive IndexError (the same exception type the previous
# list-based lookup raised) when the index is not part of the vocabulary.
if not 0 <= TERM_INDEX < len(feature_names):
    raise IndexError(
        f"Index {TERM_INDEX} is outside the vocabulary (size {len(feature_names)})"
    )

# Find the term corresponding to TERM_INDEX
term = feature_names[TERM_INDEX]

print(f"Term at index {TERM_INDEX}: {term}")


Term at index 5818: service


# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
lr = LogisticRegression()

In [29]:
# Train the classifier on the TF-IDF features of the training messages.
lr.fit(X=x_train_feature, y=y_train)

In [65]:
# Mean accuracy on each partition, reported as a percentage.
print("Training Accuracy:", 100 * lr.score(X=x_train_feature, y=y_train))
print("Testing Accuracy:", 100 * lr.score(X=x_test_feature, y=y_test))

Training Accuracy: 96.70181736594121
Testing Accuracy: 96.7713004484305


## Predict the output

In [32]:
# A single example message to classify, wrapped in a list because the
# vectorizer is given an iterable of documents.
input_mail = [
    "New TEXTBUDDY Chat 2 horny guys in ur area 4 just 25p "
    "Free 2 receive Search postcode or at gaytextbuddy.com. "
    "TXT ONE name to 89693. 08715500022 rpl Stop 2"
]

In [33]:
# Vectorize the example message with the vectorizer fitted on the training set.
input_mail_feature = feature_extraction.transform(raw_documents=input_mail)

In [34]:
# Predicted encoded label for the example message.
lr.predict(X=input_mail_feature)

array([1])

In [83]:
# Exporting the model
import pickle

# Write the trained classifier to 'lr.pkl'. The context manager guarantees
# the file handle is flushed and closed as soon as the dump finishes, instead
# of leaving an anonymous open handle for the garbage collector to clean up.
with open('lr.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [87]:
# Persist the fitted label encoder and TF-IDF vectorizer, each to its own
# pickle file, in that order.
for artifact, filename in (
    (le, 'Label_Encoder.pkl'),
    (feature_extraction, 'Feature_Extraction.pkl'),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)