In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [16]:
# Load the mail dataset from the CSV file, decoding it as cp1252
raw_mail_data = pd.read_csv('mail_data.csv', encoding='cp1252')


In [17]:
print(raw_mail_data)

         v                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [18]:
# Replace missing (null) values with empty strings
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [19]:
# Print 5 rows
mail_data.head()

Unnamed: 0,v,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# Dataset dimensions as (rows, columns)
mail_data.shape

(5572, 2)

In [31]:
print(mail_data.columns)


Index(['v', 'Message'], dtype='object')


In [33]:
# Encode the label column numerically: spam -> 0, ham -> 1
mail_data.loc[mail_data['v'].eq('spam'), 'v'] = 0
mail_data.loc[mail_data['v'].eq('ham'), 'v'] = 1

# Cast the encoded label column to an integer dtype
mail_data['v'] = mail_data['v'].astype('int')


In [44]:
# Separate the data: message text is the feature, the encoded label is the target
X, Y = mail_data['Message'], mail_data['v']

In [35]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [45]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v, Length: 5572, dtype: int64


In [46]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [47]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [48]:
# Transform the message text into numeric feature vectors.
# TfidfVectorizer assigns a TF-IDF weight to each word in each message.
# min_df=1 keeps every term that appears in at least one message (no rare-term filtering).
# stop_words='english' ignores common English words such as "is", "the", "was", "they".
# lowercase=True converts all text to lowercase before tokenizing.
feature_extration=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [49]:
# Ensure the targets are integer typed (they already print as int64, so this is a safeguard)
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# Fit the vectorizer on the training messages only, then reuse it for the test messages
X_train_features = feature_extration.fit_transform(X_train)
X_test_features = feature_extration.transform(X_test)

In [50]:
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.4131033779433779
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.3161058676607886
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.3303699595553702
  (1, 1991)	0.3303699595553702
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587516
  (3, 2870)	0.5864269879324768
  (3, 7415)	0.8100020912469564
  (4, 50)	0.23633754072626947
  (4, 5497)	0.15743785051118359
  :	:
  (4454, 4602)	0.26697657324453916
  (4454, 3142)	0.3201445167776316
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.3544154551183794
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4455

In [51]:
model=LogisticRegression()

In [52]:
model.fit(X_train_features,Y_train)

In [53]:
# Predict on the training data and measure accuracy
prediction_on_training_data=model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)
print(accuracy_on_training_data)

0.9670181736594121


In [54]:
# Predict on the test data and measure accuracy
prediction_on_test_data=model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)
print(accuracy_on_test_data)

0.9659192825112107


In [55]:
input_main=["You'll not rcv any more msgs from the chat svc. For FREE Hardcore services text GO to: 69988 If u get nothing u must Age Verify with yr network & try again"]

# Convert the raw text into TF-IDF features with the already-fitted vectorizer
input_data_features=feature_extration.transform(input_main)

# Predict one label per message (0 = spam, 1 = ham, matching the label encoding)
prediction=model.predict(input_data_features)

# Report each prediction individually. Iterating avoids relying on the truth
# value of an array, which raises if input_main holds more than one message.
for predicted_label in prediction:
    if predicted_label == 0:
        print("SPAM MAIL")
    else:
        print("HAM MAIL")

SPAM MAIL


In [56]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import datetime

# ---- Data collection & pre-processing ----
# Read the CSV file (decoded as cp1252) and replace missing values with empty strings
raw_mail_data = pd.read_csv('mail_data.csv', encoding='cp1252')
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

# Encode labels (spam -> 0, ham -> 1) and cast the column to an integer dtype
mail_data.loc[mail_data['v'].eq('spam'), 'v'] = 0
mail_data.loc[mail_data['v'].eq('ham'), 'v'] = 1
mail_data['v'] = mail_data['v'].astype(int)

# Message text is the feature; the encoded label is the target
X, Y = mail_data['Message'], mail_data['v']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

# ---- Feature extraction: text -> TF-IDF vectors ----
# The vectorizer is fitted on the training messages only and reused for the test messages
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Ensure the targets are integer typed
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# ---- Model training: logistic regression ----
model = LogisticRegression().fit(X_train_features, Y_train)

# ---- Evaluation on both splits ----
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print(f"Accuracy on training data: {accuracy_on_training_data}")
print(f"Accuracy on test data: {accuracy_on_test_data}")

# ---- Persistence ----
# Save the trained model and the fitted vectorizer to their own .pkl files
for artifact_path, artifact in (
    ('spam_classifier_model.pkl', model),
    ('tfidf_vectorizer.pkl', feature_extraction),
):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

# Bundle model, vectorizer and metadata (version, timestamp, test accuracy) in one file
model_info = dict(
    [
        ('model', model),
        ('vectorizer', feature_extraction),
        ('version', '1.0'),
        ('training_date', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ('accuracy', accuracy_on_test_data),
    ]
)
with open('spam_classifier_complete.pkl', 'wb') as file:
    pickle.dump(model_info, file)

print("Model and vectorizer saved successfully!")

# Building a predictive system
def predict_email(input_mail, vectorizer=None, classifier=None):
    """Classify a single message as spam or ham.

    Parameters
    ----------
    input_mail : str
        Raw text of the message to classify.
    vectorizer : optional
        Fitted object exposing ``transform(list_of_texts)``. Defaults to the
        notebook-level ``feature_extraction`` vectorizer.
    classifier : optional
        Fitted object exposing ``predict(features)``. Defaults to the
        notebook-level ``model``.

    Returns
    -------
    str
        ``"SPAM MAIL"`` when the predicted label is 0, otherwise ``"HAM MAIL"``.
    """
    # Fall back to the notebook globals only when no explicit objects are given,
    # so the function also works with a model/vectorizer loaded from the .pkl files.
    vectorizer = feature_extraction if vectorizer is None else vectorizer
    classifier = model if classifier is None else classifier

    # The vectorizer expects an iterable of documents, hence the one-element list
    input_data_features = vectorizer.transform([input_mail])

    # One prediction per document; only the first (and only) one is relevant
    prediction = classifier.predict(input_data_features)

    return "SPAM MAIL" if prediction[0] == 0 else "HAM MAIL"

# Try the predictive system on a few hand-written sample messages
test_messages = [
    "You'll not rcv any more msgs from the chat svc. For FREE Hardcore services text GO to: 69988 If u get nothing u must Age Verify with yr network & try again",
    "Hey, are we still meeting for lunch tomorrow?",
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize now!",
    "Hi, just checking in to see how you're doing. Let's catch up soon."
]

# Number the samples from 1 and print each verdict followed by the message itself
for test_number, sample_text in enumerate(test_messages, start=1):
    verdict = predict_email(sample_text)
    print(f"Test {test_number}: {verdict}")
    print(f"Message: {sample_text}\n")

Accuracy on training data: 0.9670181736594121
Accuracy on test data: 0.9659192825112107
Model and vectorizer saved successfully!
Test 1: SPAM MAIL
Message: You'll not rcv any more msgs from the chat svc. For FREE Hardcore services text GO to: 69988 If u get nothing u must Age Verify with yr network & try again

Test 2: HAM MAIL
Message: Hey, are we still meeting for lunch tomorrow?

Test 3: SPAM MAIL
Message: Congratulations! You've won a $1000 Walmart gift card. Click here to claim your prize now!

Test 4: HAM MAIL
Message: Hi, just checking in to see how you're doing. Let's catch up soon.

