In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score , confusion_matrix ,classification_report

In [3]:
 mail=pd.read_csv('mail_data.csv')

In [4]:
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
mail.shape

(5572, 2)

In [6]:
mail['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
le = LabelEncoder()

In [14]:
le.fit(mail['Category'])

LabelEncoder()

In [15]:
mail['Category']=le.transform(mail['Category'])

In [16]:
mail

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [17]:
mail['Category'].value_counts()

0    4825
1     747
Name: Category, dtype: int64

In [18]:
ham = mail[mail['Category'] == 0]

In [19]:
spam = mail[mail['Category'] == 1]

In [20]:
ham_s = ham.sample(n=747)

In [21]:
ham_s

Unnamed: 0,Category,Message
3569,0,Sent me de webadres for geting salary slip
3545,0,Lol ok ill try to send. Be warned Sprint is de...
4071,0,Ok i wont call or disturb any one. I know all ...
1383,0,Its ok my arm is feeling weak cuz i got a shot...
283,0,Ok. I asked for money how far
...,...,...
58,0,Tell where you reached
4876,0,I know dat feelin had it with Pete! Wuld get w...
1783,0,My uncles in Atlanta. Wish you guys a great se...
2210,0,"Hmm well, night night"


In [22]:
new_data_set = pd.concat([ham_s,spam])

In [23]:
new_data_set

Unnamed: 0,Category,Message
3569,0,Sent me de webadres for geting salary slip
3545,0,Lol ok ill try to send. Be warned Sprint is de...
4071,0,Ok i wont call or disturb any one. I know all ...
1383,0,Its ok my arm is feeling weak cuz i got a shot...
283,0,Ok. I asked for money how far
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [41]:
X = new_data_set['Message']

In [42]:
Y = new_data_set['Category']

In [43]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [44]:
tfidf = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_features = tfidf.fit_transform(x_train)  # Transforming the training data
x_test_features = tfidf.transform(x_test) 

In [45]:
model = LogisticRegression()
model.fit(x_train_features, y_train)

LogisticRegression()

In [46]:
y_pred = model.predict(x_test_features)

In [47]:
accuracy_score(y_test,y_pred)

0.9431438127090301

In [48]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[147   3]
 [ 14 135]]
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       150
           1       0.98      0.91      0.94       149

    accuracy                           0.94       299
   macro avg       0.95      0.94      0.94       299
weighted avg       0.95      0.94      0.94       299



In [61]:
def predict_message(message):
    # Transform the message using the same TF-IDF vectorizer
    message_features = tfidf.transform([message])  # Remember to transform as a list
    
    # Predict the category (0 for ham, 1 for spam)
    prediction = model.predict(message_features)[0]
    
    # Convert the numerical prediction back to spam/ham label
    if prediction == 0:
        return "Ham"
    else:
        return "Spam"

# Test the function on a random message
sample_message = "Hello you won 400 dollors!"
result = predict_message(sample_message)
print(f"Message: '{sample_message}' is predicted as: {result}")

Message: 'Hello you won 400 dollors!' is predicted as: Spam
