In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [7]:
data = pd.read_csv(r"C:\Users\HP\Documents\csv\spam_ham.csv")

In [9]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [13]:
# Converting 'spam' as 1(True) and 'ham' as 0(False)  - IF NEEDED -
data.loc[data['Category'] == 'spam', 'Category'] = 1
data.loc[data['Category'] == 'ham', 'Category'] = 0
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...
5571,0,Rofl. Its true to its name


In [17]:
print(data['Category'].value_counts())

Category
0                 4825
1                  747
{"mode":"full"       1
Name: count, dtype: int64


In [21]:
# Balancing the values to avoid bias
from sklearn.utils import resample

spam = data[data['Category'] == 1]
ham = data[data['Category'] == 0]

# Undersample ham to match the number of ham messages
ham_downsampled = resample(ham, replace=False, n_samples=len(spam), random_state=42)

# Combine balanced data
balanced_data = pd.concat([spam, ham_downsampled])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data['Category'].value_counts())

Category
0    747
1    747
Name: count, dtype: int64


In [23]:
X = balanced_data["Message"]
Y = balanced_data["Category"]

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [27]:
X_train.shape

(1195,)

In [29]:
X_train

532     * Was a nice day and, impressively, i was sens...
534     I'm so in love with you. I'm excited each day ...
1108    You have won a guaranteed £200 award or even £...
490     Ok thats cool. Its , just off either raglan rd...
933     Valentines Day Special! Win over £1000 in our ...
                              ...                        
1130                               Are you there in room.
1294    I love u 2 my little pocy bell I am sorry but ...
860        I jokin oni lar.. Ü busy then i wun disturb ü.
1459    So how are you really. What are you up to. How...
1126               Except theres a chick with huge boobs.
Name: Message, Length: 1195, dtype: object

In [31]:
# Extracting features from data
feature_extraction = TfidfVectorizer(min_df = 1, ngram_range=(1,2), stop_words = "english", lowercase = True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [33]:
print(X_train_features)

  (0, 7479)	0.1754468882732059
  (0, 3578)	0.13268814303731882
  (0, 5704)	0.23156400559293633
  (0, 9201)	0.23156400559293633
  (0, 11487)	0.19232802125227239
  (0, 5493)	0.15373782390415805
  (0, 4041)	0.19232802125227239
  (0, 4389)	0.17817204627266775
  (0, 4450)	0.19232802125227239
  (0, 5924)	0.11097907866827099
  (0, 2308)	0.23156400559293633
  (0, 8707)	0.19232802125227239
  (0, 7480)	0.19715595264225375
  (0, 3596)	0.23156400559293633
  (0, 5705)	0.23156400559293633
  (0, 9202)	0.23156400559293633
  (0, 11489)	0.23156400559293633
  (0, 5504)	0.23156400559293633
  (0, 4043)	0.23156400559293633
  (0, 4392)	0.23156400559293633
  (0, 4454)	0.23156400559293633
  (0, 5943)	0.23156400559293633
  (0, 2309)	0.23156400559293633
  (1, 3578)	0.1940278702679132
  (1, 6570)	0.21614157340254833
  :	:
  (1191, 6386)	0.3214448989532906
  (1191, 8183)	0.3214448989532906
  (1191, 2149)	0.3214448989532906
  (1191, 9588)	0.3214448989532906
  (1192, 5895)	0.2951787588049516
  (1192, 6148)	0.2659001

In [35]:
Model = LogisticRegression()
Model.fit(X_train_features, Y_train)

In [37]:
train_predictions = Model.predict(X_train_features)
train_accuracy = accuracy_score(Y_train, train_predictions)
train_report = classification_report(Y_train, train_predictions)
print(f"train_accuracy: {train_accuracy*100:.2f}%")
print(train_report)

train_accuracy: 99.33%
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       603
           1       1.00      0.99      0.99       592

    accuracy                           0.99      1195
   macro avg       0.99      0.99      0.99      1195
weighted avg       0.99      0.99      0.99      1195



In [39]:
test_predictions = Model.predict(X_test_features)
val_accuracy = accuracy_score(Y_test, test_predictions)
val_report = classification_report(Y_test, test_predictions)
print(f"val_accuracy: {val_accuracy*100:.2f}%")
print(val_report)

val_accuracy: 96.99%
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       144
           1       0.99      0.95      0.97       155

    accuracy                           0.97       299
   macro avg       0.97      0.97      0.97       299
weighted avg       0.97      0.97      0.97       299



In [41]:
def spam_detector(text:str):
    text_features = feature_extraction.transform(text)
    prediction = Model.predict(text_features)
    
    if prediction == 1:
        print("The text is Spam!")
    else:
        print("The text is Ham!")

In [49]:
# this should be SPAM!
spam_detector(["We've extended our Expose our Logo contest by one month! The brief is simple - expose the Freelancer logo for your chance to win $10,000. Head to the contest page to find out more details. https://www.freelancer.com/contest/Freelancercom-Expose-our-Logo-Challenge-2475191/details"])

The text is Spam!


In [51]:
# this should be SPAM!
spam_detector(["""Turn your dreams into reality this Valentine's Day with Valeo's Virtual GLP-1 Weight Loss Program. Complimentary online doctor consultation, effective GLP-1 medications, supplements to boost metabolism, and at-home blood tests. Book now to get your free online doctor consultation and start your ultimate weight loss journey!
Kristen Els' transformation story: She lost 27 kg with Valeo's Online GLP-1 Weight Loss Program. The before and after pictures showcase her incredible journey, with support from home blood tests, Mounjaro delivery, and at-home nurse services. Kristen feels better both mentally and physically. Start your transformation today with Valeo's Online GLP-1 Weight Loss Program.
Lose up to 25% of your weight with FDA-Approved Medications. Featuring Mounjaro (weekly injection for blood sugar control and appetite management), Wegovy (weekly Semaglutide injection to reduce hunger and cravings), Ozempic (weekly Semaglutide injection for blood sugar control and weight management), and Rybelsus (daily Semaglutide tablet for appetite control and weight management). Shop now.
Book your first complimentary Dr. Consultation and reach your weight goal safely and effectively.
Health & Wellness as it should be: Easy, convenient & impactful. Download the Valeo Health app now and stay connected."""])

The text is Ham!


In [53]:
# this should be HAM!
spam_detector(["""Hello, Thank you for your reply and I apologize for the delayed response. Since you mentioned that you are currently using your brother's payment method, could you please provide us with their full legal name? Thank you! Best wishes! Goya The Patreon Team"""])

The text is Ham!
