In [2]:
#importing packages, pandas to view data, re to look for regular expressions, and sklearn to make preds
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [4]:
#loading dataframe and deleting unwanted columns
df = pd.read_csv('spam.csv', encoding='latin-1')
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

#removing any special characters while keeping whitespaces
df['v2'] = df['v2'].apply(lambda x: re.sub('[^A-Za-z0-9\s]+', '', x))

#replacing all the 'spams' with 1s and all the 'hams' with 0s
df['v1'] = df['v1'].replace({'spam': 1, 'ham': 0})

#changing up column names
df.rename(columns={'v1': 'result'}, inplace=True)
df.rename(columns={'v2': 'input'}, inplace=True)

df.head()


Unnamed: 0,result,input
0,0,Go until jurong point crazy Available only in ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf he lives aroun...


In [40]:
df.shape

(5572, 2)

In [5]:
#You could input any of your own text here to see if it is either spam or ham. 
new_text = ["Limited time offer: Buy one, get one free.", "Congratulations! You have been selected to win a free vacation."]


In [8]:

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', LogisticRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['input'], df['result'], test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Use the pipeline to predict the labels of the test data
y_pred = pipeline.predict(X_test)

# Predict the probabilities using proba
y_proba = pipeline.predict_proba(new_text)
for i in range(len(new_text)):
    print(new_text[i], "-> Prediction:", y_pred[i], "Probability of ham:", y_proba[i][0], "Probability of spam:", y_proba[i][1])

#Print a classification report to evaluate the performance of the model
print(classification_report(y_test, y_pred))

Limited time offer: Buy one, get one free. -> Prediction: 0 Probability of ham: 0.7553724009868682 Probability of spam: 0.2446275990131318
Congratulations! You have been selected to win a free vacation. -> Prediction: 0 Probability of ham: 0.21756629349131584 Probability of spam: 0.7824337065086842
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.98      0.65      0.78       150

    accuracy                           0.95      1115
   macro avg       0.96      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115

