In [1]:
import pandas as pd
import numpy as np
import pickle
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

spam_or_ham = pd.read_csv("../ml_training_set/spam_or_ham_cleaned.csv")
spam_or_ham.head()

Unnamed: 0,Label,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
spam_or_ham.describe()
# 0 = spam
# 1 = ham

Unnamed: 0,Label
count,5572.0
mean,0.865937
std,0.340751
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [3]:
(spam_or_ham["Label"] == 1).sum(), (spam_or_ham["Label"] == 0).sum(), (spam_or_ham["Label"] == 1).sum() / len(spam_or_ham)
# So it means that most of the data is "ham"

(4825, 747, 0.8659368269921034)

In [4]:
spam_or_ham.isna().count()

Label      5572
Message    5572
dtype: int64

In [5]:
spam_or_ham.dtypes

Label       int64
Message    object
dtype: object

In [6]:
X = spam_or_ham["Message"]
y = spam_or_ham["Label"]

vectorizer = TfidfVectorizer()

transformed_X = vectorizer.fit_transform(X)
transformed_X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 73911 stored elements and shape (5572, 8670)>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.979372197309417

In [8]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)
model2.score(X_test, y_test)

0.9632286995515695

In [9]:
prompt = 'you just won a prize! click here to win more'
transformed_prompt = vectorizer.transform([prompt])

model.predict_proba(transformed_prompt)

array([[0.63, 0.37]])

In [77]:
# now saving the model
pickle.dump(model, open("../trained_ml_model/spam_or_ham_ml_model.pkl", "wb"))

# save the TfidfVectorizer:
with open('../trained_ml_model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# save the transformed sparse matrix:
scipy.sparse.save_npz('../trained_ml_model/transformed_X.npz', transformed_X)

In [10]:
# now opening everything and using it
# loading the TfidfVectorizer:
with open('../trained_ml_model/tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# loading the transformed sparse matrix:
loaded_transformed_X = scipy.sparse.load_npz('../trained_ml_model/transformed_X.npz')

# Load the model
with open('../trained_ml_model/spam_or_ham_ml_model.pkl', 'rb') as ml:
    loaded_model = pickle.load(ml)

In [11]:
prompt = 'you just won a prize! click here to win more'
prompt2 = 'your change to win a prize in cash 250€ every week. click on this link!'
transformed_prompt_loaded = loaded_vectorizer.transform([prompt])
transformed_prompt_loaded2 = loaded_vectorizer.transform([prompt2])

loaded_model.predict_proba(transformed_prompt_loaded), loaded_model.predict_proba(transformed_prompt_loaded2)

(array([[0.44, 0.56]]), array([[0.58, 0.42]]))