Data source: SMS Spam Collection, UCI Machine Learning Repository — https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

In [0]:
# Location of the SMSSpamCollection data file that later cells load with
# spark.read.text(filepath).
# NOTE(review): looks like a Databricks Unity Catalog volume path
# (/Volumes/<catalog>/<schema>/<volume>/...) — confirm for your workspace.
filepath = '/Volumes/workspace/sample_schema/sample_volume/SMSSpamCollection'

In [0]:
# Load the raw file as a Spark DataFrame and render it for a quick visual
# check. The text reader exposes each record in a `value` column, which a
# later cell splits on a tab.
df = spark.read.text(paths=filepath)
display(df)

In [0]:
# Read the text file into Spark and convert to Pandas.
# toPandas() collects every row onto the driver, which is fine for a small
# dataset like this one but should not be copied for large inputs.
df = spark.read.text(filepath)
pdf = df.toPandas()

# Split each record into label and message at the FIRST tab only (n=1).
# Without the limit, a message that itself contains a tab would expand into
# three or more columns and the two-column assignment below would raise.
pdf[['label', 'message']] = pdf['value'].str.split('\t', n=1, expand=True)
pdf = pdf.drop(columns='value')


In [0]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# Step 1: Hold out the test set BEFORE any resampling.
# Upsampling first and splitting afterwards puts copies of the same spam
# message on both sides of the split, so the test scores would be measured
# on messages the model has already seen (data leakage).
# Only rows labelled 'ham' or 'spam' are kept, matching the label filters below.
labeled_df = pdf[pdf['label'].isin(['ham', 'spam'])]
train_df, test_df = train_test_split(
    labeled_df, test_size=0.2, random_state=42, stratify=labeled_df['label'])

# Step 2: Separate ham and spam (training rows only)
spam_df = train_df[train_df['label'] == 'spam']
ham_df = train_df[train_df['label'] == 'ham']

# Step 3: Upsample training spam to match training ham
spam_upsampled = resample(spam_df,
                          replace=True,
                          n_samples=len(ham_df),
                          random_state=42)

# Step 4: Combine into a balanced training set, shuffled so the rows are not
# grouped by class. The test set is left untouched so it keeps the original
# class distribution and contains no resampled duplicates.
balanced_df = pd.concat([ham_df, spam_upsampled]).sample(frac=1, random_state=42)

X_train, y_train = balanced_df['message'], balanced_df['label']
X_test, y_test = test_df['message'], test_df['label']

# Step 5: Vectorise the text and fit the classifier
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)  # vocabulary/IDF learned from X_train only
X_test_tfidf = vectorizer.transform(X_test)        # encoded with the already-fitted vectorizer

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Step 6: Score the predictions for X_test against y_test
y_pred = clf.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)

In [0]:
# Hand-written messages used to spot-check the trained model.
sample = [
    "Congratulations! You won a free ticket to Bahamas!",
    "click this link",
    "review the contract - thomas@thomas",
    "review the contract and click the link thomas@thomas@thomas",
]

# Encode with the existing vectorizer (transform only, no re-fitting) and
# print the predicted label for each message.
sample_tfidf = vectorizer.transform(sample)
sample_labels = clf.predict(sample_tfidf)
print(sample_labels)

In [0]:
# Class-probability estimates for the sample messages; predict_proba returns
# one column per class, ordered as in clf.classes_.
pred_proba = clf.predict_proba(sample_tfidf)

# Look the 'spam' column up by label instead of hard-coding index 1, so the
# printed value is the spam probability regardless of how the labels sort.
# Raises ValueError if the model was never trained on a 'spam' class.
spam_col = list(clf.classes_).index('spam')
for msg, prob in zip(sample, pred_proba):
    print(f"{msg} => Spam probability: {prob[spam_col]:.2f}")
