Step 1: Load Dataset

In [92]:
import pandas as pd
import numpy as np

In [93]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [94]:
# Encode labels (ham = 0, spam = 1)
df['label']=df['label'].map({'ham':0,'spam':1})
df=df.dropna()

Step 2: Preprocess Text

In [95]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))
stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [96]:
import re
import string

In [97]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)             # remove digits
    text = re.sub(r'\W+', ' ', text)            # remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()    # trim whitespace
    return text

In [98]:
# Remove empty or whitespace-only messages after cleaning
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].str.strip() != '']


Convert to TF-IDF features

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
X=df['message'].values
y=df['label'].values

In [101]:
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(df['message'])


Step 3: Simulate Semi-Supervised Setup

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [104]:
y_train = np.array(y_train, dtype=int)
y_train_semi = np.copy(y_train)
n_labeled_points = int(0.2 * len(y_train))
y_train_semi[n_labeled_points:] = -1
print("NaN in y_train_semi?", np.isnan(y_train_semi).any())  # Should be False now


NaN in y_train_semi? False


In [105]:
print("✅ Labeled set classes:", np.unique(y_train_semi[y_train_semi != -1], return_counts=True))
print("✅ Unlabeled count:", np.sum(y_train_semi == -1))
print("✅ y_test classes:", np.unique(y_test, return_counts=True))


✅ Labeled set classes: (array([0, 1]), array([777, 114]))
✅ Unlabeled count: 3564
✅ y_test classes: (array([0, 1]), array([955, 159]))


Step 4: Train Label Spreading Model

In [106]:
from sklearn.semi_supervised import LabelSpreading

In [107]:
label_model = LabelSpreading(kernel='knn', n_neighbors=10, alpha=0.2)
label_model.fit(X_train.toarray(), y_train_semi)


Step 5: Evaluate on test set

In [108]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [109]:
y_pred=label_model.predict(X_test.toarray())
y_test=np.array(y_test,dtype="int")

In [110]:
precision=precision_score(y_test, y_pred, zero_division=1)
recall=recall_score(y_test, y_pred, zero_division=1)
f1=f1_score(y_test, y_pred, zero_division=1)
accuracy=accuracy_score(y_test, y_pred)

In [111]:
print(" Evaluation Metrics:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")


 Evaluation Metrics:
Accuracy  : 0.9686
Precision : 0.9697
Recall    : 0.8050
F1 Score  : 0.8797
