## Practical Demonstration with Python

### Code: Implementing Self-Training for Semi-Supervised Learning (SSL)



### Text Classification (Spam Detection):
We'll simulate this using the SMS Spam Collection Dataset (a common dataset for spam classification). In this scenario, we'll treat most of the data as unlabeled and only use a small portion as labeled.

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# 1. Load the dataset from a local file (tab-separated, no header row:
#    first column is the class label, second is the raw SMS text).
data = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# 2. Encode labels as integers: ham -> 0, spam -> 1
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
X = data['message']
y = data['label']

# 3. Hold out the test set BEFORE vectorizing. Fitting the vectorizer on the
#    full corpus would let test-set vocabulary leak into the training features.
#    Same seed and row count as before, so the row partition is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the vocabulary from training messages only, then reuse it for the test set
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Keep the true label for a random 10% of the training rows.
# A seeded generator makes the labeled subset (and the metrics) reproducible.
rng = np.random.default_rng(42)
n_train = len(y_train)
labeled_idx = rng.choice(n_train, size=int(0.1 * n_train), replace=False)
unlabeled_idx = np.setdiff1d(np.arange(n_train), labeled_idx)

# Mark the remaining 90% of y_train as unlabeled; -1 is the marker
# SelfTrainingClassifier treats as "no label".
y_train_semi = y_train.to_numpy(copy=True)
y_train_semi[unlabeled_idx] = -1

# 4. Initialize and train the Self-Training Classifier.
#    criterion='k_best' pseudo-labels the 50 most confident unlabeled samples
#    per iteration, for at most 5 iterations.
base_model = MultinomialNB()
ssl_model = SelfTrainingClassifier(base_model, criterion='k_best', k_best=50, max_iter=5)
ssl_model.fit(X_train, y_train_semi)

# 5. Evaluate the model on the held-out test set
y_pred = ssl_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9794
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.91      0.94      0.92       149

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Sentiment Analysis
Perform sentiment analysis to classify tweets as positive or negative with semi-supervised learning, using a generic Sentiment140-like dataset of labeled tweets.

In [24]:
# 1. Load the dataset
from scipy.sparse import vstack as sparse_vstack

data = pd.read_csv('train.csv')
# .copy() gives an independent frame, so the label mapping below does not
# write into a slice of the original DataFrame.
data = data[['label', 'tweet']].copy()  # 'label' (1=positive, 0=negative), 'tweet' (text)

# 2. Map the numeric labels to readable class names
data['label'] = data['label'].map({0: 'negative', 1: 'positive'})

# 3. Split the data into labeled (10%) and unlabeled (90%) sets,
#    stratified so both parts keep the same class balance.
labeled_data, unlabeled_data = train_test_split(data, test_size=0.9, stratify=data['label'], random_state=42)

# 4. Prepare data for vectorization.
#    The vocabulary (top 1000 terms) is learned from the labeled tweets only.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
labeled_X = vectorizer.fit_transform(labeled_data['tweet'])
unlabeled_X = vectorizer.transform(unlabeled_data['tweet'])

# Combine labeled and unlabeled for semi-supervised learning.
# Stack the sparse matrices directly instead of densifying them first.
X_combined = sparse_vstack([labeled_X, unlabeled_X], format='csr')
# Object dtype lets the string class names coexist with the integer -1
# marker that SelfTrainingClassifier uses for unlabeled samples.
y_combined = np.concatenate((
    labeled_data['label'].to_numpy(dtype=object),
    np.full(len(unlabeled_data), -1, dtype=object),
))

# 5. Define the base classifier and the self-training wrapper
base_clf = MultinomialNB()
self_training_clf = SelfTrainingClassifier(base_clf, max_iter=10)

# 6. Train the model
self_training_clf.fit(X_combined, y_combined)

# 7. Evaluate the model.
#    NOTE: this scores the model on the same unlabeled pool it was self-trained
#    on (true labels were hidden during fit), not on an independent hold-out
#    set. Split off a separate test set for an unbiased generalization estimate.
X_test = unlabeled_X  # already vectorized above; no need to transform again
y_test = unlabeled_data['label']
y_pred = self_training_clf.predict(X_test)

# 8. Print results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9337064590141139

Classification Report:
               precision    recall  f1-score   support

    negative       0.95      0.98      0.96     26748
    positive       0.54      0.38      0.45      2018

    accuracy                           0.93     28766
   macro avg       0.75      0.68      0.71     28766
weighted avg       0.93      0.93      0.93     28766

