# Self-Training for Semi-Supervised Learning
This notebook demonstrates the implementation of the self-training approach in semi-supervised learning using Python libraries.

In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
import random
import matplotlib.pyplot as plt

# Seed both random number generators used in this notebook (Python's `random`
# and NumPy's legacy global generator) with one shared value for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

## Step 1: Load Dataset
Here, we use the `movie_reviews` corpus shipped with NLTK (movie reviews labeled as positive or negative) for demonstration purposes.

In [2]:
# Loading the movie_reviews corpus from NLTK (polarity-labeled movie reviews)
from nltk.corpus import movie_reviews
import nltk
nltk.download('movie_reviews')

# Preparing the dataset: one (tokens, category) pair per review file
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so that the categories are interleaved (uses the seeded `random` module)
random.shuffle(documents)

# Splitting into reviews and labels; tokens are re-joined with single spaces
reviews = [" ".join(words) for words, label in documents]
labels = [label for words, label in documents]

# Encoding labels as integers. LabelEncoder numbers classes in sorted order,
# so for this corpus 'neg' -> 0 and 'pos' -> 1.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# 5% of the reviews form the small labeled training set
X_train, X_rest, y_train, y_rest = train_test_split(
    reviews, labels_encoded, test_size=0.95, random_state=42
)

# The remaining 95% is divided into three DISJOINT parts: the unlabeled pool used
# for self-training (60%) and held-out validation and test sets (20% each).
# Keeping the evaluation sets out of the unlabeled pool prevents the model from
# being retrained on pseudo-labeled validation/test reviews, which would leak
# evaluation data into training and inflate the reported accuracies.
X_unlabeled, X_holdout, y_unlabeled, y_holdout = train_test_split(
    X_rest, y_rest, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# y_unlabeled holds the true labels of the pool; it is kept for inspection only
# and must not be used for training.
# Sanity check: the four partitions together cover every review exactly once.
assert len(X_train) + len(X_unlabeled) + len(X_val) + len(X_test) == len(reviews)

ModuleNotFoundError: No module named 'nltk'

## Step 2: Train Initial Model
We train a bag-of-words logistic regression model on the small labeled set and evaluate its performance on the validation set.

In [None]:
# Feature extractor: bag-of-words counts over at most 5000 terms, with English
# stop words removed
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

# Classifier: logistic regression with the liblinear solver and a fixed seed
classifier = LogisticRegression(random_state=42, solver='liblinear')

# Chain both steps into one pipeline and fit it on the labeled reviews
# (fit returns the fitted pipeline itself)
model = make_pipeline(vectorizer, classifier).fit(X_train, y_train)

# Baseline score on the validation set, before any self-training
y_val_pred = model.predict(X_val)
initial_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Initial Validation Accuracy: {initial_accuracy:.2f}")

## Step 3: Implement Self-Training Loop
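In each iteration, the current model predicts labels for the unlabeled samples, the predictions whose confidence exceeds a threshold are kept as pseudo-labels, and the model is retrained on the labeled data together with these high-confidence pseudo-labeled samples.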

In [None]:
# Self-Training Parameters
confidence_threshold = 0.8   # minimum predicted-class probability to accept a pseudo-label
iterations = 5               # number of self-training rounds
validation_accuracies = []   # one validation accuracy per round

# Work on local copies so that X_train, y_train and X_unlabeled are not modified
# by this cell. The labeled set grows and the pool shrinks from round to round.
X_labeled = list(X_train)
y_labeled = np.asarray(y_train)
X_pool = list(X_unlabeled)

# Self-Training Loop
for i in range(iterations):
    print(f"\nIteration {i+1}:")

    if X_pool:
        # Predict pseudo-labels for the samples that are still unlabeled
        probabilities = model.predict_proba(X_pool)
        # predict_proba columns follow model.classes_, so map the argmax column
        # positions back to the actual class labels
        pseudo_labels = model.classes_[np.argmax(probabilities, axis=1)]
        confidence_scores = np.max(probabilities, axis=1)

        # Select high-confidence samples
        high_confidence_idx = np.flatnonzero(confidence_scores > confidence_threshold)
    else:
        # Pool exhausted: nothing left to pseudo-label
        high_confidence_idx = np.array([], dtype=int)

    if high_confidence_idx.size > 0:
        # Permanently move the confident samples from the pool into the labeled
        # set, so pseudo-labels accepted in earlier rounds are kept in later ones
        X_labeled.extend(X_pool[idx] for idx in high_confidence_idx)
        y_labeled = np.concatenate([y_labeled, pseudo_labels[high_confidence_idx]])

        selected = set(high_confidence_idx.tolist())
        X_pool = [text for idx, text in enumerate(X_pool) if idx not in selected]

        # Retrain the model on the enlarged labeled set
        model.fit(X_labeled, y_labeled)
        print(f"Added {len(selected)} pseudo-labeled samples; {len(X_pool)} remain unlabeled")
    else:
        # Nothing new to learn from: keep the current model as it is
        print("No unlabeled samples above the confidence threshold; model left unchanged")

    # Evaluate the model on the validation set (recorded every round so that
    # validation_accuracies always contains `iterations` entries)
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    validation_accuracies.append(val_accuracy)
    print(f"Validation Accuracy after iteration {i+1}: {val_accuracy:.2f}")

## Step 4: Plot Validation Accuracy Across Iterations
This visualization shows how the validation accuracy changes over successive self-training iterations.

In [None]:
# Line chart of the validation accuracy recorded after each self-training
# iteration, drawn with the explicit figure/axes interface
iteration_numbers = range(1, iterations + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(iteration_numbers, validation_accuracies, marker='o', linestyle='--', color='b')
ax.set_title("Validation Accuracy Across Iterations")
ax.set_xlabel("Iteration")
ax.set_ylabel("Validation Accuracy")
ax.set_xticks(iteration_numbers)  # one tick per iteration
ax.grid()
plt.show()

## Step 5: Evaluate Final Model on Test Data
Finally, we evaluate the model's performance on the held-out test set.

In [None]:
# Score the self-trained model once on the held-out test set
y_test_pred = model.predict(X_test)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"\nFinal Test Accuracy: {final_accuracy:.2f}")