In [1]:
# Import necessary libraries
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler
import math
import os # For checking file existence


In [2]:
# --- Configuration ---
# Set the paths to your data files
# IMPORTANT: Replace these with the actual paths to your files
data_dir = './' # Assume files are in the current directory
train_data_file = os.path.join(data_dir, 'trainData.txt')
test_data_file = os.path.join(data_dir, 'testData.txt')
train_label_file = os.path.join(data_dir, 'trainLabel.txt')
test_label_file = os.path.join(data_dir, 'testLabel.txt')
vocab_file = os.path.join(data_dir, 'words.txt')


In [1]:
# --- Helper Function to Load Data ---
def load_data(data_file, label_file, num_docs, num_words):
    """Loads document data and labels into a sparse matrix and label array."""
    print(f"Loading data from {data_file} and {label_file}...")

    # Initialize a sparse matrix (LIL format is efficient for construction)
    # Documents are rows, words are columns
    # Using 1-based indexing from files, so size is num_docs x num_words
    X = lil_matrix((num_docs, num_words), dtype=int)
    y = np.zeros(num_docs, dtype=int)

    # Read data file (docId wordId)
    doc_ids_present = set()
    try:
        with open(data_file, 'r') as f:
            for line in f:
                doc_id, word_id = map(int, line.strip().split())
                # Adjust to 0-based index for matrix
                if 1 <= doc_id <= num_docs and 1 <= word_id <= num_words:
                    X[doc_id - 1, word_id - 1] = 1 # Binary feature: word is present
                    doc_ids_present.add(doc_id)
                else:
                    print(f"Warning: Out of bounds index found in {data_file}: doc_id={doc_id}, word_id={word_id}")
    except FileNotFoundError:
        print(f"Error: Data file not found at {data_file}")
        return None, None
    except Exception as e:
        print(f"Error reading {data_file}: {e}")
        return None, None
    # Read label file (label per line, line number corresponds to docId)
    try:
        with open(label_file, 'r') as f:
            for i, line in enumerate(f):
                doc_id = i + 1 # Line number is docId (1-based)
                if doc_id in doc_ids_present: # Only load labels for docs present in data file
                     # Ensure the index is within the bounds of y
                    if 0 <= doc_id - 1 < num_docs:
                         y[doc_id - 1] = int(line.strip())
                    else:
                         print(f"Warning: doc_id {doc_id} from {label_file} out of bounds for y array (size {num_docs}).")

                # Handle cases where label file might have more lines than docs in data file
                elif doc_id > num_docs:
                    # print(f"Warning: More labels in {label_file} than specified num_docs ({num_docs}). Stopping label read.")
                    break # Stop if we exceed expected number of docs
    except FileNotFoundError:
        print(f"Error: Label file not found at {label_file}")
        return None, None
    except Exception as e:
        print(f"Error reading {label_file}: {e}")
        return None, None

    print(f"Loaded {X.shape[0]} documents and {len(np.unique(y))} labels.")
    # Convert to CSR format for efficient calculations
    return X.tocsr(), y


In [2]:
# --- Main Script ---
# 1. Load Vocabulary
print("Loading vocabulary...")
words = []
try:
    with open(vocab_file, 'r') as f:
        words = [line.strip() for line in f]
    num_words = len(words)
    if num_words == 0:
        raise ValueError("Vocabulary file is empty.")
    print(f"Vocabulary size: {num_words} words.")
except FileNotFoundError:
    print(f"Error: Vocabulary file not found at {vocab_file}")
    exit()
except Exception as e:
    print(f"Error reading {vocab_file}: {e}")
    exit()


Loading vocabulary...


NameError: name 'vocab_file' is not defined

In [3]:
# 2. Determine Number of Documents (using label files is often reliable)
print("Determining number of documents...")
try:
    with open(train_label_file, 'r') as f:
        num_train_docs = sum(1 for _ in f)
    with open(test_label_file, 'r') as f:
        num_test_docs = sum(1 for _ in f)
    print(f"Found {num_train_docs} training documents and {num_test_docs} testing documents.")
    if num_train_docs == 0 or num_test_docs == 0:
        raise ValueError("Label files indicate zero documents.")
except FileNotFoundError:
    print(f"Error: Label file(s) not found. Cannot determine document counts.")
    exit()
except Exception as e:
    print(f"Error reading label files: {e}")
    exit()



Determining number of documents...
Error reading label files: name 'train_label_file' is not defined


In [None]:
# 3. Load Training and Testing Data
X_train_sparse, y_train = load_data(train_data_file, train_label_file, num_train_docs, num_words)
X_test_sparse, y_test = load_data(test_data_file, test_label_file, num_test_docs, num_words)



In [1]:
# Check if loading was successful
if X_train_sparse is None or X_test_sparse is None:
    print("Failed to load data. Exiting.")
    exit()



NameError: name 'X_train_sparse' is not defined

In [None]:
# --- Part a: Naïve Bayes ---
print("\n--- Part a: Gaussian Naïve Bayes ---")


In [None]:
# i. Train GaussianNB and find discriminative words
print("Training Gaussian Naive Bayes classifier...")
gnb = GaussianNB()
gnb.fit(X_train_dense, y_train)
print("Training complete.")


In [None]:
# Calculate discriminative scores
# GaussianNB stores mean (theta_) and variance (var_) for each feature per class.
# Shape of theta_ and var_ is (n_classes, n_features)
# Labels are 1 and 2, so indices are 0 and 1 after fitting.
means_label1 = gnb.theta_[0, :]
vars_label1 = gnb.var_[0, :]
means_label2 = gnb.theta_[1, :]
vars_label2 = gnb.var_[1, :]


In [None]:
epsilon = 1e-9
vars_label1 = np.maximum(vars_label1, epsilon)
vars_label2 = np.maximum(vars_label2, epsilon)




In [None]:
# Calculate log probability density at x=1 (presence of word) for each word and class
# PDF of Normal(mu, sigma^2) is (1 / sqrt(2*pi*sigma^2)) * exp(-(x-mu)^2 / (2*sigma^2))
# Log PDF is -log(sqrt(2*pi*sigma^2)) - (x-mu)^2 / (2*sigma^2)
# We evaluate at x=1
x_val = 1
log_prob_word_label1 = -0.5 * np.log(2 * np.pi * vars_label1) - ((x_val - means_label1)**2 / (2 * vars_label1))
log_prob_word_label2 = -0.5 * np.log(2 * np.pi * vars_label2) - ((x_val - means_label2)**2 / (2 * vars_label2))



In [None]:
# Calculate the discriminative score
discriminative_scores = np.abs(log_prob_word_label1 - log_prob_word_label2)

In [None]:
# Get the indices of the top 10 discriminative words
top_10_indices = np.argsort(discriminative_scores)[-10:][::-1] # Sort descending

print("\nTop 10 most discriminative word features (GaussianNB):")
print("Rank | Word          | Score ( |log P(w|L1) - log P(w|L2)| )")
print("-----|---------------|----------------------------------------")
for i, word_index in enumerate(top_10_indices):
    word = words[word_index]
    score = discriminative_scores[word_index]
    print(f"{i+1:<4} | {word:<13} | {score:.4f}")


In [None]:
# Opinion on features: (Requires manual inspection of the words)
print("\nOpinion: Inspect the words above. Are they strongly related to one topic (e.g., 'god', 'atheism', 'christian', 'religion' for alt.atheism vs. comp.graphics)? If so, they are likely good discriminative features.")



In [None]:
# ii. Calculate and print accuracy
print("\nCalculating Naive Bayes accuracy...")
y_train_pred_gnb = gnb.predict(X_train_dense)
y_test_pred_gnb = gnb.predict(X_test_dense)

train_accuracy_gnb = accuracy_score(y_train, y_train_pred_gnb)
test_accuracy_gnb = accuracy_score(y_test, y_test_pred_gnb)

print(f"Gaussian Naive Bayes Training Accuracy: {train_accuracy_gnb:.4f} ({train_accuracy_gnb*100:.2f}%)")
print(f"Gaussian Naive Bayes Testing Accuracy:  {test_accuracy_gnb:.4f} ({test_accuracy_gnb*100:.2f}%)")




In [None]:
# --- Part b: SVM ---
print("\n--- Part b: Support Vector Machine (SVM) ---")


In [None]:
print("Scaling data using MaxAbsScaler...")
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train_sparse)
X_test_scaled = scaler.transform(X_test_sparse)
print("Scaling complete.")


In [None]:
# Train and evaluate SVM with Linear Kernel
print("\nTraining SVM with Linear Kernel...")
svm_linear = SVC(kernel='linear', random_state=42) # Added random_state for reproducibility
svm_linear.fit(X_train_scaled, y_train)
print("Training complete.")

print("Calculating Linear SVM accuracy...")
y_train_pred_svm_linear = svm_linear.predict(X_train_scaled)
y_test_pred_svm_linear = svm_linear.predict(X_test_scaled)

train_accuracy_svm_linear = accuracy_score(y_train, y_train_pred_svm_linear)
test_accuracy_svm_linear = accuracy_score(y_test, y_test_pred_svm_linear)

print(f"Linear SVM Training Accuracy: {train_accuracy_svm_linear:.4f} ({train_accuracy_svm_linear*100:.2f}%)")
print(f"Linear SVM Testing Accuracy:  {test_accuracy_svm_linear:.4f} ({test_accuracy_svm_linear*100:.2f}%)")



In [None]:
# Train and evaluate SVM with Polynomial Kernel
print("\nTraining SVM with Polynomial Kernel...")
svm_poly = SVC(kernel='poly', random_state=42) # Added random_state for reproducibility
svm_poly.fit(X_train_scaled, y_train)
print("Training complete.")

print("Calculating Polynomial SVM accuracy...")
y_train_pred_svm_poly = svm_poly.predict(X_train_scaled)
y_test_pred_svm_poly = svm_poly.predict(X_test_scaled)

train_accuracy_svm_poly = accuracy_score(y_train, y_train_pred_svm_poly)
test_accuracy_svm_poly = accuracy_score(y_test, y_test_pred_svm_poly)

print(f"Polynomial SVM Training Accuracy: {train_accuracy_svm_poly:.4f} ({train_accuracy_svm_poly*100:.2f}%)")
print(f"Polynomial SVM Testing Accuracy:  {test_accuracy_svm_poly:.4f} ({test_accuracy_svm_poly*100:.2f}%)")



In [None]:
# Plotting Decision Boundaries (Conceptual)
print("\nPlotting Decision Boundaries:")
print("Directly plotting decision boundaries for high-dimensional text data ({} features) is not feasible.".format(num_words))
print("To visualize, one typically reduces dimensionality (e.g., using PCA or t-SNE to 2D)")
print("and then plots the boundary in that reduced space. This provides an approximation.")
print("Code for plotting is omitted here but would involve:")
print("1. `from sklearn.decomposition import PCA`")
print("2. `pca = PCA(n_components=2)`")
print("3. `X_train_pca = pca.fit_transform(X_train_scaled.toarray())` (or dense if already converted)")
print("4. Train SVMs on `X_train_pca`")
print("5. Use a meshgrid and `svm.predict` to plot decision regions (like the scikit-learn example).")

print("\n--- End of Question 2 ---")