# Natural Language Processing with Text Classification

This notebook demonstrates various NLP techniques using a text classification task. We'll cover:
1. Text preprocessing and cleaning
2. Feature extraction (Bag of Words, TF-IDF)
3. Word embeddings (Word2Vec)
4. Model training and evaluation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import gensim
from gensim.models import Word2Vec
import re

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent
# NLTK releases; 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Set random seed for reproducibility
np.random.seed(42)

# Set style for plots.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and is not
# available in later releases; prefer the renamed 'seaborn-v0_8' style and
# fall back to the old name only when the new one does not exist.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

In [None]:
# Load the 20 Newsgroups dataset (headers, footers and quoted replies stripped)
requested_categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=requested_categories,
    remove=('headers', 'footers', 'quotes')
)

# The integer targets index into `newsgroups.target_names`, whose order differs
# from the order the categories were requested in. Keep `categories` aligned
# with the targets so that any code using it as a label list (classification
# reports, confusion-matrix ticks) names the classes correctly.
categories = list(newsgroups.target_names)

# Create a DataFrame with the raw text, the integer label and its readable name
df = pd.DataFrame({
    'text': newsgroups.data,
    'target': newsgroups.target,
    'category': [newsgroups.target_names[i] for i in newsgroups.target]
})

# Display basic information
print(f"Number of documents: {len(df)}")
print(f"Number of categories: {len(categories)}")
print("\nCategory distribution:")
print(df['category'].value_counts())

In [None]:
# Text preprocessing function
# These helpers are built once when the cell runs instead of on every call:
# loading the stopword corpus for each document is needless repeated work.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise a raw document into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stopwords, Porter-stem the remaining tokens.

    Args:
        text: Raw document text (str).

    Returns:
        str: The stemmed, stopword-free tokens joined by single spaces
        (an empty string when nothing survives the filtering).
    """
    # Substitute a space rather than deleting the character: deleting fuses
    # neighbouring words ("and/or" -> "andor") and turns contractions such as
    # "don't" into "dont", which the stopword filter no longer recognises.
    text = _NON_LETTER_RE.sub(' ', text.lower())

    tokens = word_tokenize(text)

    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS
    )

In [None]:
# Run the cleaning pipeline over every raw document
df['processed_text'] = df['text'].map(preprocess_text)

# Hold out 20% of the documents for testing; stratifying on the label keeps
# the class proportions the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['target'],
    stratify=df['target'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Bag of Words representation (vocabulary capped at 1000 terms).
# The vectorizer is fitted on the training split only and then applied to the
# test split.
bow_vectorizer = CountVectorizer(max_features=1000)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test_bow)

# Evaluate the model.
# Label the report rows with `newsgroups.target_names`: the integer targets
# index into that list, whereas `categories` was written in a different order
# and would attach the wrong names to the classes.
print("Naive Bayes with Bag of Words:")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

In [None]:
# TF-IDF representation (vocabulary capped at 1000 terms).
# The vectorizer is fitted on the training split only and then applied to the
# test split.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Logistic Regression classifier
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)
lr_classifier.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = lr_classifier.predict(X_test_tfidf)

# Evaluate the model.
# Label the report rows with `newsgroups.target_names`: the integer targets
# index into that list, whereas `categories` was written in a different order
# and would attach the wrong names to the classes.
print("Logistic Regression with TF-IDF:")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

In [None]:
# Word2Vec embeddings
# Prepare text for Word2Vec: use the *training* documents only, so the
# embeddings are never fitted on the held-out test set.
sentences = [text.split() for text in X_train]

# Train Word2Vec model.
# `seed` fixes the initial weights; gensim additionally needs a single worker
# thread for repeatable training, because multi-threaded updates depend on OS
# thread scheduling. (Bit-for-bit repeatability across interpreter launches
# also requires a fixed PYTHONHASHSEED.)
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=42,
    epochs=10
)

# Function to get document vectors
def get_document_vector(text, model):
    """Represent a document as the mean of its known word vectors.

    Args:
        text: Whitespace-separated tokens as a single string.
        model: Object exposing `wv` (supports `in` and item lookup by token)
            and `vector_size`.

    Returns:
        The element-wise mean of the vectors of every token found in
        `model.wv`, or a zero vector of length `model.vector_size` when none
        of the tokens is known.
    """
    embedded = []
    for token in text.split():
        if token in model.wv:
            embedded.append(model.wv[token])

    if embedded:
        return np.mean(embedded, axis=0)
    # No token has an embedding (empty text or all out-of-vocabulary)
    return np.zeros(model.vector_size)

# Get document vectors: one averaged embedding per document
X_train_w2v = np.array([get_document_vector(text, word2vec_model) for text in X_train])
X_test_w2v = np.array([get_document_vector(text, word2vec_model) for text in X_test])

# Train a Logistic Regression classifier
lr_w2v = LogisticRegression(max_iter=1000, random_state=42)
lr_w2v.fit(X_train_w2v, y_train)

# Make predictions
y_pred = lr_w2v.predict(X_test_w2v)

# Evaluate the model.
# Label the report rows with `newsgroups.target_names`: the integer targets
# index into that list, whereas `categories` was written in a different order
# and would attach the wrong names to the classes.
print("Logistic Regression with Word2Vec:")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

In [None]:
# Plot the confusion matrix for the Word2Vec model (the one named in the title).
# Predictions are computed here instead of reusing `y_pred`, which every
# evaluation cell overwrites, so the figure always matches its title no matter
# which cell ran last.
w2v_confusion = confusion_matrix(y_test, lr_w2v.predict(X_test_w2v))

# Tick labels come from `newsgroups.target_names` because the integer targets
# index into that list; `categories` was written in a different order.
class_names = newsgroups.target_names

plt.figure(figsize=(10, 8))
sns.heatmap(
    w2v_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names
)
plt.title('Confusion Matrix - Logistic Regression with Word2Vec')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Project a sample of the learned word embeddings onto two dimensions with PCA
from sklearn.decomposition import PCA

# Take the first 100 vocabulary entries and look up their vectors
words = list(word2vec_model.wv.key_to_index)[:100]
word_vectors = np.array([word2vec_model.wv[token] for token in words])

# Reduce the embeddings to their two leading principal components
pca = PCA(n_components=2)
word_vectors_2d = pca.fit_transform(word_vectors)

# Scatter the projected points and label each one with its word
plt.figure(figsize=(15, 10))
plt.scatter(*word_vectors_2d.T)
for word, point in zip(words, word_vectors_2d):
    plt.annotate(word, (point[0], point[1]))
plt.title('Word Embeddings Visualization (PCA)')
plt.show()

## Conclusion

In this notebook, we explored various NLP techniques for text classification:

1. **Text Preprocessing**:
   - Implemented text cleaning, tokenization, and stemming
   - Removed stopwords and special characters
   - Normalized text to lowercase

2. **Feature Extraction**:
   - Bag of Words representation
   - TF-IDF vectorization
   - Word2Vec embeddings

3. **Model Performance**:
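   - Logistic Regression with TF-IDF performed well; see its classification report above for per-class precision, recall, and F1
   - Averaged Word2Vec embeddings gave dense document vectors that reflect word similarity, although averaging discards word order
   - Naive Bayes with Bag of Words was a good baseline to compare the other models against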

4. **Key Insights**:
   - Different feature extraction methods have different strengths
   - Word embeddings capture semantic relationships
   - Text preprocessing is crucial for good performance

This notebook serves as a good starting point for understanding NLP techniques and their application to text classification problems. 