# 3. Advanced NLP Techniques

**Project:** Text EDA (20 Newsgroups)  
**Goal:** Apply advanced NLP techniques including Named Entity Recognition, POS tagging, text classification, and advanced topic modeling.

---

## 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import os

# Fetch the NLTK resources requested by this notebook; quiet=True suppresses
# the per-resource download log.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

# spaCy is optional: the NER and POS cells check SPACY_AVAILABLE and print a
# notice instead of running when it is False.
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
    SPACY_AVAILABLE = True
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate. Both a missing package and a missing
    # model end up here, so the actual reason is reported as well.
    print("spaCy not available. Install with: pip install spacy && python -m spacy download en_core_web_sm")
    print(f"  Reason: {type(exc).__name__}: {exc}")
    SPACY_AVAILABLE = False

# TextBlob is optional too: the sentiment cell falls back to VADER only when
# TEXTBLOB_AVAILABLE is False.
try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError as exc:
    print("TextBlob not available. Install with: pip install textblob")
    print(f"  Reason: {type(exc).__name__}: {exc}")
    TEXTBLOB_AVAILABLE = False

# Plot defaults shared by every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 2. Load Dataset
We'll continue using the 20 Newsgroups dataset from previous notebooks.

In [None]:
# Restrict the corpus to four newsgroups.
categories = ['sci.space', 'comp.graphics', 'talk.politics.mideast', 'rec.sport.hockey']

# Directory handed to scikit-learn as its download/cache location.
# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
data_home = '../../data/raw/scikit_learn_data'
os.makedirs(data_home, exist_ok=True)

# Training subset with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    data_home=data_home,
)

# One row per document: raw text, integer target, and the target's name.
df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})
df['category'] = df['target'].map(lambda x: newsgroups.target_names[x])

print(f"Dataset Shape: {df.shape}")
print(f"Categories: {newsgroups.target_names}")

## 3. Named Entity Recognition (NER)
Extract and analyze named entities (persons, organizations, locations) from the text.

In [None]:
if SPACY_AVAILABLE:
    # Sample subset for NER (processing all texts is slow). The sample size is
    # capped at the number of rows so a small frame does not raise.
    sample_texts = df['text'].sample(min(100, len(df)), random_state=42)

    # Only these three entity labels are collected; every mention is kept,
    # so repeated mentions count towards the frequencies below.
    entities = {'PERSON': [], 'ORG': [], 'GPE': []}

    # Limit each document to its first 1000 chars for speed, and let spaCy
    # process the texts as a batched stream instead of one call per document.
    truncated_texts = (text[:1000] for text in sample_texts)
    for doc in nlp.pipe(truncated_texts):
        for ent in doc.ents:
            if ent.label_ in entities:
                entities[ent.label_].append(ent.text)

    # One panel per entity type showing its ten most frequent mentions.
    fig, axes = plt.subplots(1, len(entities), figsize=(15, 4))

    for ax, (entity_type, entity_list) in zip(axes, entities.items()):
        ax.set_title(f'Top {entity_type} Entities')
        ax.set_xlabel('Frequency')
        if not entity_list:
            # Keep the panel labelled instead of leaving a blank, untitled axis.
            ax.text(0.5, 0.5, 'No entities found',
                    ha='center', va='center', transform=ax.transAxes)
            continue
        entity_counts = pd.Series(entity_list).value_counts().head(10)
        ax.barh(entity_counts.index, entity_counts.values)
        # barh draws the first bar at the bottom; flip the axis so the most
        # frequent entity is shown at the top.
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

    print(f"Total Persons: {len(entities['PERSON'])}")
    print(f"Total Organizations: {len(entities['ORG'])}")
    print(f"Total Locations: {len(entities['GPE'])}")
else:
    print("spaCy not available for NER analysis")

## 4. Part-of-Speech (POS) Tagging
Analyze the grammatical structure of the text.

In [None]:
if SPACY_AVAILABLE:
    # Analyze POS distribution for the first 500 characters of the first
    # document in the frame.
    sample_text = df['text'].iloc[0]
    doc = nlp(sample_text[:500])

    pos_tags = [token.pos_ for token in doc]
    pos_counts = pd.Series(pos_tags).value_counts()
    pos_labels = pos_counts.index.tolist()

    plt.figure(figsize=(10, 6))
    # Passing `palette` without `hue` is deprecated in recent seaborn, so the
    # tag is assigned to `hue` as well. dodge=False keeps one full-width bar
    # per tag.
    ax = sns.barplot(x=pos_counts.values, y=pos_labels, hue=pos_labels,
                     palette='viridis', dodge=False)
    # The hue legend would only repeat the y-axis labels; drop it if the
    # installed seaborn version drew one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title('Part-of-Speech Distribution (Sample Text)')
    plt.xlabel('Count')
    plt.ylabel('POS Tag')
    plt.show()

    # Extract noun phrases
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    print(f"\nSample Noun Phrases: {noun_phrases[:10]}")
else:
    print("spaCy not available for POS tagging")

## 5. Text Classification
Build and compare classification models to predict document categories.

In [None]:
# Prepare data for classification: hold out 20% of the documents for testing.
# stratify keeps the class proportions the same in both splits, so the
# per-class metrics reported later are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42,
    stratify=df['target'],
)

# TF-IDF vectorization. The vocabulary and IDF weights are learned from the
# training split only and then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Training set size: {X_train_tfidf.shape}")
print(f"Test set size: {X_test_tfidf.shape}")

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix and
# predict labels for the held-out test matrix.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

# Per-class precision/recall/F1, labelled with the newsgroup names.
print(
    "Logistic Regression Results:",
    classification_report(y_test, lr_pred, target_names=newsgroups.target_names),
    sep="\n",
)

In [None]:
# Fit a multinomial naive Bayes classifier on the same TF-IDF features and
# predict labels for the held-out test matrix.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

# Per-class precision/recall/F1, labelled with the newsgroup names.
print(
    "Naive Bayes Results:",
    classification_report(y_test, nb_pred, target_names=newsgroups.target_names),
    sep="\n",
)

In [None]:
# Side-by-side confusion matrices for the two classifiers, one heatmap each.
model_predictions = (
    ('Logistic Regression', lr_pred),
    ('Naive Bayes', nb_pred),
)
class_names = newsgroups.target_names

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

for ax, (title, pred) in zip(axes, model_predictions):
    cm = confusion_matrix(y_test, pred)
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,   # write the count inside every cell
        fmt='d',      # counts are integers
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(f'{title} Confusion Matrix')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

plt.tight_layout()
plt.show()

## 6. Advanced Topic Modeling
Compare LDA vs NMF and evaluate them with LDA perplexity and NMF reconstruction error.

In [None]:
# Topic-model settings used by this cell and the ones that follow.
n_topics = 4
n_top_words = 10

# Bag-of-words counts over the whole corpus: English stop words removed,
# terms must appear in at least 2 documents and in at most 95% of them,
# and the vocabulary is capped at 1000 terms.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=1000,
)
doc_term_matrix = vectorizer.fit_transform(df['text'])

# Fit both decompositions on the same document-term matrix with the same
# number of components and seed.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42).fit(doc_term_matrix)
nmf = NMF(n_components=n_topics, random_state=42).fit(doc_term_matrix)

print("Models trained successfully")

In [None]:
def display_topics(model, feature_names, n_top_words):
    """Return one summary line per topic listing its highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_``, iterated here as
            one weight vector per topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its vocabulary term.
        n_top_words: Number of terms to list per topic. Values of zero or
            below yield no terms; values above the vocabulary size yield
            every term.

    Returns:
        A list of strings of the form ``"Topic <idx>: <w1> <w2> ..."`` with
        the words ordered from highest to lowest weight.
    """
    # Clamp so a negative count cannot turn into a "drop the last k" slice.
    n_top_words = max(n_top_words, 0)
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # Reverse the ascending argsort first and slice from the front:
        # slicing with [-n:] would select the whole vocabulary when n == 0.
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_indices]
        topics.append(f"Topic {topic_idx}: {' '.join(top_words)}")
    return topics

# Vocabulary terms in the column order of the document-term matrix.
feature_names = vectorizer.get_feature_names_out()

# Summarise both models with the same helper, then print them under their
# respective headings.
lda_topics = display_topics(lda, feature_names, n_top_words)
nmf_topics = display_topics(nmf, feature_names, n_top_words)

for heading, topic_lines in (("\nLDA Topics:", lda_topics), ("\nNMF Topics:", nmf_topics)):
    print(heading)
    for line in topic_lines:
        print(line)

In [None]:
# LDA: perplexity computed on the same matrix the model was fitted on.
lda_perplexity = lda.perplexity(doc_term_matrix)
print(f"\nLDA Perplexity: {lda_perplexity:.2f}")
print("(Lower perplexity indicates better performance)")

# NMF: Frobenius norm of the difference between the (densified) document-term
# matrix and its factorisation W @ H.
H = nmf.components_
W = nmf.transform(doc_term_matrix)
residual = doc_term_matrix.toarray() - W @ H
reconstruction_error = np.linalg.norm(residual, ord='fro')
print(f"NMF Reconstruction Error: {reconstruction_error:.2f}")

## 7. Multiple Sentiment Analysis Methods
Compare VADER and TextBlob sentiment analysis.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The analyzer loads the 'vader_lexicon' resource when it is constructed and
# fails with LookupError if it is missing, so make sure it is present first.
nltk.download('vader_lexicon', quiet=True)

# VADER sentiment: compound score of the first 500 characters of each text.
sia = SentimentIntensityAnalyzer()
df['vader_sentiment'] = df['text'].apply(lambda x: sia.polarity_scores(x[:500])['compound'])

# TextBlob sentiment
if TEXTBLOB_AVAILABLE:
    # Polarity of the same 500-character prefix, so both methods score
    # identical input.
    df['textblob_sentiment'] = df['text'].apply(lambda x: TextBlob(x[:500]).sentiment.polarity)

    # Compare methods: one histogram per scorer, coloured by category.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    for idx, col in enumerate(['vader_sentiment', 'textblob_sentiment']):
        sns.histplot(data=df, x=col, hue='category', kde=True, bins=30, ax=axes[idx])
        axes[idx].set_title(f'{col.replace("_", " ").title()} Distribution')
        axes[idx].set_xlabel('Sentiment Score')

    plt.tight_layout()
    plt.show()

    # Correlation between methods (off-diagonal entry of the 2x2 matrix).
    correlation = df[['vader_sentiment', 'textblob_sentiment']].corr().iloc[0, 1]
    print(f"\nCorrelation between VADER and TextBlob: {correlation:.3f}")

    # Mean sentiment by category
    print("\nMean Sentiment by Category:")
    print(df.groupby('category')[['vader_sentiment', 'textblob_sentiment']].mean())
else:
    # Only VADER
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='vader_sentiment', hue='category', kde=True, bins=30)
    plt.title('VADER Sentiment Distribution by Category')
    plt.xlabel('Sentiment Score')
    plt.show()

    print("\nMean VADER Sentiment by Category:")
    print(df.groupby('category')['vader_sentiment'].mean())