# Assignment 11: NLP and Naive Bayes

## Text Classification and Sentiment Analysis on Blog Posts

**Topics Covered:**
- Text Preprocessing (Tokenization, Stopwords, Cleaning)
- TF-IDF Feature Extraction
- Naive Bayes Classification
- Sentiment Analysis

---
## Step 1: Import Libraries and Load Data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# Fetch the NLTK resources this notebook relies on (tokenizer models, stopword list)
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Read the blog dataset from the working directory
df = pd.read_csv('blogs.csv')

print("Dataset loaded! Shape:", df.shape)
df.head()

---
## Step 2: Data Exploration

In [None]:
# Column dtypes first, then the number of missing values per column
print("=== Data Info ===")
column_types = df.dtypes
print(column_types)
print("\n=== Missing Values ===")
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Count how many posts fall under each label
print("=== Category Distribution ===")
category_counts = df['Labels'].value_counts()
print(category_counts)

# Bar chart of the counts; written to disk before being displayed
plt.figure(figsize=(10, 6))
plt.bar(
    category_counts.index,
    category_counts.values,
    color='steelblue',
)
plt.title('Distribution of Blog Categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('category_distribution.png')
plt.show()

In [None]:
# Preview the first 500 characters of the first post
print("=== Sample Blog Post ===")
first_post = df['Data'].iloc[0]
print(first_post[:500])

---
## Step 3: Text Preprocessing

In [None]:
# Text cleaning function
# Maps every ASCII punctuation character to a space (built once, reused per call)
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clean_text(text):
    """Normalize a raw blog post into lowercase, space-separated words.

    Steps, in order: lowercase, drop e-mail addresses, drop URLs, turn
    punctuation into spaces, drop digits, collapse whitespace.

    Args:
        text: Raw post. Non-string values are converted with ``str``;
            ``None`` and float NaN (missing cells) yield an empty string.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # A missing cell would otherwise be stringified into the bogus
    # token "none"/"nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Strip e-mails and URLs first, while their punctuation is still intact
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Replace (rather than delete) punctuation so that adjacent words such as
    # "end.Next" or "well-known" stay separate instead of being fused.
    text = text.translate(_PUNCT_TO_SPACE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Collapse runs of whitespace and trim the ends
    return ' '.join(text.split())

# Clean every post and keep the result alongside the raw text
print("=== Cleaning Text ===")
raw_posts = df['Data']
df['cleaned_text'] = raw_posts.apply(clean_text)
print("Text cleaning complete!")

# Show the first 200 characters of the first post before and after cleaning
for caption, column in (("\nBefore cleaning:", 'Data'),
                        ("\nAfter cleaning:", 'cleaned_text')):
    print(caption)
    print(df[column].iloc[0][:200])

In [None]:
# Remove stopwords
# Lazily-built cache of the NLTK English stopword list, so the corpus file is
# read once instead of once per row.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Drop stopwords from whitespace-separated *text*.

    Args:
        text: String whose words are separated by whitespace.
        stop_words: Optional collection of words to remove. When omitted,
            the NLTK English stopword list is used (loaded on first call
            and reused afterwards).

    Returns:
        The remaining words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS

    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    return ' '.join(word for word in text.split() if word not in stop_words)

# Filter stopwords out of the cleaned text for every post
print("=== Removing Stopwords ===")
cleaned_posts = df['cleaned_text']
df['processed_text'] = cleaned_posts.apply(remove_stopwords)
print("Stopwords removed!")

# Preview the first 200 characters of the first processed post
print("\nAfter stopword removal:")
first_processed = df['processed_text'].iloc[0]
print(first_processed[:200])

---
## Step 4: Feature Extraction (TF-IDF)

In [None]:
# TF-IDF Vectorization
print("=== TF-IDF Feature Extraction ===")

# Split the *text* before vectorizing: fitting TF-IDF on every row would let
# the held-out posts shape the vocabulary and IDF weights (data leakage),
# which inflates the test metrics.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], df['Labels'], test_size=0.2, random_state=42
)

# Create TF-IDF vectorizer; learn vocabulary/IDF from the training posts only
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix and labels, kept under their original names; this uses
# the already-fitted vectorizer and is for inspection only.
X = tfidf.transform(df['processed_text'])
y = df['Labels']

print("TF-IDF matrix shape:", X.shape)
print("Number of features (words):", len(tfidf.get_feature_names_out()))

# Show sample features
print("\nSample features (words):")
print(tfidf.get_feature_names_out()[:20])

# Split sizes
print("Training set:", X_train.shape[0])
print("Testing set:", X_test.shape[0])

---
## Step 5: Naive Bayes Classification

In [None]:
# Fit a multinomial Naive Bayes classifier on the training features
print("=== Training Naive Bayes ===")

nb_model = MultinomialNB().fit(X_train, y_train)

print("Model trained!")

In [None]:
# Predict labels for the held-out posts
y_pred = nb_model.predict(X_test)

# Score the predictions; the per-class metrics are combined with weighted averaging
print("=== Model Evaluation ===")

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(caption, round(value, 4))

In [None]:
# Per-class precision / recall / F1 table
print("=== Classification Report ===")
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion Matrix
# Pin the class order explicitly and reuse it for the tick labels, so each
# row/column of the heatmap is named after its category instead of showing
# a bare index.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Naive Bayes')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

---
## Step 6: Sentiment Analysis

In [None]:
# Sentiment Analysis using TextBlob
print("=== Sentiment Analysis ===")

def get_sentiment(text):
    """Label *text* as 'Positive', 'Negative' or 'Neutral'.

    The input is converted with ``str`` and scored with TextBlob's polarity.
    Scores above 0.1 are positive, scores below -0.1 are negative, and
    everything in between is neutral.
    """
    score = TextBlob(str(text)).sentiment.polarity

    if score > 0.1:
        return 'Positive'
    if score < -0.1:
        return 'Negative'
    return 'Neutral'

# Score the raw (uncleaned) text of every post
original_posts = df['Data']
df['sentiment'] = original_posts.apply(get_sentiment)

print("Sentiment analysis complete!")

In [None]:
# Count posts per sentiment label
print("=== Sentiment Distribution ===")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# One bar per sentiment, coloured by label; saved to disk, then displayed
colors = dict(Positive='green', Neutral='gray', Negative='red')
bar_colors = [colors[label] for label in sentiment_counts.index]

plt.figure(figsize=(8, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
plt.title('Distribution of Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.savefig('sentiment_distribution.png')
plt.show()

In [None]:
# Sentiment by Category
print("=== Sentiment by Category ===")

# Rows are blog categories, columns are the sentiment labels that occur
sentiment_by_category = pd.crosstab(df['Labels'], df['sentiment'])
print(sentiment_by_category)

# Plot
# Pick each bar colour from the column's own name. A positional list only
# lines up when all three sentiments are present; if one is missing, the
# remaining columns would be drawn in the wrong colours.
sentiment_colors = {'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}
bar_colors = [sentiment_colors[name] for name in sentiment_by_category.columns]

sentiment_by_category.plot(kind='bar', figsize=(12, 6), color=bar_colors)
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('Sentiment Distribution by Category')
plt.legend(title='Sentiment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('sentiment_by_category.png')
plt.show()

---
## Summary

### Key Findings:

1. **Text Classification:**
   - Naive Bayes performed well on blog post classification
   - TF-IDF effectively captured important words for each category

2. **Sentiment Analysis:**
   - Most blog posts have neutral sentiment
   - Different categories show different sentiment patterns

### Topics Covered:
- Text preprocessing (cleaning, tokenization, stopword removal)
- TF-IDF feature extraction
- Multinomial Naive Bayes classification
- Sentiment analysis using TextBlob