NLP TEXT CLASSIFICATION PROJECT TEMPLATE
==========================================
Use Case: Sentiment Analysis, Topic Classification, Spam Detection

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# NLP specific
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Deep Learning (optional)
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch

In [None]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the 'punkt_tab' resource in word_tokenize instead of the
# pickled 'punkt' models; fetch both so the notebook works across NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')

## 1.2 Configuration & Hyperparameters

In [None]:
# Seed, split sizes and model hyperparameters shared by the cells below.
# Code cells read 'random_state', 'test_size' and 'max_features'; 'max_len',
# 'batch_size' and 'epochs' are referenced only by the commented-out
# deep-learning / transformer examples.
CONFIG = dict(
    random_state=42,
    test_size=0.2,
    val_size=0.1,
    max_features=5000,
    max_len=128,
    batch_size=32,
    epochs=10,
    learning_rate=2e-5,
)

# 2. DATA LOADING & EXPLORATION

## 2.1 Load Data

In [None]:
# Read the dataset from CSV (replace the placeholder path with your own file),
# report its (rows, columns) shape, and preview the first rows.
data_path = 'your_data.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
df.head()

## 2.2 Exploratory Data Analysis

In [None]:
# Structural overview. df.info() prints its own report and returns None,
# so the surrounding print also emits a trailing "None" line.
print(df.info())
summary_stats = df.describe()
print("\n", summary_stats)

# Number of missing values in each column
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Class distribution: count once, then reuse for both the table and the chart
label_counts = df['label'].value_counts()
print("\nClass distribution:")
print(label_counts)

# Bar chart of samples per class
plt.figure(figsize=(10, 5))
label_counts.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Text length analysis
# Normalize the column once: missing entries become empty strings and any
# non-string value is converted to str, so len() cannot fail on NaN and both
# metrics are derived from exactly the same text.
text_as_str = df['text'].fillna('').astype(str)
df['text_length'] = text_as_str.str.len()
# Whitespace-delimited token count per row (empty text -> 0 words).
df['word_count'] = text_as_str.str.split().str.len()

# Side-by-side histograms: characters per sample (left), words per sample (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
df['text_length'].hist(bins=50, ax=axes[0])
axes[0].set_title('Character Length Distribution')
df['word_count'].hist(bins=50, ax=axes[1])
axes[1].set_title('Word Count Distribution')
plt.show()

# 3. DATA PREPROCESSING

## 3.1 Text Cleaning Function

In [None]:
# Patterns are compiled once here instead of on every clean_text call.
# `http\S+` already matches everything `https\S+` would, so the redundant
# alternative from the original pattern is dropped without changing matches.
_URL_RE = re.compile(r'http\S+|www\S+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Clean and normalize a raw text sample.

    Steps, in order: lowercase, remove URLs, remove @mentions and #hashtags,
    remove every character that is not an ASCII letter or whitespace (digits,
    punctuation and accented letters are dropped), then collapse runs of
    whitespace and trim the ends.

    Args:
        text: Raw text. ``None`` and NaN (pandas' missing-value marker) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Missing values: `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_HASHTAG_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Run clean_text over every raw sample and keep the result in a new column,
# leaving the original 'text' column untouched.
raw_text_column = df['text']
df['cleaned_text'] = raw_text_column.apply(clean_text)

## 3.2 Tokenization & Lemmatization

In [None]:
# Shared NLTK resources, built once and reused by every preprocess_text call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Tokenize text, drop stopwords and very short tokens, lemmatize the rest.

    Args:
        text: String to process (the notebook passes clean_text output).

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        # Discard English stopwords and tokens of one or two characters.
        if token in stop_words or len(token) <= 2:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

In [None]:
# Tokenize/lemmatize the cleaned column, then preview the raw, cleaned and
# processed versions side by side.
cleaned_column = df['cleaned_text']
df['processed_text'] = cleaned_column.apply(preprocess_text)
preview_columns = ['text', 'cleaned_text', 'processed_text']
df[preview_columns].head()

# 4. FEATURE ENGINEERING

## 4.1 Text Vectorization (Choose One)

In [None]:
# Option 1: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram features, vocabulary capped at CONFIG['max_features'] terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=CONFIG['max_features'],
)

# Targets come straight from the 'label' column; features from the processed text.
y = df['label']
X = tfidf.fit_transform(df['processed_text'])

feature_matrix_shape = X.shape
print(f"Feature matrix shape: {feature_matrix_shape}")

In [None]:
# Option 2: Word Embeddings (Word2Vec)
# from gensim.models import Word2Vec
# 
# sentences = [text.split() for text in df['processed_text']]
# w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

In [None]:
# Option 3: Transformer Tokenization (BERT, RoBERTa, etc.)
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# encodings = tokenizer(df['processed_text'].tolist(), truncation=True, 
#                       padding=True, max_length=CONFIG['max_len'])

# 5. DATA SPLITTING

In [None]:
# Split the processed text first and only then fit TF-IDF, using the training
# portion alone, so the vocabulary and IDF weights are not learned from the
# held-out test documents (avoids train/test leakage).
# NOTE: this assumes the TF-IDF option from section 4.1; if you switch to a
# different feature representation, split those features here instead.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=CONFIG['test_size'],
    random_state=CONFIG['random_state'], stratify=y
)

# Re-fit the vectorizer on training text; the test text is only transformed.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# 6. MODEL BUILDING

## 6.1 Baseline Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Baseline 1: Multinomial Naive Bayes with default settings.
# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)

# Baseline 2: Logistic Regression, seeded from CONFIG and allowed up to
# 1000 solver iterations.
lr_model = LogisticRegression(
    random_state=CONFIG['random_state'],
    max_iter=1000,
).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

## 6.2 Deep Learning Model (Optional)

In [None]:
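# NOTE: this sketch cannot be run on the TF-IDF features built above. The
# Embedding layer expects integer-encoded token sequences padded to
# CONFIG['max_len'], 'categorical_crossentropy' expects one-hot encoded labels,
# and `num_classes` must be defined first (e.g. num_classes = df['label'].nunique()).
# from tensorflow.keras.models import Sequential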
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
# 
# model = Sequential([
#     Embedding(input_dim=CONFIG['max_features'], output_dim=128, input_length=CONFIG['max_len']),
#     Bidirectional(LSTM(64, return_sequences=True)),
#     Dropout(0.5),
#     Bidirectional(LSTM(32)),
#     Dense(64, activation='relu'),
#     Dropout(0.5),
#     Dense(num_classes, activation='softmax')
# ])
# 
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# history = model.fit(X_train, y_train, validation_split=0.2, 
#                     epochs=CONFIG['epochs'], batch_size=CONFIG['batch_size'])

# 7. MODEL EVALUATION

In [None]:
# Select the model (and its test-set predictions) that the remaining
# sections evaluate, interpret and save.
best_model, y_pred = lr_model, lr_pred

# Per-class precision / recall / F1 on the test set
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix rendered as an annotated heatmap with integer cell counts
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## 7.1 Cross-Validation

In [None]:
# 5-fold cross-validation of the selected model on the training split only,
# scored with the estimator's default scorer.
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
mean_score, score_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {mean_score:.4f} (+/- {score_std:.4f})")

# 8. MODEL INTERPRETATION

In [None]:
# Feature importance (for logistic regression with TF-IDF)
# Only models exposing a `coef_` attribute are inspected; others are skipped.
if hasattr(best_model, 'coef_'):
    feature_names = tfidf.get_feature_names_out()
    # Only the first row of the coefficient matrix is examined.
    weights = best_model.coef_[0]
    # Indices sorted by ascending weight: most negative first, most positive last.
    ranking = np.argsort(weights)
    top_positive_features = ranking[-10:]
    top_negative_features = ranking[:10]

    print("Top positive features:")
    for idx in top_positive_features:
        print(f"{feature_names[idx]}: {weights[idx]:.4f}")

    print("\nTop negative features:")
    for idx in top_negative_features:
        print(f"{feature_names[idx]}: {weights[idx]:.4f}")

# 9. PREDICTION PIPELINE

In [None]:
def predict_text(text, model=best_model, vectorizer=tfidf):
    """Classify a single raw text with the fitted pipeline.

    The text goes through the same steps used for training: clean_text,
    preprocess_text, then the vectorizer's transform.

    Args:
        text: Raw input text.
        model: Fitted classifier providing predict and predict_proba.
            The default is whatever `best_model` referred to when this
            function was defined.
        vectorizer: Fitted vectorizer providing transform. The default is
            the `tfidf` object that existed when this function was defined.

    Returns:
        A (prediction, probability) tuple: the first element of
        model.predict's output and the first row of model.predict_proba's
        output for this text.
    """
    features = vectorizer.transform([preprocess_text(clean_text(text))])
    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0]
    return prediction, probability

# Smoke-test the prediction pipeline on one hand-written example.
sample_text = "This is a sample text to classify"
pred, proba = predict_text(text=sample_text)
for line in (f"Prediction: {pred}", f"Probabilities: {proba}"):
    print(line)

# 10. MODEL SAVING & DEPLOYMENT

In [None]:
import joblib

# Persist the fitted classifier together with the fitted vectorizer;
# predict_text needs both objects to score new text.
model_path = 'text_classifier_model.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(best_model, model_path)
joblib.dump(tfidf, vectorizer_path)

# Load model and vectorizer (joblib files are pickles: only load files from a trusted source)
# loaded_model = joblib.load('text_classifier_model.pkl')
# loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# 11. CONCLUSIONS & NEXT STEPS

## Summary:
- Dataset size: X samples
- Best model: [Model Name]
- Test accuracy: X.XX%
- Key findings: [Your insights]

## Next Steps:
- [ ] Try different preprocessing techniques
- [ ] Experiment with advanced models (BERT, GPT)
- [ ] Collect more training data
- [ ] Implement data augmentation
- [ ] Deploy to production
- [ ] Monitor model performance