## NLP Pipeline
Overview of the full text-processing pipeline: text input → preprocessing → vectorization → model → output.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample dataset: two positive (label 1) and two negative (label 0) sentences.
df = pd.DataFrame({
    'text': ["I am happy", "This is bad", "I love it", "I hate it"],
    'label': [1, 0, 1, 0]
})

# Stratify on the label. With only four rows, an unstratified 50/50 split can
# place both examples of one class in the training half, and LogisticRegression
# cannot be fitted on single-class data. Stratifying guarantees one example of
# each class on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.5,
    random_state=42,
    stratify=df['label'],
)

# Learn the vocabulary and IDF weights from the training text only, then reuse
# them for the test text so no information from the test split leaks in.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# With a two-row test set a class may never be predicted; zero_division=0
# reports those undefined scores as 0 without emitting a warning.
print(classification_report(y_test, y_pred, zero_division=0))

## Text Preprocessing
Steps include: tokenization, lowercasing, punctuation removal, stopword removal, stemming, and lemmatization. The example below applies all of these except stemming.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below. Recent NLTK releases (3.9+) make
# word_tokenize load 'punkt_tab' instead of 'punkt', so both are requested to
# keep this cell working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

text = "Cats are running, dogs are barking."

# Lowercase first so the stopword comparison below is case-insensitive.
tokens = word_tokenize(text.lower())

# A set gives constant-time membership checks inside the comprehension.
stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens (drops punctuation) that are not stopwords.
filtered = [t for t in tokens if t.isalpha() and t not in stop_words]

# NOTE: lemmatize() is called without a part-of-speech tag, so every token is
# treated with the lemmatizer's default POS (noun, per the NLTK docs).
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in filtered]
print(lemmas)

## Bag of Words & TF-IDF
Vectorizing text with Bag of Words (BoW) and TF-IDF in scikit-learn, then comparing the sparsity and features of the resulting matrices.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Three short documents that serve as the corpus for both representations.
docs = ["I love NLP", "NLP is fun", "I hate bugs"]

# Bag of Words: learn the vocabulary and count term occurrences per document.
cv = CountVectorizer()
bow = cv.fit_transform(docs)
vocabulary = cv.get_feature_names_out()
dense_counts = bow.toarray()
print("BoW features:", vocabulary, dense_counts)

# TF-IDF: re-weight the raw count matrix computed above.
tfidf = TfidfTransformer()
tfidf_vec = tfidf.fit(bow).transform(bow)
dense_tfidf = tfidf_vec.toarray()
print("TF-IDF array:\n", dense_tfidf)

## Word2Vec
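Training CBOW and Skip-Gram models using Gensim or PyTorch. Visualizing embeddings. The example below trains a CBOW model with Gensim (the default, `sg=0`); pass `sg=1` to train Skip-Gram instead.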

In [None]:
from gensim.models import Word2Vec

# Toy corpus of pre-tokenized sentences.
sentences = [["I", "love", "NLP"], ["NLP", "is", "fun"], ["we", "love", "learning"]]

# Lowercase every token before training. Vocabulary lookups are case-sensitive,
# so training on "NLP" and then querying "nlp" would raise KeyError.
sentences = [[token.lower() for token in sentence] for sentence in sentences]

# min_count=1 keeps every word; with a corpus this small, a higher threshold
# would discard most of the vocabulary.
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, epochs=50)

# Print the learned 50-dimensional vector for "nlp".
print(model.wv["nlp"])

## Text Classification
Using logistic regression or a neural network on BoW/TF-IDF vectors or embeddings. The example below uses a Multinomial Naive Bayes classifier on TF-IDF features.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Binary task: distinguish posts from two of the 20 Newsgroups categories.
cats = ['alt.atheism', 'soc.religion.christian']

# Both splits are loaded with identical options; only the subset differs.
fetch_options = dict(categories=cats, shuffle=True, random_state=42)
data = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# Chain vectorization and classification so raw text can be passed straight in.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
pred = pipeline.fit(data.data, data.target).predict(data_test.data)

report = classification_report(data_test.target, pred)
print(report)