In [None]:
# ✅ TfidfVectorizer() – Short Explanation
# TfidfVectorizer is a tool from Scikit-learn used to convert a collection of text documents into numerical feature vectors using TF-IDF weighting.


import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from collections import Counter

# 1. Sample corpus + labels (for a simple sentiment task)
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# 2. Text Preprocessing
stemmer = PorterStemmer()
# The stopword corpus is a separate NLTK download. Fetch it only when the
# lookup fails, so a fresh environment works and an already-provisioned one
# never touches the network.
try:
    stops = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stops = set(stopwords.words('english'))

def preprocess(text):
    """Normalise a raw sentence into a space-joined string of stems.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, tokens found in ``stops`` are
    dropped, and the remaining tokens are stemmed with ``stemmer``.
    """
    stripped = re.sub(r"[^\w\s]", "", text.lower())
    kept = (word for word in stripped.split() if word not in stops)
    return " ".join(map(stemmer.stem, kept))

docs_clean = [preprocess(d) for d in docs]

# 3. Syntax & Semantics (basic POS tagging + regex-based NER)
import nltk
# Newer NLTK releases look the English tagger up under the "_eng" suffixed
# id, older ones under the plain id; request both so nltk.pos_tag works on
# either. With quiet=True an unavailable id is reported, not raised.
for _tagger_resource in ('averaged_perceptron_tagger',
                         'averaged_perceptron_tagger_eng'):
    nltk.download(_tagger_resource, quiet=True)

def syntax_semantics(text):
    """Return ``(pos_tags, ner)`` for a raw sentence.

    ``pos_tags`` is what ``nltk.pos_tag`` yields for the whitespace-split
    tokens of ``text``. ``ner`` is a deliberately crude stand-in for entity
    recognition: every word made of one uppercase ASCII letter followed by
    at least one lowercase ASCII letter (so a lone "I" is not reported).
    """
    capitalised_word = re.compile(r"\b[A-Z][a-z]+\b")
    tagged = nltk.pos_tag(text.split())
    return tagged, capitalised_word.findall(text)

for orig in docs:
    pos, ner = syntax_semantics(orig)
    print(f"\nORIGINAL: {orig}")
    print(" POS TAGS:", pos)
    print(" NER:", ner)

# 4. Vectorization + 5. Modeling (TF-IDF + Logistic Regression)
# Stratify on the labels: with only four documents an unstratified 50/50
# split can put both examples of one class into the test half, leaving a
# single-class training set that LogisticRegression cannot be fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    docs_clean, labels, test_size=0.5, random_state=42, stratify=labels
)

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression())
])

pipeline.fit(X_train, y_train)

# Evaluate
acc = pipeline.score(X_test, y_test)
print(f"\nTest accuracy: {acc:.2f}")

# Predict on new sentence (must go through the same preprocessing as training)
new = "That movie was absolutely wonderful and thrilling."
new_clean = preprocess(new)
pred = pipeline.predict([new_clean])[0]
print(f"New sentence prediction (1=pos,0=neg): {pred}")


Certainly! Here's how you can integrate **Word2Vec** and **BERT** with a pipeline, similar to how you used **TF-IDF** with `LogisticRegression`.

### 1. **Word2Vec Integration with Pipeline**

We'll create a custom transformer class for Word2Vec, and then integrate it into a Scikit-learn pipeline.

```python
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Sample corpus + labels
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Preprocessing
stemmer = PorterStemmer()
# The stopword corpus is a separate NLTK download; fetch it only when the
# lookup fails so the snippet also runs on a fresh environment.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop = set(stopwords.words('english'))

def preprocess(text):
    """Turn a raw sentence into a list of stemmed, non-stopword tokens.

    The text is lower-cased and stripped of every character that is neither
    a word character nor whitespace before it is split on whitespace.
    """
    stripped = re.sub(r"[^\w\s]", "", text.lower())
    stems = []
    for word in stripped.split():
        if word in stop:
            continue
        stems.append(stemmer.stem(word))
    return stems

docs_clean = [preprocess(d) for d in docs]

# Word2Vec training
# seed + a single worker make the embeddings repeatable from run to run,
# matching the fixed random_state used for the train/test split (full
# determinism across interpreter launches also needs PYTHONHASHSEED set).
# NOTE: the embeddings are fitted on every document, including those that
# later land in the test split, so the reported accuracy is optimistic.
model = Word2Vec(
    sentences=docs_clean,
    vector_size=50,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

# Custom Transformer for Word2Vec embedding conversion
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds token lists with a Word2Vec model.

    Each document (an iterable of tokens) becomes the mean of the vectors of
    its in-vocabulary tokens; a document with no known token becomes a zero
    vector of length ``model.vector_size``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Do nothing: the wrapped model is used as given. Returns ``self``."""
        return self

    def transform(self, X):
        """Stack one document vector per entry of ``X`` into an array."""
        return np.array(list(map(self.document_to_vector, X)))

    def document_to_vector(self, doc):
        """Average the vectors of the tokens of ``doc`` known to the model."""
        keyed_vectors = self.model.wv
        known = [keyed_vectors[token] for token in doc if token in keyed_vectors]
        if not known:
            return np.zeros(self.model.vector_size)
        return np.mean(known, axis=0)

# Train/test split
# Stratify on the labels: with only four documents an unstratified 50/50
# split can leave a single-class training half, which LogisticRegression
# cannot be fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    docs_clean, labels, test_size=0.5, random_state=42, stratify=labels
)

# Create pipeline with Word2Vec and Logistic Regression
pipeline = Pipeline([
    ("word2vec", Word2VecTransformer(model)),
    ("clf", LogisticRegression())
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate
acc = pipeline.score(X_test, y_test)
print(f"Test accuracy with Word2Vec: {acc:.2f}")

# Predict on new sentence (tokenised the same way as the training documents)
new = "That movie was absolutely wonderful and thrilling."
new_clean = preprocess(new)
pred = pipeline.predict([new_clean])[0]
print(f"New sentence prediction (1=pos, 0=neg): {pred}")
```

### 2. **BERT Integration with Pipeline**

Now, let's integrate **BERT** using the Hugging Face `transformers` library into a pipeline. BERT requires tokenization and encoding of text before we can use it for classification. We can create a custom transformer to handle this as well.

```python
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import torch
import numpy as np

# Sample corpus + labels: four short review-style sentences and their
# sentiment classes, aligned by index (docs[i] is labelled labels[i]).
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

def preprocess(text):
    """Return ``text`` lower-cased; the only normalisation applied for BERT."""
    lowered = text.lower()
    return lowered

# Tokenization and BERT encoding
# Both objects are loaded from the 'bert-base-uncased' checkpoint.
# NOTE(review): this is a pre-training checkpoint, so the classification head
# that BertForSequenceClassification adds is presumably freshly initialised
# rather than trained; its logits are not sentiment features. Confirm against
# the Hugging Face load-time warning.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

class BertTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw strings into BERT features.

    Each document is encoded on its own and represented by the final-layer
    hidden state of its first token ([CLS]). The encoder's hidden state is
    used instead of ``outputs.logits`` because the logits come from a
    classification head that has not been fine-tuned, so they carry no
    usable signal for the downstream classifier.

    Parameters
    ----------
    model : a Hugging Face BERT model that accepts ``output_hidden_states``
        (for example ``BertModel`` or ``BertForSequenceClassification``).
    tokenizer : the matching Hugging Face tokenizer.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """Do nothing: the pre-trained model is used as given. Returns ``self``."""
        return self

    def transform(self, X):
        """Encode every string in ``X``; returns an array with one row per document."""
        return np.array([self.encode_with_bert(doc) for doc in X])

    def encode_with_bert(self, text):
        """Return the [CLS] embedding of a single string as a 1-D NumPy array."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)
        # hidden_states[-1] is the last layer, shaped (batch=1, seq_len, hidden);
        # position 0 along seq_len is the [CLS] token.
        return outputs.hidden_states[-1][0, 0].cpu().numpy()

# Train/test split
# Stratify on the labels: with only four documents an unstratified 50/50
# split can leave a single-class training half, which LogisticRegression
# cannot be fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    docs, labels, test_size=0.5, random_state=42, stratify=labels
)

# Create pipeline with BERT and Logistic Regression
pipeline = Pipeline([
    ("bert", BertTransformer(model, tokenizer)),
    ("clf", LogisticRegression())
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate
acc = pipeline.score(X_test, y_test)
print(f"Test accuracy with BERT: {acc:.2f}")

# Predict on new sentence
new = "That movie was absolutely wonderful and thrilling."
new_clean = preprocess(new)
pred = pipeline.predict([new_clean])[0]
print(f"New sentence prediction (1=pos, 0=neg): {pred}")
```

### **Explanation**:

* **Word2Vec**: We're using a custom transformer to convert the documents into vectors by averaging word vectors from Word2Vec.
* **BERT**: Similarly, we created a custom transformer that uses BERT to encode each document; Logistic Regression then classifies the encoded vectors.

---

### 🧩 What This Achieves:

* **Word2Vec**: Captures semantic word relationships, ideal for simpler NLP tasks.
* **BERT**: Leverages state-of-the-art, context-aware embeddings for more accurate predictions.

Would you like to test the pipeline with different types of text or need any further adjustments?


Sure! Let's integrate **Word2Vec** and **Transformers** into your pipeline for advanced vectorization. I'll show you how to replace **TF-IDF** with these methods and update the rest of the pipeline accordingly.

### 1. **Word2Vec Integration**

Word2Vec provides dense vector representations of words, capturing semantic meanings. You can replace TF-IDF with Word2Vec, but we'll need to use **Gensim** for Word2Vec.

### **Steps**:

* Use **Word2Vec** to convert each word into a vector.
* Average the word vectors of each document to represent the entire document.

#### Word2Vec Example with your Sentiment Analysis Pipeline

First, let's modify your pipeline to use Word2Vec.

```python
import gensim
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer

# Sample corpus and labels (sentiment task)
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Preprocessing
stemmer = PorterStemmer()
# The stopword corpus is a separate NLTK download; fetch it only when the
# lookup fails so the snippet also runs on a fresh environment.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop = set(stopwords.words('english'))

def preprocess(text):
    """Turn a raw sentence into a list of stemmed, non-stopword tokens.

    Punctuation (anything that is neither a word character nor whitespace)
    is removed from the lower-cased text before splitting on whitespace.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    stems = []
    for word in without_punctuation.split():
        if word in stop:
            continue
        stems.append(stemmer.stem(word))
    return stems

docs_clean = [preprocess(d) for d in docs]

# Word2Vec training
# Flat list of every token in the corpus. Word2Vec itself is trained on the
# per-document token lists, not on this flattened list.
all_tokens = [token for doc in docs_clean for token in doc]

# Create Word2Vec model
# seed + a single worker make the embeddings repeatable from run to run,
# matching the fixed random_state used for the train/test split (full
# determinism across interpreter launches also needs PYTHONHASHSEED set).
# NOTE: the embeddings are fitted on every document, including those that
# later land in the test split, so the reported accuracy is optimistic.
model = gensim.models.Word2Vec(
    sentences=docs_clean,
    vector_size=50,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

def document_to_vector(doc):
    """Represent a tokenised document as the mean of its word vectors.

    Tokens missing from the module-level ``model`` vocabulary are ignored;
    if no token is known, a zero vector of length ``model.vector_size`` is
    returned instead.
    """
    vocabulary = model.wv
    known = [vocabulary[token] for token in doc if token in vocabulary]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Convert documents to vectors
X = np.array([document_to_vector(doc) for doc in docs_clean])

# Train/test split
# Stratify on the labels: with only four documents an unstratified 50/50
# split can leave a single-class training half, which LogisticRegression
# cannot be fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.5, random_state=42, stratify=labels
)

# Train the Logistic Regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate
acc = clf.score(X_test, y_test)
print(f"Test accuracy with Word2Vec: {acc:.2f}")

# Predict on new sentence (same preprocessing and averaging as the corpus)
new = "That movie was absolutely wonderful and thrilling."
new_clean = preprocess(new)
new_vector = document_to_vector(new_clean)
pred = clf.predict([new_vector])[0]
print(f"New sentence prediction (1=pos,0=neg): {pred}")
```

---

### 2. **Transformer Integration (BERT)**

BERT (Bidirectional Encoder Representations from Transformers) is more powerful and context-aware. We can use the **Hugging Face `transformers` library** to integrate a pre-trained BERT model for encoding.

#### Steps for BERT:

* Load a pre-trained BERT model from Hugging Face.
* Tokenize input text and convert it into BERT’s input format.
* Use BERT to generate embeddings for the input text.

### **Example using BERT for Sentiment Classification**:

```python
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.pipeline import Pipeline
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Sample corpus + labels: four short review-style sentences and their
# sentiment classes, aligned by index (docs[i] is labelled labels[i]).
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

def preprocess(text):
    """Return ``text`` lower-cased; no other normalisation is applied."""
    lowered = text.lower()
    return lowered

# Use BERT Tokenizer to convert text into input format for BERT
# Both objects are loaded from the 'bert-base-uncased' checkpoint.
# NOTE(review): this is a pre-training checkpoint, so the classification head
# that BertForSequenceClassification adds is presumably freshly initialised
# rather than trained; its logits are not sentiment features. Confirm against
# the Hugging Face load-time warning.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

def encode_with_bert(text):
    """Return the BERT [CLS] embedding of a single string as a 1-D array.

    Uses the module-level ``tokenizer`` and ``model``. The final-layer
    hidden state of the first token is returned instead of
    ``outputs.logits``: the logits come from a classification head that has
    not been fine-tuned, so they are not meaningful features for the
    downstream classifier.

    ``text`` is expected to be one string; only the first sequence of the
    encoded batch is returned.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # hidden_states[-1] is the last layer, shaped (batch=1, seq_len, hidden);
    # position 0 along seq_len is the [CLS] token.
    return outputs.hidden_states[-1][0, 0].cpu().numpy()

# Convert documents to BERT embeddings
X = np.array([encode_with_bert(doc) for doc in docs])

# Train/test split
# Stratify on the labels: with only four documents an unstratified 50/50
# split can leave a single-class training half, which LogisticRegression
# cannot be fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.5, random_state=42, stratify=labels
)

# Train the Logistic Regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate
acc = clf.score(X_test, y_test)
print(f"Test accuracy with BERT: {acc:.2f}")

# Predict on new sentence
new = "That movie was absolutely wonderful and thrilling."
new_clean = preprocess(new)
new_vector = encode_with_bert(new_clean)
pred = clf.predict([new_vector])[0]
print(f"New sentence prediction (1=pos,0=neg): {pred}")
```

---

### 🧩 Comparison Summary:

| Vectorization Type | Strengths                                | Use Case                                            |
| ------------------ | ---------------------------------------- | --------------------------------------------------- |
| **Word2Vec**       | Captures semantic meaning of words.      | Simple NLP tasks; on small datasets, prefer pre-trained vectors over training from scratch. |
| **BERT**           | Context-aware embeddings. Very powerful. | Complex NLP tasks, large datasets, better accuracy. |

---

Would you like help setting up a specific part of the pipeline or need more details on any of these vectorizers?
