In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

sentence="hi my name is shakil,now i am studying at diu"

import nltk
nltk.download('punkt')
nltk.download('stopwords')
# Download the 'punkt_tab' resource
nltk.download('punkt_tab') # This line is added to download the missing resource.

# Tokenize the lowercased sentence into words and punctuation marks.
tokenize = word_tokenize(sentence.lower())

# Keep the stopword set under its own name. Rebinding `stopwords` would replace
# the imported corpus reader with a plain set, and any later call to
# `stopwords.words(...)` in the same session would fail.
english_stopwords = set(stopwords.words('english'))
stopwordfilter = [word for word in tokenize if word not in english_stopwords]

# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemming = [stemmer.stem(word) for word in stopwordfilter]

print(tokenize)
print(stopwordfilter)
print(stemming)

['hi', 'my', 'name', 'is', 'shakil', ',', 'now', 'i', 'am', 'studying', 'at', 'diu']
['hi', 'name', 'shakil', ',', 'studying', 'diu']
['hi', 'name', 'shakil', ',', 'studi', 'diu']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Let's break down your **text preprocessing code** step by step so you fully understand what each line does. This is a very common and useful function in NLP to clean raw text before analysis.

---

### 🔢 Full Code:

```python
stemmer = PorterStemmer()
stops = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = [
        stemmer.stem(tok)
        for tok in text.split()
        if tok not in stops
    ]
    return " ".join(tokens)

docs_clean = [preprocess(d) for d in docs]
```

---

## ✅ Line-by-Line Explanation

---

### 🔸 `stemmer = PorterStemmer()`

* Creates a **PorterStemmer object**.
* The **Porter Stemmer** is a tool that **reduces words to their root form**.

  * Example: `"running"` → `"run"`, `"flies"` → `"fli"`

---

### 🔸 `stops = set(stopwords.words('english'))`

* Loads **English stopwords** from `nltk.corpus.stopwords`.
* Converts them into a `set` for **fast lookup**.
* Stopwords are common words like `"is"`, `"the"`, `"and"` — they usually **don't add meaning** and are often removed.

---

### 🔸 `def preprocess(text):`

* This defines a function called `preprocess` that takes in raw `text` and returns cleaned text.

---

### 1️⃣ `text = text.lower()`

* Converts the entire text to **lowercase** to make analysis **case-insensitive**.
* Example: `"Hello World"` → `"hello world"`

---

### 2️⃣ `text = re.sub(r"[^\w\s]", "", text)`

* Removes **punctuation** using a **regular expression**.
* `r"[^\w\s]"` matches any character that is **not a word character** or **whitespace**.
* Example: `"Hello, world!"` → `"Hello world"`

---

### 3️⃣ Tokenize + Remove Stopwords + Stem:

```python
tokens = [
    stemmer.stem(tok)
    for tok in text.split()
    if tok not in stops
]
```

* **`text.split()`** splits the sentence into **individual words (tokens)** on any run of whitespace (spaces, tabs, newlines).
* `if tok not in stops`: removes stopwords.
* `stemmer.stem(tok)`: applies stemming to each word.

📌 Example:

```python
text = "Cats are running in the garden."
# After lowercasing + punctuation removal → "cats are running in the garden"
# After stopword removal → ['cats', 'running', 'garden']
# After stemming → ['cat', 'run', 'garden']
```

---

### 4️⃣ `return " ".join(tokens)`

* Joins the cleaned tokens back into a single string.

---

### 🔁 `docs_clean = [preprocess(d) for d in docs]`

* Applies the `preprocess()` function to **every document** in the list `docs`.
* Returns a list of cleaned text strings.

---

### 🧠 Summary of Steps in Preprocessing:

| Step               | Purpose                     |
| ------------------ | --------------------------- |
| Lowercase          | Make text uniform           |
| Remove punctuation | Eliminate noise             |
| Tokenize           | Break into words            |
| Remove stopwords   | Eliminate unimportant words |
| Stem               | Reduce to word roots        |
| Rejoin tokens      | Return cleaned sentence     |

---

Would you like a diagram or want to see this function applied to a real example text?


In [None]:
import re #re (Regular Expressions) in NLP Preprocessing?
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# 1. Sample corpus + labels (for a simple sentiment task)
docs = [
    "I loved the new Batman movie!",
    "The plot was boring and too long.",
    "What a fantastic performance by the lead actors.",
    "I hated the ending, it was terrible."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative


# 2. Text Preprocessing
# Shared helpers, built once and reused by every preprocess() call.
stemmer = PorterStemmer()
stops = set(stopwords.words('english'))


def preprocess(text):
    """Return ``text`` as a space-joined string of stemmed, non-stopword tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, the result is split on whitespace, tokens found
    in the module-level ``stops`` set are dropped, and the remaining tokens
    are stemmed with the module-level ``stemmer``.
    """
    normalized = re.sub(r"[^\w\s]", "", text.lower())

    stemmed_words = []
    for word in normalized.split():
        if word in stops:
            continue  # stopword: skip it
        stemmed_words.append(stemmer.stem(word))

    return " ".join(stemmed_words)


# Clean every document in the sample corpus.
docs_clean = list(map(preprocess, docs))

In [None]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

text = "hi my name is shakil,now i am studying at diu"

stemm = PorterStemmer()
stop = set(stopwords.words('english'))


def preprocess(text):
    """Return ``text`` lowercased, stripped of punctuation and stopwords, and stemmed.

    Args:
        text: A single raw document (one string).

    Returns:
        The remaining stemmed tokens joined by single spaces.
    """
    text = text.lower()
    # Replace each punctuation character with a space (not with nothing), so
    # that "shakil,now" becomes two tokens instead of fusing into "shakilnow".
    text = re.sub(r"[^\w\s]", " ", text)
    tokens = [stemm.stem(word) for word in text.split() if word not in stop]
    return " ".join(tokens)


# `text` is one document. Iterating over it directly would feed preprocess()
# one character at a time, so the whole sentence is processed as a single
# item in the list.
docs_clean = [preprocess(text)]


Great! Here's an upgraded version of your `preprocess()` function that includes:

✅ **Lowercasing**
✅ **Punctuation removal**
✅ **Stopword removal**
✅ **Tokenization**
✅ **Number removal**
✅ **Extra space cleanup**
✅ **Lemmatization** (instead of stemming — more accurate)

---

### 🧠 Requirements

We’ll use `WordNetLemmatizer`, so we must download a few NLTK resources:

```python
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer tables that word_tokenize loads on recent NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
```

---

### ✅ Full Preprocessing Function

```python
def preprocess(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs (optional for web data)
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Remove punctuation
    text = re.sub(r"[^\w\s]", " ", text)

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return " ".join(lemmatized)
```

---

### ✅ Test It

```python
text = "Hi! My name is Shakil. Now I'm studying at DIU since 2022."
print(preprocess(text))
```

---

### 🔍 Output Example

```text
hi name shakil studying diu since
```

("diu" stays in the output because it is not an English stopword — add it to `stop_words` if you want it removed.)

---

### 🛠 Optional: Add Custom Stopwords

```python
# NOTE: `stop_words` is a local variable inside preprocess(), so this line must
# go in the function body, right after
# `stop_words = set(stopwords.words('english'))`.
stop_words.update(["diu", "name"])
```

---

Let me know if you’d like to apply this to a **list of sentences**, build a **vectorizer**, or feed it into a model next!


Excellent! Let's walk through **Optional Advanced NLP Preprocessing** in detail with working code examples:

---

## 🔹 1. **POS Tagging (Part-of-Speech Tagging)**

We can use POS tagging to keep only **nouns**, **verbs**, etc., if we want to focus on content-heavy words.

### ✅ Example: Keep Only Nouns and Verbs

```python
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger')      # legacy id used by older NLTK releases
nltk.download('averaged_perceptron_tagger_eng')  # id that pos_tag loads on recent NLTK releases

def filter_by_pos(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    # Keep only nouns and verbs (NN*, VB*)
    filtered = [word for word, tag in tagged if tag.startswith("NN") or tag.startswith("VB")]
    return filtered

text = "Shakil is learning natural language processing at DIU."
# Expected output; "is" is normally tagged VBZ, so it passes the VB* filter too.
print(filter_by_pos(text))  # ['Shakil', 'is', 'learning', 'language', 'processing', 'DIU']
```

---

## 🔹 2. **Named Entity Recognition (NER)**

NER identifies **names, places, organizations, dates**, etc.

### ✅ Example: Extract Named Entities

```python
from nltk import ne_chunk
from nltk.tree import Tree

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    chunked = ne_chunk(tagged)

    named_entities = []
    for chunk in chunked:
        if isinstance(chunk, Tree):
            named_entity = " ".join(c[0] for c in chunk)
            named_entities.append(named_entity)
    return named_entities

text = "Shakil studies at Daffodil International University in Dhaka."
print(extract_named_entities(text))  # ['Shakil', 'Daffodil International University', 'Dhaka']
```

You can also **remove** them by filtering these out from your tokens.

---

## 🔹 3. **TF-IDF Vectorization**

TF-IDF (Term Frequency-Inverse Document Frequency) is a technique to convert text into numerical form **based on importance**, not just frequency.

### ✅ Example with `TfidfVectorizer`

```python
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "I love programming in Python",
    "Python is great for natural language processing",
    "Natural language processing is fun"
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Print TF-IDF feature names and values
print(vectorizer.get_feature_names_out())
print(X.toarray())
```

---

### 🧠 Summary Table

| Feature        | Library                   | Use Case                                |
| -------------- | ------------------------- | --------------------------------------- |
| POS Tagging    | `nltk.pos_tag()`          | Keep only nouns/verbs/etc.              |
| Named Entities | `nltk.ne_chunk()`         | Extract or remove proper names          |
| TF-IDF Vector  | `sklearn.feature_extraction.text.TfidfVectorizer` | Convert clean text into numeric vectors |

---

Would you like me to **combine these in one full pipeline**, from text to vector form, ready for machine learning?


In [None]:
import nltk
from nltk import pos_tag, word_tokenize, ne_chunk, CFG, ChartParser
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK models.
# Recent NLTK releases load the "_tab" / "_eng" resource variants, while older
# releases load the legacy ids. Both are requested so the cell runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

# -------- 1. POS Tagging: Keep Only Nouns and Verbs -------- #
def filter_by_pos(text):
    """Return the tokens of ``text`` whose POS tag starts with "NN" or "VB".

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    tokens are returned in their original order.
    """
    kept = []
    for token, pos in pos_tag(word_tokenize(text)):
        if pos.startswith(("NN", "VB")):
            kept.append(token)
    return kept

# -------- 2. Named Entity Recognition (NER) -------- #
def extract_named_entities(text):
    """Return the chunks that ``ne_chunk`` groups into subtrees, as strings.

    The text is tokenized, POS-tagged and chunked. Each top-level ``Tree``
    node in the chunker output becomes one string made of its words joined
    by single spaces; top-level nodes that are not ``Tree`` instances are
    skipped.
    """
    chunk_tree = ne_chunk(pos_tag(word_tokenize(text)))
    return [
        " ".join(leaf[0] for leaf in node)
        for node in chunk_tree
        if isinstance(node, Tree)
    ]

# -------- 3. Constituency Parsing (Simple Grammar Example) -------- #
def parse_sentence(text):
    """Parse ``text`` with a small hard-coded grammar and return the first tree.

    The first parse tree produced by the chart parser is pretty-printed and
    returned. When the parser produces no tree, nothing is printed and
    ``None`` is returned.
    """
    # Toy grammar: it only knows the terminals listed below.
    toy_grammar = CFG.fromstring("""
        S -> NP VP
        NP -> Det N | N
        VP -> V NP | V
        Det -> 'the' | 'a'
        N -> 'Shakil' | 'language' | 'processing' | 'university'
        V -> 'studies' | 'is'
    """)
    chart_parser = ChartParser(toy_grammar)

    words = word_tokenize(text)
    # Only the first parse is needed, so take one item from the parse results.
    first_tree = next(iter(chart_parser.parse(words)), None)
    if first_tree is None:
        return None

    first_tree.pretty_print()
    return first_tree

# -------- 4. TF-IDF Vectorization -------- #
def tfidf_vectorize(corpus):
    """Fit a default ``TfidfVectorizer`` on ``corpus`` and return the result.

    Prints the learned feature names and the dense form of the TF-IDF matrix,
    then returns the matrix produced by ``fit_transform`` (not the dense
    copy).
    """
    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(corpus)

    feature_names = tfidf.get_feature_names_out()
    print("Features:", feature_names)

    dense = matrix.toarray()
    print("TF-IDF Matrix:\n", dense)

    return matrix

# ---------- Example Usage ---------- #
# Runs each helper defined above in turn and prints its result under a heading.
text = "Shakil studies at Daffodil International University in Dhaka."

print("\n1️⃣ POS Filtered Tokens:")
print(filter_by_pos(text))

print("\n2️⃣ Named Entities:")
print(extract_named_entities(text))

print("\n3️⃣ Parse Tree:")
# A shorter sentence is used here instead of `text`: the grammar inside
# parse_sentence() has no terminals for words such as 'at' or 'Dhaka'.
parse_sentence("Shakil studies language")  # Keep this simple to match the grammar

print("\n4️⃣ TF-IDF Vectorization:")
# Three short documents; tfidf_vectorize() prints the features and the matrix itself.
corpus = [
    "Shakil studies language",
    "Natural language processing is amazing",
    "Shakil loves Python"
]
tfidf_vectorize(corpus)
