# Stemming 

In [1]:
import nltk

### 1. PorterStemmer

In [2]:
from nltk.stem import PorterStemmer

### Creating an object of PorterStemmer

In [4]:
# Build one Porter stemmer and reuse it for every word in the sample list.
p_stemmer = PorterStemmer()
words = [
    'Hello', 'Printer', 'Printing', 'Print',
    'Portfolio', 'Run', 'Runner', 'Running',
]
# Show each original word next to its stem; note the stems come back lower-cased.
for word in words:
    print(f"{word}------->{p_stemmer.stem(word)}")

Hello------->hello
Printer------->printer
Printing------->print
Print------->print
Portfolio------->portfolio
Run------->run
Runner------->runner
Running------->run


### Why stemming has some issues

In [7]:
# Porter strips the suffix by rule, leaving 'congratul', which is not a real word.
p_stemmer.stem('congratulations')

'congratul'

In [8]:
# Second example of an over-trimmed result: the stem returned is 'celebr'.
p_stemmer.stem('celebrations')

'celebr'

#### Stemming only strips suffixes by rule, so the result is not always a real word: `congratul` and `celebr` above are not meaningful stems.

### 2. RegexpStemmer

In [9]:
from nltk.stem import RegexpStemmer

In [40]:
words = [
    'Hello', 'Printer', 'Printing', 'Print', 'Prints',
    'Portfolio', 'Run', 'Runner', 'Running', 'Runs',
]
# Each alternative is anchored with `$`, so only a trailing suffix is removed.
pattern = "ing$|s$|e$|er$|ner$|ning$"
# Per the NLTK docs, `min` is the minimum word length to stem; shorter words are returned unchanged.
reg_stem = RegexpStemmer(pattern, min=4)
# Unlike Porter, this stemmer keeps the original casing of the word.
for word in words:
    print(word, reg_stem.stem(word), sep='----->')

Hello----->Hello
Printer----->Print
Printing----->Print
Print----->Print
Prints----->Print
Portfolio----->Portfolio
Run----->Run
Runner----->Run
Running----->Run
Runs----->Run


## Regex (Regular Expression)

In [11]:
import re

##### A regex operation needs two things: a pattern (what to look for) and a text (where to look)

In [15]:
# `pattern` is the regex to look for; here it is just the literal word "very".
pattern = "very"
# `text` is the multi-line sample passage scanned by the regex calls in the following cells.
# It starts with a space and keeps its line breaks, so match offsets count those characters too.
text = ''' Simple Model (High Bias, Low Variance): A straight line might miss the true curve of the data, 
but it will not change much with different data sets.
Complex Model (Low Bias, High Variance): A very wiggly line might fit the data points exactly, 
but small changes in the data can lead to a completely different wiggly line.
Real-World Analogy
Think of bias and variance in terms of studying for an exam:

High Bias (Simple Model): If you only study a few basic concepts, your answers will be consistently wrong for complex questions (underfitting),
but you'll give similar answers every time (low variance).
High Variance (Complex Model): If you try to memorize every possible detail and example, you might do well on the practice tests (low bias), 
but your performance will vary a lot depending on the exact questions you get on the real exam (high variance).
'''

#### re.search() -> returns the first occurrence of the pattern in the text, or `None` if the pattern is not present

In [18]:
# Compile the pattern, then scan the text; only the first hit is returned.
match = re.compile(pattern).search(text)
# Leave the Match object as the last expression so the notebook displays its span and matched text.
match

<re.Match object; span=(194, 198), match='very'>

In [20]:
# Character class: any single uppercase letter A-Z followed by the literal "omplex".
pattern = r"[A-Z]omplex"
# The first hit in the text is displayed as the cell's result.
re.compile(pattern).search(text)

<re.Match object; span=(151, 158), match='Complex'>

#### re.finditer() -> returns an iterator over all occurrences of the pattern

In [30]:
# Matches exactly one uppercase letter in the range A-Z.
pattern = r"[A-Z]"

In [31]:
# finditer() yields one Match object per occurrence, in left-to-right order.
matches = re.compile(pattern).finditer(text)
for hit in matches:
    print(hit)

<re.Match object; span=(1, 2), match='S'>
<re.Match object; span=(8, 9), match='M'>
<re.Match object; span=(15, 16), match='H'>
<re.Match object; span=(20, 21), match='B'>
<re.Match object; span=(26, 27), match='L'>
<re.Match object; span=(30, 31), match='V'>
<re.Match object; span=(41, 42), match='A'>
<re.Match object; span=(151, 152), match='C'>
<re.Match object; span=(159, 160), match='M'>
<re.Match object; span=(166, 167), match='L'>
<re.Match object; span=(170, 171), match='B'>
<re.Match object; span=(176, 177), match='H'>
<re.Match object; span=(181, 182), match='V'>
<re.Match object; span=(192, 193), match='A'>
<re.Match object; span=(325, 326), match='R'>
<re.Match object; span=(330, 331), match='W'>
<re.Match object; span=(336, 337), match='A'>
<re.Match object; span=(344, 345), match='T'>
<re.Match object; span=(406, 407), match='H'>
<re.Match object; span=(411, 412), match='B'>
<re.Match object; span=(417, 418), match='S'>
<re.Match object; span=(424, 425), match='M'>
<re.Ma

In [33]:
# One uppercase letter followed by zero or more lowercase letters, i.e. each capitalised word.
# A lone capital such as "A" still matches because `*` allows zero repetitions.
pattern = r"[A-Z][a-z]*"
matches = re.compile(pattern).finditer(text)
for hit in matches:
    print(hit)

<re.Match object; span=(1, 7), match='Simple'>
<re.Match object; span=(8, 13), match='Model'>
<re.Match object; span=(15, 19), match='High'>
<re.Match object; span=(20, 24), match='Bias'>
<re.Match object; span=(26, 29), match='Low'>
<re.Match object; span=(30, 38), match='Variance'>
<re.Match object; span=(41, 42), match='A'>
<re.Match object; span=(151, 158), match='Complex'>
<re.Match object; span=(159, 164), match='Model'>
<re.Match object; span=(166, 169), match='Low'>
<re.Match object; span=(170, 174), match='Bias'>
<re.Match object; span=(176, 180), match='High'>
<re.Match object; span=(181, 189), match='Variance'>
<re.Match object; span=(192, 193), match='A'>
<re.Match object; span=(325, 329), match='Real'>
<re.Match object; span=(330, 335), match='World'>
<re.Match object; span=(336, 343), match='Analogy'>
<re.Match object; span=(344, 349), match='Think'>
<re.Match object; span=(406, 410), match='High'>
<re.Match object; span=(411, 415), match='Bias'>
<re.Match object; span=(4

### 3. SnowballStemmer -> more powerful than PorterStemmer

In [47]:
from nltk.stem import SnowballStemmer
# Snowball stemmers are language-specific; request the English one.
sb_stemmer = SnowballStemmer("english")
# Reuses the `words` list defined in the RegexpStemmer cell.
for word in words:
    print(f"{word}----------->{sb_stemmer.stem(word)}")

Hello----------->hello
Printer----------->printer
Printing----------->print
Print----------->print
Prints----------->print
Portfolio----------->portfolio
Run----------->run
Runner----------->runner
Running----------->run
Runs----------->run


In [48]:
# Snowball also returns the non-word stem 'congratul' for this input.
sb_stemmer.stem('Congratulations')

'congratul'

#### Even SnowballStemmer has the same issue: it still produces non-word stems such as `congratul`