## Tokenizing

In [11]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
data = pd.read_csv("Sample-data.csv")
first_column = data.iloc[:, 0]
first_column = first_column.astype(str)
combined_text = " ".join(first_column)
tokens = [word for word in word_tokenize(combined_text) if word.isalpha()]
tokens = tokens[:5000]
print(tokens)

['The', 'older', 'you', 'get', 'the', 'less', 'people', 'you', 'trust', 'i', 'dont', 'have', 'time', 'to', 'hate', 'anyone', 'ill', 'just', 'forget', 'about', 'you', 'Never', 'waste', 'your', 'feelings', 'on', 'people', 'who', 'do', 'value', 'them', 'No', 'relationship', 'is', 'ever', 'a', 'waste', 'of', 'time', 'If', 'it', 'did', 'bring', 'you', 'what', 'you', 'want', 'it', 'taught', 'you', 'what', 'you', 'do', 'want', 'When', 'your', 'girlfriend', 'breaks', 'your', 'heart', 'do', 'say', 'a', 'word', 'just', 'smile', 'because', 'she', 'gives', 'you', 'the', 'opportunity', 'to', 'find', 'someone', 'better', 'than', 'her', 'Stop', 'overthinking', 'Relax', 'and', 'let', 'it', 'go', 'i', 'hope', 'in', 'this', 'messed', 'up', 'world', 'you', 'find', 'someone', 'who', 'makes', 'you', 'forget', 'about', 'it', 'all', 'someone', 'who', 'genuinely', 'makes', 'you', 'smile', 'every', 'single', 'day', 'Life', 'does', 'give', 'you', 'what', 'you', 'want', 'It', 'gives', 'you', 'what', 'you', 'work

## Porter Stemming

In [21]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

for token in tokens:
  print(token,"--->", porter.stem(token))

The ---> the
older ---> older
you ---> you
get ---> get
the ---> the
less ---> less
people ---> peopl
you ---> you
trust ---> trust
i ---> i
dont ---> dont
have ---> have
time ---> time
to ---> to
hate ---> hate
anyone ---> anyon
ill ---> ill
just ---> just
forget ---> forget
about ---> about
you ---> you
Never ---> never
waste ---> wast
your ---> your
feelings ---> feel
on ---> on
people ---> peopl
who ---> who
do ---> do
value ---> valu
them ---> them
No ---> no
relationship ---> relationship
is ---> is
ever ---> ever
a ---> a
waste ---> wast
of ---> of
time ---> time
If ---> if
it ---> it
did ---> did
bring ---> bring
you ---> you
what ---> what
you ---> you
want ---> want
it ---> it
taught ---> taught
you ---> you
what ---> what
you ---> you
do ---> do
want ---> want
When ---> when
your ---> your
girlfriend ---> girlfriend
breaks ---> break
your ---> your
heart ---> heart
do ---> do
say ---> say
a ---> a
word ---> word
just ---> just
smile ---> smile
because ---> becaus
she ---> sh

## Snowball Stemming

In [22]:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')

for token in tokens:
  print(token,"--->", snowball.stem(token))

The ---> the
older ---> older
you ---> you
get ---> get
the ---> the
less ---> less
people ---> peopl
you ---> you
trust ---> trust
i ---> i
dont ---> dont
have ---> have
time ---> time
to ---> to
hate ---> hate
anyone ---> anyon
ill ---> ill
just ---> just
forget ---> forget
about ---> about
you ---> you
Never ---> never
waste ---> wast
your ---> your
feelings ---> feel
on ---> on
people ---> peopl
who ---> who
do ---> do
value ---> valu
them ---> them
No ---> no
relationship ---> relationship
is ---> is
ever ---> ever
a ---> a
waste ---> wast
of ---> of
time ---> time
If ---> if
it ---> it
did ---> did
bring ---> bring
you ---> you
what ---> what
you ---> you
want ---> want
it ---> it
taught ---> taught
you ---> you
what ---> what
you ---> you
do ---> do
want ---> want
When ---> when
your ---> your
girlfriend ---> girlfriend
breaks ---> break
your ---> your
heart ---> heart
do ---> do
say ---> say
a ---> a
word ---> word
just ---> just
smile ---> smile
because ---> becaus
she ---> sh

## Lancaster Stemming

In [23]:
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()

for token in tokens:
  print(token,"--->", lancaster.stem(token))

The ---> the
older ---> old
you ---> you
get ---> get
the ---> the
less ---> less
people ---> peopl
you ---> you
trust ---> trust
i ---> i
dont ---> dont
have ---> hav
time ---> tim
to ---> to
hate ---> hat
anyone ---> anyon
ill ---> il
just ---> just
forget ---> forget
about ---> about
you ---> you
Never ---> nev
waste ---> wast
your ---> yo
feelings ---> feel
on ---> on
people ---> peopl
who ---> who
do ---> do
value ---> valu
them ---> them
No ---> no
relationship ---> rel
is ---> is
ever ---> ev
a ---> a
waste ---> wast
of ---> of
time ---> tim
If ---> if
it ---> it
did ---> did
bring ---> bring
you ---> you
what ---> what
you ---> you
want ---> want
it ---> it
taught ---> taught
you ---> you
what ---> what
you ---> you
do ---> do
want ---> want
When ---> when
your ---> yo
girlfriend ---> girlfriend
breaks ---> break
your ---> yo
heart ---> heart
do ---> do
say ---> say
a ---> a
word ---> word
just ---> just
smile ---> smil
because ---> becaus
she ---> she
gives ---> giv
you ---> y

## Regexp Stemming

In [26]:
from nltk.stem import RegexpStemmer
regexp = RegexpStemmer('es$|d$|ing$|s$|able$', min=4)

for token in tokens:
  print(token,"--->", regexp.stem(token))

The ---> The
older ---> older
you ---> you
get ---> get
the ---> the
less ---> les
people ---> people
you ---> you
trust ---> trust
i ---> i
dont ---> dont
have ---> have
time ---> time
to ---> to
hate ---> hate
anyone ---> anyone
ill ---> ill
just ---> just
forget ---> forget
about ---> about
you ---> you
Never ---> Never
waste ---> waste
your ---> your
feelings ---> feeling
on ---> on
people ---> people
who ---> who
do ---> do
value ---> value
them ---> them
No ---> No
relationship ---> relationship
is ---> is
ever ---> ever
a ---> a
waste ---> waste
of ---> of
time ---> time
If ---> If
it ---> it
did ---> did
bring ---> br
you ---> you
what ---> what
you ---> you
want ---> want
it ---> it
taught ---> taught
you ---> you
what ---> what
you ---> you
do ---> do
want ---> want
When ---> When
your ---> your
girlfriend ---> girlfrien
breaks ---> break
your ---> your
heart ---> heart
do ---> do
say ---> say
a ---> a
word ---> wor
just ---> just
smile ---> smile
because ---> because
she ---