In [12]:
import gradio as gr
import joblib as jb
import numpy as np
import pandas as pd
import pickle as pk
import seaborn as sns
import sklearn.cluster as cluster
import warnings
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from IPython.display import Image
from kneed import KneeLocator
from matplotlib import pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from pandas.core.dtypes.common import is_numeric_dtype
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# Silences every warning from this point on: no category, module or message
# filter is given, so nothing is exempt.
warnings.filterwarnings("ignore")

# Stemming

In [13]:
# Porter stemmer instance shared by the stemming cells in this notebook.
port = PorterStemmer()

In [14]:
# Inflected forms of "improve" used as the sample input for the stemming and
# lemmatization loops.
words = [
    'improve',
    'improving',
    'improvements',
    'improved',
    'improver',
]
words

['improve', 'improving', 'improvements', 'improved', 'improver']

In [15]:
# Print each sample word followed by the stem the Porter stemmer produces.
for word in words:
    print('Before = ', word)
    stemmed = port.stem(word)
    print('After = ', stemmed)

Before =  improve
After =  improv
Before =  improving
After =  improv
Before =  improvements
After =  improv
Before =  improved
After =  improv
Before =  improver
After =  improv


In [16]:
# The recorded output is 'physic': the trailing 's' is stripped.
port.stem('physics')

'physic'

In [17]:
# The recorded output is 'chang': the stem is not itself a dictionary word.
port.stem('change')

'chang'

In [18]:
# Irregular form: the recorded output is 'gone', unchanged (not mapped to 'go').
port.stem('gone')

'gone'

# Lemmatization

In [19]:
# WordNet lemmatizer instance shared by the lemmatization cells in this notebook.
lem = WordNetLemmatizer()

In [20]:
# Print each sample word followed by its lemma.
# NOTE(review): lemmatize() is called without a part-of-speech argument, and the
# recorded output leaves the verb forms 'improving' and 'improved' unchanged
# (only 'improvements' -> 'improvement'). NLTK documents a `pos` parameter that
# defaults to noun; passing pos='v' presumably reduces the verb forms -- confirm.
for word in words:
    print('Before = ', word)
    print('After = ', lem.lemmatize(word))

Before =  improve
After =  improve
Before =  improving
After =  improving
Before =  improvements
After =  improvement
Before =  improved
After =  improved
Before =  improver
After =  improver


In [21]:
# Sample sentence for the tokenization demos. It is assembled from adjacent
# string literals; the resulting value, including its two double spaces, is
# exactly the same as a single-line literal.
sentence = (
    "Stemming and Lemmatization are Text Normalization "
    "(or sometimes called Word Normalization) techniques in  the field of "
    "Natural Language Processing that are used to prepare text, words, and "
    "documents for further  processing."
)

In [22]:
import nltk
from nltk.tokenize import word_tokenize

In [24]:
# Split the sample sentence into tokens; in the recorded output punctuation
# marks appear as separate tokens. The bare name on the last line displays
# the resulting list.
sen = word_tokenize(sentence)
sen

['Stemming',
 'and',
 'Lemmatization',
 'are',
 'Text',
 'Normalization',
 '(',
 'or',
 'sometimes',
 'called',
 'Word',
 'Normalization',
 ')',
 'techniques',
 'in',
 'the',
 'field',
 'of',
 'Natural',
 'Language',
 'Processing',
 'that',
 'are',
 'used',
 'to',
 'prepare',
 'text',
 ',',
 'words',
 ',',
 'and',
 'documents',
 'for',
 'further',
 'processing',
 '.']

In [25]:
# Lemmatize the tokenized sentence one token at a time and print each lemma.
for word in sen:
    lemma = lem.lemmatize(word)
    print(lemma)

Stemming
and
Lemmatization
are
Text
Normalization
(
or
sometimes
called
Word
Normalization
)
technique
in
the
field
of
Natural
Language
Processing
that
are
used
to
prepare
text
,
word
,
and
document
for
further
processing
.


In [26]:
# Stem the tokenized sentence one token at a time and print each stem.
for word in sen:
    stemmed = port.stem(word)
    print(stemmed)

stem
and
lemmat
are
text
normal
(
or
sometim
call
word
normal
)
techniqu
in
the
field
of
natur
languag
process
that
are
use
to
prepar
text
,
word
,
and
document
for
further
process
.


In [27]:
# NOTE(review): importing `words` here rebinds the name `words`, which an
# earlier cell assigned to the sample word list that the stem/lemmatize loops
# iterate over. Re-running those loops after this cell would no longer see the
# list. `words` is not referenced by the visible cells that follow this import.
from nltk.corpus import stopwords, words
import string

In [28]:
# Display the standard library's string of ASCII punctuation characters.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
# Displays the entire English stop-word list.
# NOTE(review): a slice (e.g. [:10]) would keep the rendered output short.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [30]:
# Number of entries in the English stop-word list.
len(stopwords.words('english'))

179

In [31]:
# Displays the entire Bengali stop-word list.
# NOTE(review): a slice (e.g. [:10]) would keep the rendered output short.
stopwords.words('bengali')

['অতএব',
 'অথচ',
 'অথবা',
 'অনুযায়ী',
 'অনেক',
 'অনেকে',
 'অনেকেই',
 'অন্তত',
 'অন্য',
 'অবধি',
 'অবশ্য',
 'অর্থাত',
 'আই',
 'আগামী',
 'আগে',
 'আগেই',
 'আছে',
 'আজ',
 'আদ্যভাগে',
 'আপনার',
 'আপনি',
 'আবার',
 'আমরা',
 'আমাকে',
 'আমাদের',
 'আমার',
 'আমি',
 'আর',
 'আরও',
 'ই',
 'ইত্যাদি',
 'ইহা',
 'উচিত',
 'উত্তর',
 'উনি',
 'উপর',
 'উপরে',
 'এ',
 'এঁদের',
 'এঁরা',
 'এই',
 'একই',
 'একটি',
 'একবার',
 'একে',
 'এক্',
 'এখন',
 'এখনও',
 'এখানে',
 'এখানেই',
 'এটা',
 'এটাই',
 'এটি',
 'এত',
 'এতটাই',
 'এতে',
 'এদের',
 'এব',
 'এবং',
 'এবার',
 'এমন',
 'এমনকী',
 'এমনি',
 'এর',
 'এরা',
 'এল',
 'এস',
 'এসে',
 'ঐ',
 'ও',
 'ওঁদের',
 'ওঁর',
 'ওঁরা',
 'ওই',
 'ওকে',
 'ওখানে',
 'ওদের',
 'ওর',
 'ওরা',
 'কখনও',
 'কত',
 'কবে',
 'কমনে',
 'কয়েক',
 'কয়েকটি',
 'করছে',
 'করছেন',
 'করতে',
 'করবে',
 'করবেন',
 'করলে',
 'করলেন',
 'করা',
 'করাই',
 'করায়',
 'করার',
 'করি',
 'করিতে',
 'করিয়া',
 'করিয়ে',
 'করে',
 'করেই',
 'করেছিলেন',
 'করেছে',
 'করেছেন',
 'করেন',
 'কাউকে',
 'কাছ',
 'কাছে',
 'কাজ',
 'কাজে',
 'কারও',
 '

In [32]:
# Number of entries in the Bengali stop-word list.
len(stopwords.words('bengali'))

398

In [33]:
# List the file ids of the stop-word corpus; in the recorded output each id is
# a language name that can be passed to stopwords.words().
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']