# Bag of words

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

# Bag of words: every text is treated as an unordered collection of words.
class Category:
    """String labels for the two classes the classifiers are trained on."""

    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"


# Each training sentence is written next to its label, then split into the
# two parallel lists the estimators expect.
labelled_samples = [
    ("I love book", Category.BOOKS),
    ("This is a great book", Category.BOOKS),
    ("The fit is great", Category.CLOTHING),
    ("I love the shoes", Category.CLOTHING),
]
train_x = [sentence for sentence, _ in labelled_samples]
train_y = [label for _, label in labelled_samples]


In [57]:
# binary=True records whether a word occurs in a sentence (1/0) instead of how often.
vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the training sentences and encode them in a single step.
train_x_vectorizer = vectorizer.fit_transform(train_x)

In [58]:
# The learned vocabulary, in column order. Single-character tokens such as "I" and "a"
# from the training sentences do not appear in it.
print(vectorizer.get_feature_names_out())
# One row per training sentence, one column per vocabulary word.
print(train_x_vectorizer.toarray())

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 0 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


In [59]:
# Linear-kernel support vector classifier trained on the bag-of-words matrix.
model = SVC(kernel='linear')
model.fit(train_x_vectorizer, train_y)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'linear'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [60]:
# NOTE: this sentence is classified as CLOTHING even though it is about books.
# "books" and "like" are not in the learned vocabulary (only "book" is), so the
# only word the model recognises is "the", which occurs solely in the CLOTHING
# training sentences.
test_x = ["I like the books"]
# transform (not fit_transform) encodes with the vocabulary already learned from train_x.
model.predict(vectorizer.transform(test_x))

array(['CLOTHING'], dtype='<U8')

In [61]:
# "shoes" is in the vocabulary and appears only in a CLOTHING training sentence.
newsample = ["I like shoes"]
model.predict(vectorizer.transform(newsample))

array(['CLOTHING'], dtype='<U8')

# Word Vectors

#### Words with similar meanings have similar vectors


In [62]:
# One-time setup (run in a shell before executing this cell):
#   pip install spacy
#   python -m spacy download en_core_web_md

import spacy

# Load the English pipeline installed above; the cells below use it to get word vectors.
nlp = spacy.load("en_core_web_md")

In [63]:
# Reminder of the training sentences that are embedded in the next cell.
print(train_x)

['I love book', 'This is a great book', 'The fit is great', 'I love the shoes']


In [64]:
# Run every training sentence through the spaCy pipeline.
docs = [nlp(text) for text in train_x]
# One vector per sentence, taken from spaCy's Doc.vector
# (by default the average of the sentence's token vectors).
train_x_words_vectors = [x.vector for x in docs]
# NOTE(review): this prints every full vector; a shape or a short slice would be easier to read.
print(train_x_words_vectors)

[array([-7.59866655e-01, -8.65733325e-02, -2.36234322e-01, -4.07900028e-02,
        3.26636434e-03,  2.26409987e-01,  2.48783350e-01, -3.18826646e-01,
        1.48779675e-01,  1.44142675e+00, -2.59966344e-01, -4.54330035e-02,
        1.25833005e-01,  3.11036780e-03, -6.60644174e-02, -1.34342328e-01,
       -1.05106674e-01,  6.01831675e-01, -9.59333405e-02, -1.86500046e-02,
        2.24139988e-01,  8.69700015e-02,  1.49200335e-01, -1.38440326e-01,
       -1.81801006e-01, -1.41644672e-01, -3.60336661e-01, -3.52530003e-01,
        1.97396651e-01,  3.45166735e-02,  4.48643304e-02,  2.58956671e-01,
       -3.49436671e-01,  9.66536701e-02,  6.80873320e-02,  5.06386645e-02,
        3.68050002e-02,  2.06948325e-01, -5.72333448e-02,  1.86753318e-01,
       -1.88399985e-01, -2.45233327e-02, -2.08648682e-01, -6.27800003e-02,
        6.21700101e-02,  2.52310663e-01,  7.71109983e-02,  3.57716680e-01,
       -7.36203268e-02,  1.77761003e-01, -2.18958661e-01,  9.10153389e-02,
       -2.16430664e-01, 

In [65]:
from sklearn import svm

# Same linear-kernel SVC as in the bag-of-words section, now trained on the sentence vectors.
model_svm = svm.SVC(kernel='linear')
model_svm.fit(train_x_words_vectors, train_y)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'linear'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [66]:
# Note: this rebinds test_x, which earlier held the bag-of-words test sentence.
test_x = ['I like shoes and harry potter']
# Embed the test sentence the same way the training sentences were embedded.
test_docs = [nlp(test) for test in test_x]
test_x_words_vectors = [x.vector for x in test_docs]
model_svm.predict(test_x_words_vectors)

array(['CLOTHING'], dtype='<U8')

In [67]:
import nltk

# Fetch the NLTK data packages; packages that are already present are reported as up to date.
# wordnet: data for WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')
# stopwords: not used by any cell shown in this section.
nltk.download('stopwords')
# punkt / punkt_tab: tokenizer data needed by word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\subin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\subin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\subin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [68]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Stemming demo: reduce each word of the phrase to its Porter stem.
stemmer = PorterStemmer()

phrase = "Reading the books"
words = word_tokenize(phrase)

# Stem token by token, keeping the original word order.
stemmed_words = [stemmer.stem(token) for token in words]

# Last expression of the cell, so the re-joined phrase is displayed.
" ".join(stemmed_words)

'read the book'

In [74]:
from nltk.stem import WordNetLemmatizer

# Lemmatization demo: map each word of the phrase to its WordNet lemma.
lemmatizer = WordNetLemmatizer()

phrase = "reading the books"
words = word_tokenize(phrase)

# Every token is lemmatized as a verb (pos='v'); for this phrase that
# yields "read the book".
lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in words]

# Last expression of the cell, so the re-joined phrase is displayed.
" ".join(lemmatized_words)



'read the book'