In [1]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
punct = [word for word in string.punctuation]
print(punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [3]:
stop_words = list(ENGLISH_STOP_WORDS)
print(stop_words)

['across', 'themselves', 'front', 'therefore', 'below', 'first', 'though', 'both', 'find', 'should', 'thence', 'one', 'get', 'take', 'top', 'interest', 'fill', 'over', 'you', 'above', 'eleven', 'those', 'whoever', 'her', 'what', 'may', 'until', 'often', 'amongst', 'this', 'has', 'seem', 'few', 'because', 'must', 'me', 'nothing', 'through', 'ours', 'further', 'could', 'so', 'etc', 'hers', 'empty', 'back', 'ever', 'give', 'made', 'him', 'its', 'no', 'than', 'part', 'are', 'mine', 'fire', 'within', 'eg', 'everyone', 'might', 'sixty', 'therein', 'seems', 'side', 'somehow', 'own', 'latter', 'twelve', 'become', 'something', 'too', 'hereby', 'otherwise', 'already', 'ten', 'myself', 'their', 'where', 'amoungst', 'am', 'anywhere', 'forty', 'more', 'onto', 'almost', 'becomes', 'mill', 'becoming', 'wherever', 'whereupon', 'itself', 'whereby', 'being', 'very', 'himself', 'afterwards', 'most', 'however', 'mostly', 'thus', 'that', 'all', 'whose', 'fifty', 'couldnt', 'after', 'although', 'whole', 'if

In [4]:
corpus = [
    "John like horror movie.",
   "Ryan watches movie and dramatic movies."
]
corpus

['John like horror movie.', 'Ryan watches movie and dramatic movies.']

In [5]:
pre_proc_vocab = []
for doc in corpus:
    doc_as_list = doc.split(" ")
    for word in doc_as_list:
        word_lower = str(word).lower().replace('.','') 
        if word_lower not in stop_words:
            pre_proc_vocab.append(word_lower)
            
print(pre_proc_vocab)

['john', 'like', 'horror', 'movie', 'ryan', 'watches', 'movie', 'dramatic', 'movies']


In [6]:
corpus

['John like horror movie.', 'Ryan watches movie and dramatic movies.']

In [7]:
pd.set_option('display.max_columns',70)

#### **Applying Count Vectoriser**

In [60]:
cv = CountVectorizer(binary=True,analyzer='word',ngram_range=(1,3),max_df=1)
cv.fit(corpus)

CountVectorizer(binary=True, max_df=1, ngram_range=(1, 3))

In [61]:
cv_corp_results = cv.transform(corpus)
cv_corp_results

<2x22 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [62]:
pd.DataFrame(cv_corp_results.todense(),columns=cv.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1


#### **Applying Tf-IDF Vectoriser**

In [67]:
tf = TfidfVectorizer(ngram_range=(1,2),analyzer='word',binary=False,max_df=2)
tf.fit(corpus)

TfidfVectorizer(max_df=2, ngram_range=(1, 2))

In [68]:
tf.vocabulary_, tf.idf_,

({'john': 6,
  'like': 8,
  'horror': 4,
  'movie': 10,
  'john like': 7,
  'like horror': 9,
  'horror movie': 5,
  'ryan': 13,
  'watches': 15,
  'and': 0,
  'dramatic': 2,
  'movies': 12,
  'ryan watches': 14,
  'watches movie': 16,
  'movie and': 11,
  'and dramatic': 1,
  'dramatic movies': 3},
 array([1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.        , 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.40546511, 1.40546511]))

In [69]:
tfidf_corp_results = tf.transform(corpus)
tfidf_corp_results

<2x17 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [70]:
pd.DataFrame(tfidf_corp_results.toarray(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,dramatic,dramatic movies,horror,horror movie,john,john like,like,like horror,movie,movie and,movies,ryan,ryan watches,watches,watches movie
0,0.0,0.0,0.0,0.0,0.392044,0.392044,0.392044,0.392044,0.392044,0.392044,0.278943,0.0,0.0,0.0,0.0,0.0,0.0
1,0.308515,0.308515,0.308515,0.308515,0.0,0.0,0.0,0.0,0.0,0.0,0.219511,0.308515,0.308515,0.308515,0.308515,0.308515,0.308515


In [63]:
tf = TfidfVectorizer(ngram_range=(1,2),analyzer='word',binary=False,min_df=2)
tf.fit(corpus)

TfidfVectorizer(min_df=2, ngram_range=(1, 2))

In [64]:
tf.vocabulary_, tf.idf_,

({'movie': 0}, array([1.]))

In [65]:
tfidf_corp_results = tf.transform(corpus)
tfidf_corp_results

<2x1 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [66]:
pd.DataFrame(tfidf_corp_results.toarray(),columns=tf.get_feature_names())

Unnamed: 0,movie
0,1.0
1,1.0


#### **Applying Tf-IDF Transformer**

In [198]:
tf_idf_tr = TfidfTransformer(sublinear_tf=True)
tf_idf_tr.fit(cv_corp_results)

TfidfTransformer(sublinear_tf=True)

In [199]:
tfidf_transformer_results = tf_idf_tr.transform(cv_corp_results)
tfidf_transformer_results

<2x23 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [200]:
pd.DataFrame(tfidf_transformer_results.todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.0,0.0,0.0,0.0,0.0,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.243956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.262556,0.262556,0.262556,0.262556,0.262556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186811,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556


#### **On Unseen Dataset**

In [201]:
test_data = ["his name is horror and he likes horror movies.",
            "i like working on python-like languages."] 

In [202]:
tt = pd.DataFrame(cv.transform(test_data).todense(),columns=cv.get_feature_names())
tt

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [203]:
pd.DataFrame(tf.transform(test_data).todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
pd.DataFrame(tf_idf_tr.transform(tt).todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **`L2, L1 and L-infinity normalization manaully`**

In [3]:
import numpy as np

#### **`L-2 norm`**

In [140]:
X = np.array([[-1,0,1],[40,30,100]])

In [7]:
X

array([[-1,  0,  1],
       [ 0,  1,  2]])

In [10]:
X_square = X * X
X_square

array([[1, 0, 1],
       [0, 1, 4]])

In [17]:
X_sqrt = np.sqrt((X * X).sum(axis=1))         ## L2-norm
X_sqrt

array([1.41421356, 2.23606798])

In [66]:
X_l2_nomrd = X / X_sqrt.reshape(2,1)          ## Row normalization i.e. the length of the row vectors are 1
X_l2_nomrd

array([[-0.70710678,  0.        ,  0.70710678],
       [ 0.        ,  0.4472136 ,  0.89442719]])

##### **Length or Magnitude of first row vector**

In [71]:
np.round(np.square(X_l2_nomrd[0]).sum(),3)

1.0

##### **Length or Magnitude of second row vector**

In [72]:
np.round(np.square(X_l2_nomrd[1]).sum(),3)

1.0

#### **`L-1 norm`**

In [29]:
X_abs_sum = np.abs(X).sum(axis=1)            ## L1-norm
X_abs_sum

array([2, 3])

In [73]:
X_l1_nomrd = X / X_abs_sum.reshape(2,1)          ## Row normalization i.e. the length of the row vectors are 1
X_l1_nomrd

array([[-0.5       ,  0.        ,  0.5       ],
       [ 0.        ,  0.33333333,  0.66666667]])

##### **Length or Magnitude of first row vector**

In [75]:
np.round(np.abs(X_l1_nomrd[0]).sum(),3)

1.0

##### **Length or Magnitude of second row vector**

In [76]:
np.round(np.abs(X_l1_nomrd[1]).sum(),3)

1.0

#### **`L-infinity norm`**

In [32]:
X_abs_inf_sum = np.abs(X).max(axis=1)            ## L1-norm
X_abs_inf_sum

array([1, 2])

In [77]:
X_inf_normd = X / X_abs_inf_sum.reshape(2,1)          ## Row normalization i.e. the length of the row vectors are 1
X_inf_normd

array([[-1. ,  0. ,  1. ],
       [ 0. ,  0.5,  1. ]])

##### **Length or Magnitude of first row vector**

In [79]:
np.round(np.abs(X_inf_normd[0]).max(),3)

1.0

##### **Length or Magnitude of second row vector**

In [80]:
np.round(np.abs(X_inf_normd[1]).max(),3)

1.0

## **`L2, L1 and L-infinity normalization using numpy package`**

In [20]:
from numpy.linalg import norm

#### **`L-2 norm`**

In [22]:
l2_norm = norm(X,ord=2,axis=1)
l2_norm

array([1.41421356, 2.23606798])

In [30]:
X / l2_norm.reshape(2,1)         ## Row normalization i.e. the length of the row vectors are 1

array([[-0.70710678,  0.        ,  0.70710678],
       [ 0.        ,  0.4472136 ,  0.89442719]])

#### **`L-1 norm`**

In [24]:
l1_norm = norm(X,ord=1,axis=1)
l1_norm

array([2., 3.])

In [25]:
X / l1_norm.reshape(2,1)         ## Row normalization i.e. the length of the row vectors are 1

array([[-0.5       ,  0.        ,  0.5       ],
       [ 0.        ,  0.33333333,  0.66666667]])

#### **`L-infinity norm`**

In [26]:
linf_norm = norm(X,ord=np.inf,axis=1)
linf_norm

array([1., 2.])

In [27]:
X / linf_norm.reshape(2,1)         ## Row normalization i.e. the length of the row vectors are 1

array([[-1. ,  0. ,  1. ],
       [ 0. ,  0.5,  1. ]])

#### **Standardizing the data using Standard Scaler**

In [141]:
from sklearn.preprocessing import StandardScaler

In [142]:
SS = StandardScaler()

In [143]:
X

array([[ -1,   0,   1],
       [ 40,  30, 100]])

In [144]:
X_mean, X_std = np.mean(X,axis=0), np.std(X,axis=0)
X_mean, X_std

(array([19.5, 15. , 50.5]), array([20.5, 15. , 49.5]))

In [145]:
(-1 -(-0.5))/0.5

-1.0

In [146]:
X_ss = SS.fit_transform(X)
X_ss

array([[-1., -1., -1.],
       [ 1.,  1.,  1.]])

In [136]:
SS.mean_

array([-0.5,  0.5, 50.5])

In [137]:
SS.var_

array([2.50000e-01, 2.50000e-01, 2.45025e+03])

In [138]:
X

array([[ -1,   0,   1],
       [  0,   1, 100]])

In [139]:
l2_norm

array([1.41421356, 2.23606798])

### **Now, the difference between `Standard Scaling [-1,1]` means 0 mean and 1 standard deviation and `Normalization [0-1 scaling]` is that the length or magnitude of row vectors in case of normalization will be equals to 1 i.e. we are shrinking the size of all the vectors. Whereas, in the case of Standard Scaling the length of row vectors equals to 1 is not guaranteed.**

### **Therefore, Row vector normalization is used when you are working with Cosine Similarities because cosine similarity doesn't care about the magnitude of the vectors and it only cares about the angle between them and shrinking the length of vectors doesn't impact the angles between them.**

### **Here we mean to say that when you apply `normalization[0-1 scaling]`, then the component values of all the dimensions come into [0,1], there by the total length of the vector will be atmost 1 unit and all the vectors will lie within a hyper-sphere of radius 1 as the maximum value for any feature woule become 1. This way we can easily interpret when we have all the vectors within a hypersphere(n-dimensional) or circle in 2-d or sphere in 3-d of radius 1 unit.**

In [57]:
X_ss_l2_norm = norm(x=X_ss,ord=2,axis=1).reshape(2,1)
X_ss_l2_norm

array([[1.73205081],
       [1.73205081]])

In [82]:
X_ss_l2_normd = X_ss/X_ss_l2_norm
X_ss_l2_normd

array([[-0.57735027, -0.57735027, -0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027]])

##### **Length or Magnitude of first row vector**

In [83]:
np.round(np.square(X_ss_l2_normd[0]).sum(),3)

1.0

##### **Length or Magnitude of second row vector**

In [84]:
np.round(np.square(X_ss_l2_normd[1]).sum(),3)

1.0

In [81]:
X_l2_nomrd         ## Row normalization i.e. the length of the row vectors are 1

array([[-0.70710678,  0.        ,  0.70710678],
       [ 0.        ,  0.4472136 ,  0.89442719]])

##### **Length or Magnitude of first row vector**

In [85]:
np.round(np.square(X_l2_nomrd[0]).sum(),3)

1.0

##### **Length or Magnitude of second row vector**

In [86]:
np.round(np.square(X_l2_nomrd[1]).sum(),3)

1.0

### **Above, experiment shows us that the row normalization also shrinks or stretches the length of the row vectors as 1.**

### **Cosine similarity between two vectors depends on the angle between them and it doesn't depend upon the geometrical distance. Also cosine similarity treats two vectors as same if they both are pointing in the same direction from origin irrespective of the exact position in the n-dimensional space. We use cosine similarity only to check how do two vectors differ angularly. So it is recommended to bring all the data points into a same radius from the point and then compute cosine similarity. For this you need to perform normalization(here it is 0-1 scaling) on the rows of the data and then use cosine similarity.**

## **`The Big Question – Normalize or Standardize?`**

- #### **Normalization is good to use when you know that the distribution of your data does not follow a Gaussian distribution. This can be useful in algorithms that do not assume any distribution of the data like K-Nearest Neighbors and Neural Networks. Standardization, on the other hand, can be helpful in cases where the data follows a Gaussian distribution. However, this does not have to be necessarily true.** 

- #### **At the end of the day, the choice of using normalization or standardization will depend on your problem and the machine learning algorithm you are using. There is no hard and fast rule to tell you when to normalize or standardize your data. You can always start by fitting your model to raw, normalized and standardized data and compare the performance for best results.**

- #### **It is a good practice to fit the scaler on the training data and then use it to transform the testing data. This would avoid any data leakage during the model testing process. Also, the scaling of target values is generally not required.**

## **`What does _ mean in Python?`**
- #### **1. Use In Interpreter. Python automatically stores the value of the last expression in the interpreter to a particular variable called "_". You can also assign these value to another variable if you want. You can use it as a normal variable.**

In [108]:
names = ["Rajesh",'sharma']
names

['Rajesh', 'sharma']

In [109]:
print(_)

['Rajesh', 'sharma']


In [110]:
print((_ + _))

['Rajesh', 'sharma', 'Rajesh', 'sharma']


In [111]:
nums = np.array([3,4])

In [112]:
print(_)

['Rajesh', 'sharma']


In [113]:
nums

array([3, 4])

In [114]:
print(_)

[3 4]


In [118]:
print(_ , _)

[3 4] [3 4]


In [116]:
print(_ + _)

[6 8]


In [115]:
print(_ * _)

[ 9 16]


In [121]:
print(_ - _*2)

[-3 -4]


In [122]:
print(_*2 / _)

[2. 2.]
