In [70]:

import numpy as np ; import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim.models  import Word2Vec
from sklearn.model_selection import train_test_split
from seedtools  import connect
import pandas as pd
import re ,string,unicodedata,contractions
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
STOPWORDS =  stopwords.words("english")

In [77]:
def clean_text(text):
	text = str(text).lower()
	text = re.sub("[^\w\s]"," ",text)
	text =  ' '.join([w for w in text.split() if w not in STOPWORDS])
	return text

In [78]:
data   = pd.read_csv(connect("amazon_alexa.tsv"),sep="\t")[["verified_reviews","feedback"]]
data["cleaned"] = data["verified_reviews"].apply(lambda  x : clean_text(x))
x_train,x_test,y_train,y_test =  train_test_split(data["cleaned"].values,data["feedback"].values,test_size=0.2,random_state=42)
x_train.shape

(2520,)

### There are several methods which we convert Text ->  Numeric: 
-  Bag of words (CountVectorizer)
- Tfidf 
- Word2Vec 
- Tokenizer (tensorflow) 

**#1 CountVectorizer**

In [79]:

cv  = CountVectorizer()
x_train_cv =  cv.fit_transform(x_train).toarray()
x_test_cv =  cv.transform(x_test).toarray()
x_train_cv.shape

(2520, 3621)

In [81]:
print(x_train_cv[0].shape)
print("indexed where values are above 0 : ",np.where(x_train_cv[0]> 0))
print("Value at index 44 of x_train_cv[0] :",x_train_cv[0][213])

(3621,)
indexed where values are above 0 :  (array([ 213,  775, 1056, 2291, 2950, 3223], dtype=int64),)
Value at index 44 of x_train_cv[0] : 1


 **#TfidfVectorizer**

In [82]:

tf  = TfidfVectorizer()
x_train_tf =  tf.fit_transform(x_train).toarray()
x_test_tf =  tf.transform(x_test).toarray()
x_train_tf.shape

(2520, 3621)

In [84]:
print(x_train_tf[0].shape)
print("indexed where values are above 0 : ",np.where(x_train_tf[0]> 0))
print("Value at index 44 of x_train_cv[0] :",x_train_tf[0][213])

(3621,)
indexed where values are above 0 :  (array([ 213,  775, 1056, 2291, 2950, 3223], dtype=int64),)
Value at index 44 of x_train_cv[0] : 0.24965385027600473


**#Word2Vec**

In [85]:
sentences = [x.split() for x in x_train]

# Skip-Gram (sg =1) CBOW (sg=0)
#cbow model
model_cbow =  Word2Vec(sentences,min_count=2,vector_size=100,window=10)

#skip gram model 
model_skip=  Word2Vec(sentences,min_count=2,vector_size=100,window=10,sg=1)

In [86]:
# for single word
word_n =  "thank"
word2 = "good"
print(f"Word Vector of word `{word_n}` :",model_cbow.wv[word_n][:5])
print(f"most similar words of `{word_n}` :",model_cbow.wv.most_similar(word_n)[:3])
print(f"Similarity between {word_n}  and {word2} :",model_cbow.wv.similarity(word_n,word2))

Word Vector of word `thank` : [-0.12075105  0.11025674  0.01652131 -0.02822109  0.04629619]
most similar words of `thank` : [('well', 0.9993425607681274), ('two', 0.999327540397644), ('use', 0.9993176460266113)]
Similarity between thank  and good : 0.99910367


In [None]:
def vectorize_(sentence):
    sents = sentence.split()
    word_vecs =  [model_cbow.wv[x]  for x in sents  if x in model_cbow.wv]
    word_vecs =  np.array(word_vecs).mean(axis=0)
    return word_vecs

x_train_vec =  np.array([vectorize_(x)  for x in x_train])
x_train_vec # Now you can train this data 

  word_vecs =  np.array(word_vecs).mean(axis=0)
  ret = ret.dtype.type(ret / rcount)


[array([-1.70872167e-01,  1.58033296e-01,  3.39210294e-02, -5.14773875e-02,
         7.49752447e-02, -4.30107206e-01,  7.20996186e-02,  5.83478689e-01,
        -1.95824012e-01, -2.10915759e-01, -1.56858876e-01, -3.55972618e-01,
        -9.21953917e-02,  6.75577149e-02,  1.41761199e-01, -1.71586350e-01,
         2.30188090e-02, -3.03247720e-01, -5.68223707e-02, -4.86854404e-01,
         7.96349421e-02,  1.45930871e-01,  5.39920814e-02, -1.87488124e-01,
        -6.06131144e-02, -1.94907561e-02, -2.78016329e-01, -2.32984185e-01,
        -2.54968971e-01,  8.07784200e-02,  3.16371322e-01, -4.37433619e-06,
         6.87203407e-02, -2.15913698e-01, -1.64225549e-01,  3.48832995e-01,
         4.72277887e-02, -1.99409708e-01, -1.29548088e-01, -4.07135993e-01,
         4.17068303e-02, -2.31046081e-01, -5.90922125e-02,  2.34043803e-02,
         2.77745247e-01, -2.01555882e-02, -1.82567894e-01, -1.07776143e-01,
         7.49249905e-02,  2.36900285e-01,  7.20405653e-02, -3.17480713e-01,
         3.5