### Example 01

In [79]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

ocs = np.array(['The sun is shining','The weather is sweet','The sun is shining and the weather is sweet'])

count = CountVectorizer()
docs = np.array(['The sun is shining','The weather is sweet','The sun is shining and the weather is sweet'])

bag = count.fit_transform(docs)

print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [80]:
# tf-idf = tf(i,d) * idf(t,d)

# td(i,d) = term frequency

# idf(t,d) = 	log(n/1+df(t,d)) :  inverse document frequency  df(t,d) : no. of documents d that contain the term t

tfidf = TfidfTransformer()

np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


### Example 02

In [25]:
import pandas as pd

train = [("Thanks for an excellent report", "pos"),
         ("Your service is very quick and fast", "pos"),
        ("I am pleased with your service", "pos"),
        ("I did not know i was diabetic until you gave me this report", "neg"),
        ("Service - Little slow, probably because too many people.", "neg"),
        ("The place is not easy to locate", "neg"),
        ("The place is very easy to locate", "pos"),
        ("Not satisfied will take a second opinion", "neg"),
        ("No human contact everything is so robotic here", "neg")]

df = pd.DataFrame(train,columns=['review','sentiment'])

df.head()

Unnamed: 0,review,sentiment
0,Thanks for an excellent report,pos
1,Your service is very quick and fast,pos
2,I am pleased with your service,pos
3,I did not know i was diabetic until you gave m...,neg
4,"Service - Little slow, probably because too ma...",neg


In [65]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords = stopwords.words('english')

In [66]:
data = list(df['review'].values)

data

['Thanks for an excellent report',
 'Your service is very quick and fast',
 'I am pleased with your service',
 'I did not know i was diabetic until you gave me this report',
 'Service - Little slow, probably because too many people.',
 'The place is not easy to locate',
 'The place is very easy to locate',
 'Not satisfied will take a second opinion',
 'No human contact everything is so robotic here']

In [67]:
data_token = [word_tokenize(x.lower()) for x in data]
clean_data = []

for sent in data_token:
    # print([x for x in sent if (x not in stopwords and x not in "-.,")])
    clean_data.append([x for x in sent if (x not in stopwords and x not in "-.,")])

clean_data

[['thanks', 'excellent', 'report'],
 ['service', 'quick', 'fast'],
 ['pleased', 'service'],
 ['know', 'diabetic', 'gave', 'report'],
 ['service', 'little', 'slow', 'probably', 'many', 'people'],
 ['place', 'easy', 'locate'],
 ['place', 'easy', 'locate'],
 ['satisfied', 'take', 'second', 'opinion'],
 ['human', 'contact', 'everything', 'robotic']]

In [68]:
vocabs = list(set([val for sublist in clean_data for val in sublist]))

# Get Word2Id
word2id={}

i = 1
for vocab in vocabs:
    case = {vocab:i}
    word2id.update(case)
    i= i+1
    
print(word2id)

id2word = {v:k for k, v in word2id.items()}

print("\n")
print(id2word)

print("\n")
wids = [[word2id[y.lower()] for y in x] for x in clean_data]

wids

{'pleased': 1, 'take': 2, 'satisfied': 3, 'probably': 4, 'contact': 5, 'little': 6, 'diabetic': 7, 'place': 8, 'service': 9, 'robotic': 10, 'excellent': 11, 'slow': 12, 'many': 13, 'know': 14, 'opinion': 15, 'fast': 16, 'thanks': 17, 'locate': 18, 'people': 19, 'easy': 20, 'human': 21, 'everything': 22, 'gave': 23, 'second': 24, 'report': 25, 'quick': 26}


{1: 'pleased', 2: 'take', 3: 'satisfied', 4: 'probably', 5: 'contact', 6: 'little', 7: 'diabetic', 8: 'place', 9: 'service', 10: 'robotic', 11: 'excellent', 12: 'slow', 13: 'many', 14: 'know', 15: 'opinion', 16: 'fast', 17: 'thanks', 18: 'locate', 19: 'people', 20: 'easy', 21: 'human', 22: 'everything', 23: 'gave', 24: 'second', 25: 'report', 26: 'quick'}




[[17, 11, 25],
 [9, 26, 16],
 [1, 9],
 [14, 7, 23, 25],
 [9, 6, 12, 4, 13, 19],
 [8, 20, 18],
 [8, 20, 18],
 [3, 2, 24, 15],
 [21, 5, 22, 10]]

In [76]:
from tensorflow.keras.preprocessing import sequence

x = sequence.pad_sequences(wids, maxlen=26,padding='post')
print(x.shape)  # 9: Senetnces 6: max Lengt of Senetnce after cleanup

x

(9, 26)


array([[17, 11, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9, 26, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [14,  7, 23, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  6, 12,  4, 13, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 20, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 20, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  2, 24, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  5, 22, 10,  0,  0, 

In [77]:
# TF-IDf
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = TfidfTransformer(norm='l2', use_idf=True)

tt_matrix = vectorizer.fit_transform(np.array(x))

tt_matrix.todense()

matrix([[0.49622851, 0.32108904, 0.80663442, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.27516549, 0.79492254, 0.54072369, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ],
        [0.11043153, 0.99388373, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.

In [78]:
import pandas as pd

pd.DataFrame(tt_matrix.todense())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.496229,0.321089,0.806634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.275165,0.794923,0.540724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.110432,0.993884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.270279,0.13514,0.490814,0.817183,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.143243,0.095495,0.211113,0.107791,0.539908,0.789097,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.272818,0.682046,0.678516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.272818,0.682046,0.678516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.081296,0.054197,0.718887,0.688226,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.572814,0.136384,0.663317,0.461837,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
