In [2]:
import pandas as pd

train = [("Thanks for an excellent report", "pos"),
         ("Your service is very quick and fast", "pos"),
        ("I am pleased with your service", "pos"),
        ("I did not know i was diabetic until you gave me this report", "neg"),
        ("Service - Little slow, probably because too many people.", "neg"),
        ("The place is not easy to locate", "neg"),
        ("The place is very easy to locate", "pos"),
        ("Not satisfied will take a second opinion", "neg"),
        ("No human contact everything is so robotic here", "neg")]

df = pd.DataFrame(train,columns=['review','sentiment'])

df.head()

Unnamed: 0,review,sentiment
0,Thanks for an excellent report,pos
1,Your service is very quick and fast,pos
2,I am pleased with your service,pos
3,I did not know i was diabetic until you gave m...,neg
4,"Service - Little slow, probably because too ma...",neg


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords = stopwords.words('english')

In [4]:
data = list(df['review'].values)

data

['Thanks for an excellent report',
 'Your service is very quick and fast',
 'I am pleased with your service',
 'I did not know i was diabetic until you gave me this report',
 'Service - Little slow, probably because too many people.',
 'The place is not easy to locate',
 'The place is very easy to locate',
 'Not satisfied will take a second opinion',
 'No human contact everything is so robotic here']

In [8]:
data_token = [word_tokenize(x.lower()) for x in data]
clean_data = []

for sent in data_token:
    # print([x for x in sent if (x not in stopwords and x not in "-.,")])
    clean_data.append([x for x in sent if (x not in stopwords and x not in "-.,")])

clean_data

[['thanks', 'excellent', 'report'],
 ['service', 'quick', 'fast'],
 ['pleased', 'service'],
 ['know', 'diabetic', 'gave', 'report'],
 ['service', 'little', 'slow', 'probably', 'many', 'people'],
 ['place', 'easy', 'locate'],
 ['place', 'easy', 'locate'],
 ['satisfied', 'take', 'second', 'opinion'],
 ['human', 'contact', 'everything', 'robotic']]

In [9]:
vocabs = list(set([val for sublist in clean_data for val in sublist]))

# Get Word2Id
word2id={}

i = 1
for vocab in vocabs:
    case = {vocab:i}
    word2id.update(case)
    i= i+1
    
print(word2id)

id2word = {v:k for k, v in word2id.items()}

print("\n")
print(id2word)

print("\n")
wids = [[word2id[y.lower()] for y in x] for x in clean_data]

wids

{'everything': 1, 'opinion': 2, 'easy': 3, 'people': 4, 'many': 5, 'slow': 6, 'locate': 7, 'little': 8, 'know': 9, 'fast': 10, 'human': 11, 'gave': 12, 'satisfied': 13, 'second': 14, 'thanks': 15, 'place': 16, 'contact': 17, 'robotic': 18, 'diabetic': 19, 'quick': 20, 'report': 21, 'pleased': 22, 'service': 23, 'probably': 24, 'take': 25, 'excellent': 26}


{1: 'everything', 2: 'opinion', 3: 'easy', 4: 'people', 5: 'many', 6: 'slow', 7: 'locate', 8: 'little', 9: 'know', 10: 'fast', 11: 'human', 12: 'gave', 13: 'satisfied', 14: 'second', 15: 'thanks', 16: 'place', 17: 'contact', 18: 'robotic', 19: 'diabetic', 20: 'quick', 21: 'report', 22: 'pleased', 23: 'service', 24: 'probably', 25: 'take', 26: 'excellent'}




[[15, 26, 21],
 [23, 20, 10],
 [22, 23],
 [9, 19, 12, 21],
 [23, 8, 6, 24, 5, 4],
 [16, 3, 7],
 [16, 3, 7],
 [13, 25, 14, 2],
 [11, 17, 1, 18]]

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=27)

one_hot = tokenizer.sequences_to_matrix(wids, mode='binary')

one_hot

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
