In [None]:
"""
Problem Statement No. 16 
Consider the Amazon Alexa Reviews Dataset. This dataset consists of nearly 3000 Amazon customer reviews (input
text), star ratings, date of review, variant and feedback for various Amazon Alexa products such as Alexa Echo, Echo Dots,
Alexa Firesticks, etc. Perform the following operations on this dataset.
(I) Remove all punctuations from review text. 
(II) Tokenize the review text into words. 
(III) Remove the Stopwords from the tokenized text. 
(IV) Perform stemming & lemmatization on the review text. 
(V) Perform the word vectorization on review text using Bag of Words technique. 
(VI) Create representation of Review Text by calculating Term Frequency and Inverse Document Frequency (TF-IDF)
"""

In [85]:
import pandas as pd

In [86]:
df = pd.read_csv('Alexa-Dataset.csv')

In [87]:
df.sample(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
965,5,29-Jul-18,Charcoal Fabric,I have been wanting one of these for a while n...,1
203,5,29-Jul-18,Charcoal Fabric,Very nice! I'm impressed - wish she had more ...,1
2371,5,30-Jul-18,Configuration: Fire TV Stick,Extremely easy to use! Switched away from dire...,1
806,2,30-Jul-18,Charcoal Fabric,Sound is terrible if u want good music too get...,0
2727,5,30-Jul-18,Black Dot,Love it. Added it to an upstairs bedroom,1


In [88]:
df['verified_reviews'][2]

'Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.'

In [89]:
df['verified_reviews'] = df['verified_reviews'].str.lower()

In [90]:
df['verified_reviews'][2]

'sometimes while playing a game, you can answer a question correctly but alexa says you got it wrong and answers the same as you.  i like being able to turn lights on and off while away from home.'

In [91]:
import string 

In [92]:
punc = string.punctuation

In [93]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [94]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text instead of one
# str.replace pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(data):
    """Return ``data`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without ASCII punctuation. Whitespace is left untouched, so
        consecutive spaces can remain where punctuation was deleted.
    """
    return data.translate(_PUNCTUATION_TABLE)

In [95]:
text = 'i have a data . ,'

In [96]:
remove_punctuation(text)

'i have a data  '

In [97]:
# Replace missing reviews with an empty string before casting: a bare
# astype(str) turns NaN into the literal word 'nan', which would later be
# tokenized and counted as a real term by the vectorizers.
df['verified_reviews'] = df['verified_reviews'].fillna('').astype(str)

In [98]:
df['verified_reviews'] =df['verified_reviews'].apply(remove_punctuation)

In [99]:
df['verified_reviews'][2]

'sometimes while playing a game you can answer a question correctly but alexa says you got it wrong and answers the same as you  i like being able to turn lights on and off while away from home'

In [108]:
import nltk

In [109]:
from nltk.tokenize import word_tokenize

In [110]:
def tokenize_word(data):
    """Split ``data`` into a list of tokens with ``nltk.word_tokenize``."""
    return nltk.word_tokenize(data)

In [111]:
tokenize_word(text)

['i', 'have', 'a', 'data', '.', ',']

In [112]:
df['verified_reviews'] = df['verified_reviews'].apply(tokenize_word)

In [113]:
df['verified_reviews'][2]

['sometimes',
 'playing',
 'game',
 'answer',
 'question',
 'correctly',
 'alexa',
 'says',
 'got',
 'wrong',
 'answers',
 'like',
 'able',
 'turn',
 'lights',
 'away',
 'home']

In [100]:
from nltk.corpus import stopwords

In [101]:
nltk.corpus.stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [102]:
# Make sure every review is one whitespace-separated string before stop-word
# removal. If the tokenization cell has already run, the column holds lists of
# tokens, and astype(str) would turn each list into its repr
# ("['love', 'echo']") rather than text, so join the tokens back together.
# Values that are already strings pass through unchanged.
df['verified_reviews'] = df['verified_reviews'].apply(
    lambda review: ' '.join(review) if isinstance(review, (list, tuple)) else str(review)
)

In [103]:
def remove_stopwords(data):
    """Drop English stop words from a whitespace-separated string.

    The text is split on whitespace, each word is compared in lower case
    against NLTK's English stop-word list, and the surviving words (original
    casing and order preserved) are joined back with single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = [token for token in data.split() if token.lower() not in english_stopwords]
    return ' '.join(kept)

In [104]:
data = 'hii i am a coder'

In [105]:
remove_stopwords(data)

'hii coder'

In [106]:
# Strip stop words from every review; the function is passed directly since
# it already takes exactly one argument.
df['verified_reviews'] = df['verified_reviews'].apply(remove_stopwords)

In [107]:
df['verified_reviews'][2]

'sometimes playing game answer question correctly alexa says got wrong answers like able turn lights away home'

In [114]:
from nltk.stem import PorterStemmer

In [115]:
stemmer = PorterStemmer()

In [116]:
def perform_stemming(text):
    """Stem every word of ``text`` with the notebook-level ``stemmer``.

    Parameters
    ----------
    text : str or iterable of str
        Either a sequence of tokens or a plain string. A plain string is split
        on whitespace first; iterating it directly would stem it one character
        at a time.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    words = text.split() if isinstance(text, str) else text
    return ' '.join(stemmer.stem(word) for word in words)

In [117]:
text1 = 'study studying studied' 

In [118]:
 words = nltk.word_tokenize(text1)

In [119]:
perform_stemming(words)

'studi studi studi'

In [120]:
df['verified_reviews'] = df['verified_reviews'].apply(perform_stemming)

In [70]:
# Import here so this cell works on a fresh kernel: the notebook's own import
# of WordNetLemmatizer sits in the cell after this one, so running the cells
# top to bottom would otherwise raise NameError.
from nltk.stem import WordNetLemmatizer

word_net = WordNetLemmatizer()

In [71]:
from nltk.stem import WordNetLemmatizer

In [72]:
sentence = ' study studying come came coming ate eat'
sentence_words = nltk.word_tokenize(sentence)

In [73]:
for word in sentence_words:
    print(word , word_net.lemmatize(word , pos = 'v'))

study study
studying study
come come
came come
coming come
ate eat
eat eat


In [74]:
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
cov = CountVectorizer()

In [80]:
bow = cov.fit_transform(df['verified_reviews'])

In [81]:
print(bow[0].toarray())

[[0 0 0 ... 0 0 0]]


In [82]:
print(bow[15].toarray())

[[0 0 0 ... 0 0 0]]


In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [122]:
tfidf = TfidfVectorizer()

In [126]:
tfidf_vector = tfidf.fit_transform(df['verified_reviews'])

In [127]:
tfidf_array = tfidf_vector.toarray()

In [128]:
tfidf_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [131]:
word_Set = tfidf.get_feature_names_out()

In [132]:
word_Set

array(['072318', '10', '100', ..., 'zzzz', 'zzzzzzz', 'útil'],
      dtype=object)

In [134]:
df_tf_idf = pd.DataFrame(tfidf_array, columns = word_Set)

df_tf_idf

Unnamed: 0,072318,10,100,1000,100x,1010,1030pm,11,1100sf,1220,...,youtubeit,yr,yup,zero,zigbe,zonkedout,zwave,zzzz,zzzzzzz,útil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.335516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
