In [None]:
import requests
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re

In [5]:
Df_pd = pd.read_csv("TAJReviews.csv")
Df_pd

Unnamed: 0,UserName,Reviews,Rating,Date
0,H Kim,World's Largest White Marble Structure - Very ...,5 stars,a week ago
1,Kaavish Lekchumanan,Really a beautiful and mesmerizing place to sp...,5 stars,a month ago
2,Sree Hari,One of the most beautiful place I have ever se...,5 stars,3 weeks ago
3,Nikhil Jadon,It was amazing and awesome Experience. I visit...,5 stars,2 months ago
4,Nagen kumar Sarangi,One of the most popular place all over the wor...,5 stars,3 days ago
...,...,...,...,...
453,Gulfam Khan,The symbol of love ❤️ beauty 😍 great place to ...,5 stars,4 months ago
454,Abhishek Julaha,Proud to have such a place in india ... truly ...,5 stars,3 months ago
455,anurag sharma,Amazing experience..no words to describe the b...,5 stars,3 months ago
456,Banoth Jithender,Very beautiful and enjoyed a lot. it is unforg...,5 stars,3 months ago


In [6]:
Rev_list=Df_pd["Reviews"]

In [7]:
def preprocess(text):
    clean_data = []
    for x in (text[:]): 
        new_text = re.sub('<.*?>', '', x)   # remove HTML tags
        new_text = re.sub(r'[^\w\s]', '', new_text) # remove punc.
        new_text = re.sub(r'\d+','',new_text)# remove numbers
        new_text = new_text.lower() # lower case, .upper() for upper          
        if new_text != '':
            clean_data.append(new_text)
    return clean_data

In [8]:
def tokenization_w(words):
    w_new = []
    for w in (words[:]):
        w_token = word_tokenize(w)
        if w_token != '':
            w_new.append(w_token)
    return w_new

In [9]:
lemmatizer = WordNetLemmatizer()
def lemmatization(words):
    new = []
    for i in range(len(Rev_list)):
        lem_words = [lemmatizer.lemmatize(x) for x in (words[:][i])]
        new.append(lem_words)
    return new

In [10]:
clean_list = preprocess(Rev_list) #removes punctuation, see above
clean_words = tokenization_w(clean_list) # word tokenization
lem = lemmatization(clean_words)
lem

[['world',
  'largest',
  'white',
  'marble',
  'structure',
  'very',
  'hot',
  'during',
  'summer',
  'visit',
  'be',
  'prepared',
  'to',
  'sweat',
  'when',
  'you',
  'enter',
  'here',
  'and',
  'dont',
  'buy',
  'anything',
  'with',
  'white',
  'marble',
  'statue',
  'they',
  'are',
  'overpriced',
  'do',
  'not',
  'buy',
  'the',
  'white',
  'marble',
  'gift',
  'a',
  'they',
  'are',
  'x',
  'time',
  'expensive',
  'it',
  'is',
  'tempting',
  'but',
  'it',
  'is',
  'useless'],
 ['really',
  'a',
  'beautiful',
  'and',
  'mesmerizing',
  'place',
  'to',
  'spend',
  'your',
  'time',
  'with',
  'your',
  'loved',
  'one',
  'every',
  'inch',
  'tell',
  'you',
  'one',
  'beautiful',
  'technical',
  'n',
  'marvellous',
  'story',
  'you',
  'should',
  'recruit',
  'a',
  'guide',
  'or',
  'do',
  'a',
  'proper',
  'research',
  'then',
  'enter',
  'to',
  'feel',
  'the',
  'effort',
  'and',
  'brain'],
 ['one',
  'of',
  'the',
  'most',
  'be

In [12]:
new_review = []
for i in lem:
    new_review.append(" ".join(i))
new_review

['world largest white marble structure very hot during summer visit be prepared to sweat when you enter here and dont buy anything with white marble statue they are overpriced do not buy the white marble gift a they are x time expensive it is tempting but it is useless',
 'really a beautiful and mesmerizing place to spend your time with your loved one every inch tell you one beautiful technical n marvellous story you should recruit a guide or do a proper research then enter to feel the effort and brain',
 'one of the most beautiful place i have ever seen in my life it a place peaceful place i have seen in my life i visited the place during our college tour it a monument of true love also it architectural detail are amazing probably',
 'it wa amazing and awesome experience i visited this place and now i revisited the memorial of love along with my better behalf i am proud because of one of the seven wonder of the world in my country india come back with lot of',
 'one of the most popula

In [13]:
data = pd.DataFrame({'review':new_review})
data['review']

0      world largest white marble structure very hot ...
1      really a beautiful and mesmerizing place to sp...
2      one of the most beautiful place i have ever se...
3      it wa amazing and awesome experience i visited...
4      one of the most popular place all over the wor...
                             ...                        
453    the symbol of love beauty great place to explo...
454    proud to have such a place in india truly it t...
455    amazing experienceno word to describe the beau...
456    very beautiful and enjoyed a lot it is unforge...
457    one of the world heritage place mughal emperor...
Name: review, Length: 458, dtype: object

In [14]:
data['word_count'] = data['review'].apply(lambda x: len(str(x).split(" ")))
data[['review','word_count']].head()

Unnamed: 0,review,word_count
0,world largest white marble structure very hot ...,49
1,really a beautiful and mesmerizing place to sp...,42
2,one of the most beautiful place i have ever se...,45
3,it wa amazing and awesome experience i visited...,45
4,one of the most popular place all over the wor...,28


In [15]:
data['char_count'] = data['review'].str.len() ## this also includes spaces
data[['review','char_count']].head()

Unnamed: 0,review,char_count
0,world largest white marble structure very hot ...,267
1,really a beautiful and mesmerizing place to sp...,233
2,one of the most beautiful place i have ever se...,229
3,it wa amazing and awesome experience i visited...,227
4,one of the most popular place all over the wor...,174


In [16]:
def avg_word(sentence):
  words = sentence.split()
  print(words)
  print(len(words))
  print(sum(len(word) for word in words))
  if words == []:
    return 0
  return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['review'].apply(lambda x: avg_word(x))
data[['review','avg_word']].head()

['world', 'largest', 'white', 'marble', 'structure', 'very', 'hot', 'during', 'summer', 'visit', 'be', 'prepared', 'to', 'sweat', 'when', 'you', 'enter', 'here', 'and', 'dont', 'buy', 'anything', 'with', 'white', 'marble', 'statue', 'they', 'are', 'overpriced', 'do', 'not', 'buy', 'the', 'white', 'marble', 'gift', 'a', 'they', 'are', 'x', 'time', 'expensive', 'it', 'is', 'tempting', 'but', 'it', 'is', 'useless']
49
219
['really', 'a', 'beautiful', 'and', 'mesmerizing', 'place', 'to', 'spend', 'your', 'time', 'with', 'your', 'loved', 'one', 'every', 'inch', 'tell', 'you', 'one', 'beautiful', 'technical', 'n', 'marvellous', 'story', 'you', 'should', 'recruit', 'a', 'guide', 'or', 'do', 'a', 'proper', 'research', 'then', 'enter', 'to', 'feel', 'the', 'effort', 'and', 'brain']
42
192
['one', 'of', 'the', 'most', 'beautiful', 'place', 'i', 'have', 'ever', 'seen', 'in', 'my', 'life', 'it', 'a', 'place', 'peaceful', 'place', 'i', 'have', 'seen', 'in', 'my', 'life', 'i', 'visited', 'the', 'pla

Unnamed: 0,review,avg_word
0,world largest white marble structure very hot ...,4.469388
1,really a beautiful and mesmerizing place to sp...,4.571429
2,one of the most beautiful place i have ever se...,4.111111
3,it wa amazing and awesome experience i visited...,4.066667
4,one of the most popular place all over the wor...,5.25


In [17]:
#Number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['stopwords'] = data['review'].apply(lambda x: len([x for x in x.split() if x in stop]))
data[['review','stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,review,stopwords
0,world largest white marble structure very hot ...,22
1,really a beautiful and mesmerizing place to sp...,17
2,one of the most beautiful place i have ever se...,22
3,it wa amazing and awesome experience i visited...,23
4,one of the most popular place all over the wor...,13


In [18]:
#Number of special characters
data['hastags'] = data['review'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[['review','hastags']].head()

Unnamed: 0,review,hastags
0,world largest white marble structure very hot ...,0
1,really a beautiful and mesmerizing place to sp...,0
2,one of the most beautiful place i have ever se...,0
3,it wa amazing and awesome experience i visited...,0
4,one of the most popular place all over the wor...,0


In [19]:
#Number of numerics
data['numerics'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[['review','numerics']].head()

Unnamed: 0,review,numerics
0,world largest white marble structure very hot ...,0
1,really a beautiful and mesmerizing place to sp...,0
2,one of the most beautiful place i have ever se...,0
3,it wa amazing and awesome experience i visited...,0
4,one of the most popular place all over the wor...,0


In [20]:
#Number of Uppercase words
data['upper'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data[['review','upper']].head()

Unnamed: 0,review,upper
0,world largest white marble structure very hot ...,0
1,really a beautiful and mesmerizing place to sp...,0
2,one of the most beautiful place i have ever se...,0
3,it wa amazing and awesome experience i visited...,0
4,one of the most popular place all over the wor...,0


In [21]:
pos_family={
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
                print(ppo, tup)
    except:
        pass
    return cnt

data['noun_count'] = data['review'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['review'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['review'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['review'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['review'].apply(lambda x: check_pos_tag(x, 'pron'))
data[['review','noun_count','verb_count','adj_count', 'adv_count', 'pron_count' ]].head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
VB ('see', 'VB')
VB ('recommend', 'VB')
VB ('be', 'VB')
VBN ('recommended', 'VBN')
VBZ ('etc', 'VBZ')
VBZ ('is', 'VBZ')
VB ('make', 'VB')
VB ('pickpocket', 'VB')
VB ('travel', 'VB')
VBN ('enjoyed', 'VBN')
VBZ ('is', 'VBZ')
VBZ ('is', 'VBZ')
VBD ('wa', 'VBD')
VBN ('built', 'VBN')
VBZ ('is', 'VBZ')
VBZ ('is', 'VBZ')
VBN ('known', 'VBN')
VBZ ('is', 'VBZ')
VBN ('hidden', 'VBN')
VB ('ha', 'VB')
VB ('known', 'VB')
VB ('visit', 'VB')
VBG ('amazing', 'VBG')
VBN ('used', 'VBN')
VB ('build', 'VB')
VBD ('came', 'VBD')
VB ('visit', 'VB')
VB ('visit', 'VB')
VBN ('come', 'VBN')
VB ('visit', 'VB')
VB ('visit', 'VB')
VBZ ('is', 'VBZ')
VB ('be', 'VB')
VBN ('seen', 'VBN')
VB ('be', 'VB')
VBN ('seen', 'VBN')
VBZ ('is', 'VBZ')
VBG ('amazing', 'VBG')
VBZ ('is', 'VBZ')
VBP ('want', 'VBP')
VB ('enjoy', 'VB')
VB ('please', 'VB')
VB ('do', 'VB')
VBP ('enter', 'VBP')
VBG ('amazing', 'VBG')
VBP ('dont', 'VBP')
VB ('want', 'VB')
VB ('go', 'VB')
VBZ 

Unnamed: 0,review,noun_count,verb_count,adj_count,adv_count,pron_count
0,world largest white marble structure very hot ...,9,11,12,4,5
1,really a beautiful and mesmerizing place to sp...,10,7,5,2,4
2,one of the most beautiful place i have ever se...,13,8,4,4,6
3,it wa amazing and awesome experience i visited...,11,7,3,2,3
4,one of the most popular place all over the wor...,8,4,3,2,1


In [22]:
data.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count
0,world largest white marble structure very hot ...,49,267,4.469388,22,0,0,0,9,11,12,4,5
1,really a beautiful and mesmerizing place to sp...,42,233,4.571429,17,0,0,0,10,7,5,2,4
2,one of the most beautiful place i have ever se...,45,229,4.111111,22,0,0,0,13,8,4,4,6
3,it wa amazing and awesome experience i visited...,45,227,4.066667,23,0,0,0,11,7,3,2,3
4,one of the most popular place all over the wor...,28,174,5.25,13,0,0,0,8,4,3,2,1


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv=CountVectorizer()
A_vec = cv.fit_transform(new_review)
print(A_vec.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
tv=TfidfVectorizer()
t_vec = tv.fit_transform(new_review)
print(t_vec.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
feature_names = tv.get_feature_names()

dense = t_vec.todense()
denselist = dense.tolist()
daf = pd.DataFrame(denselist, columns=feature_names)
feature_names

['abd',
 'able',
 'about',
 'above',
 'absolute',
 'absolutely',
 'abusive',
 'acclaim',
 'according',
 'accuracy',
 'achievement',
 'acre',
 'across',
 'actually',
 'adhaar',
 'admirable',
 'admired',
 'admittedly',
 'adorn',
 'adorned',
 'advance',
 'advice',
 'advisable',
 'af',
 'affair',
 'affection',
 'after',
 'afternoon',
 'again',
 'agara',
 'ago',
 'agoa',
 'agra',
 'agraits',
 'agravery',
 'agreement',
 'ahead',
 'aint',
 'air',
 'alex_khan_official',
 'all',
 'allah',
 'allow',
 'allowed',
 'allows',
 'almost',
 'along',
 'alongside',
 'alot',
 'also',
 'although',
 'always',
 'am',
 'amazed',
 'amazes',
 'amazing',
 'amazingly',
 'amazon',
 'ambiance',
 'america',
 'among',
 'amongst',
 'amount',
 'an',
 'ancestor',
 'ancient',
 'and',
 'angel',
 'angle',
 'animal',
 'another',
 'any',
 'anybody',
 'anyone',
 'anything',
 'appears',
 'appreciate',
 'appreciated',
 'approached',
 'approximately',
 'april',
 'aptly',
 'archetec',
 'architechture',
 'architect',
 'architectja

In [29]:
df_c =pd.concat([daf,data], axis=1)
df_c.head()

Unnamed: 0,abd,able,about,above,absolute,absolutely,abusive,acclaim,according,accuracy,achievement,acre,across,actually,adhaar,admirable,admired,admittedly,adorn,adorned,advance,advice,advisable,af,affair,affection,after,afternoon,again,agara,ago,agoa,agra,agraits,agravery,agreement,ahead,aint,air,alex_khan_official,...,worldits,worldthis,worldwide,worm,worn,worry,worst,worth,worthy,would,wow,written,writting,yamuna,year,yes,yet,you,youd,youll,your,youre,yourself,youve,ˈmɛːɦəl,ˌtɑːdʒ,ˌtɑːʒ,review,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,world largest white marble structure very hot ...,49,267,4.469388,22,0,0,0,9,11,12,4,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177244,0.0,0.0,0.2526,0.0,0.0,0.0,0.0,0.0,0.0,really a beautiful and mesmerizing place to sp...,42,233,4.571429,17,0,0,0,10,7,5,2,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,one of the most beautiful place i have ever se...,45,229,4.111111,22,0,0,0,13,8,4,4,6
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,it wa amazing and awesome experience i visited...,45,227,4.066667,23,0,0,0,11,7,3,2,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,one of the most popular place all over the wor...,28,174,5.25,13,0,0,0,8,4,3,2,1
