In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Rating column of RC.xlsx analysed by the cells below. Column names noted by the author:
# Review Rating , Service & Support , Information Depth . Content, User Friendly, Time to Load, Overall Rating
RatingColumn = "Service & Support"


# Read the spreadsheet, then keep just the review text, the OTA label and the
# selected rating column under short, fixed column names.
reviews_df_main = pd.read_excel("RC.xlsx")
reviews_df = pd.DataFrame({
    'review': reviews_df_main["Review"],
    'OTA': reviews_df_main['OTA'],
    'rating': reviews_df_main[RatingColumn],
})

reviews_df.head()

Unnamed: 0,review,OTA,rating
0,"Hello all, I am writing a review on Makemytrip...",MMT,4.0
1,"Hello Friends,\nI have booked a room in Raddis...",MMT,1.0
2,I was trying to book an intl flight for 4 pass...,MMT,
3,Unbeatable match with other trip booking apps ...,MMT,4.0
4,They cannot be reliable in matter of hotel boo...,MMT,1.0


In [None]:
!pip install xlsxwriter
import xlsxwriter 

workbook = xlsxwriter.Workbook('Rating.xlsx')



In [None]:
# Cleaning
#
# Rows with any missing value are dropped *before* the text is touched: a missing
# review is a float NaN, which has no .replace() and would raise AttributeError.
# Reviews are cast to str so a non-text cell cannot raise for the same reason,
# then every newline is replaced by a single space.
# The result is bound to a new frame instead of mutating in place, so the cell
# can be re-run safely.
reviews_df = (
    reviews_df
    .dropna()
    .assign(review=lambda df: df["review"].astype(str).str.replace("\n", " ", regex=False))
)


In [None]:

# NLTK resources used by the text-cleaning helpers defined below:
# WordNet (lemmatizer), the stop-word lists and the perceptron POS tagger.
import nltk
for resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import wordnet

# get_wordnet_pos (below) returns the wordnet object value corresponding to the POS tag


def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
      
def get_noun(pos_tag):
    """Return wordnet.NOUN when the tag starts with 'N'; otherwise return None."""
    return wordnet.NOUN if pos_tag.startswith('N') else None
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise one review into a space-joined string of lemmas.

    Steps, in order: lower-case; split on single spaces and strip leading and
    trailing punctuation from each token; drop tokens containing a digit; drop
    English stop words; drop empty tokens; POS-tag; lemmatize each token using
    the POS from get_wordnet_pos; drop one-character results; join with spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    # A set gives O(1) membership tests; scanning the stop-word list for every
    # token was needlessly quadratic.
    stop = set(stopwords.words('english'))
    # One lemmatizer for the whole review instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    # lower text
    text = text.lower()
    # tokenize text and remove punctuation
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words
    tokens = [word for word in tokens if word not in stop]
    # remove empty tokens
    tokens = [word for word in tokens if len(word) > 0]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize text
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # remove words with only one letter
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]
    # join all
    return " ".join(lemmas)

# Coerce every review to str, then store its cleaned form in a separate column
# so the raw text stays available.
reviews_df["review"] = reviews_df["review"].map(str)
reviews_df["review_clean"] = reviews_df["review"].map(clean_text)

reviews_df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,review,OTA,rating,review_clean
0,"Hello all, I am writing a review on Makemytrip...",MMT,4.0,hello write review makemytrip.com reference ex...
1,"Hello Friends, I have booked a room in Raddiso...",MMT,1.0,hello friend book room raddison atrium banglor...
3,Unbeatable match with other trip booking apps ...,MMT,4.0,unbeatable match trip book apps trust worthy a...
4,They cannot be reliable in matter of hotel boo...,MMT,1.0,cannot reliable matter hotel booking happen ha...
10,I had recently visited Udipur( rajsthan) with ...,MMT,4.0,recently visit udipur rajsthan family vacation...


In [None]:
# add sentiment analysis columns
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# polarity_scores returns one dict per review; building the frame from the list
# of dicts in one go avoids the slow per-row `.apply(pd.Series)` expansion.
# The scores are computed on the raw review text, not on review_clean.
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(review) for review in reviews_df["review"]],
    index=reviews_df.index,
)

# Drop score columns left behind by a previous run of this cell, so running it
# again replaces them instead of appending duplicate column names.
reviews_df = pd.concat(
    [reviews_df.drop(columns=sentiment_scores.columns, errors="ignore"), sentiment_scores],
    axis=1,
)
reviews_df.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,review,OTA,rating,review_clean,neg,neu,pos,compound
0,"Hello all, I am writing a review on Makemytrip...",MMT,4.0,hello write review makemytrip.com reference ex...,0.0,0.785,0.215,0.997
1,"Hello Friends, I have booked a room in Raddiso...",MMT,1.0,hello friend book room raddison atrium banglor...,0.097,0.86,0.043,-0.4435
3,Unbeatable match with other trip booking apps ...,MMT,4.0,unbeatable match trip book apps trust worthy a...,0.0,0.729,0.271,0.9837
4,They cannot be reliable in matter of hotel boo...,MMT,1.0,cannot reliable matter hotel booking happen ha...,0.0,0.954,0.046,0.431
10,I had recently visited Udipur( rajsthan) with ...,MMT,4.0,recently visit udipur rajsthan family vacation...,0.0,0.825,0.175,0.9575


In [None]:
# Length features, both measured on the raw review text.
review_text = reviews_df["review"]

# number of characters
reviews_df["nb_chars"] = review_text.map(len)

# number of space-separated pieces
reviews_df["nb_words"] = review_text.map(lambda review: len(review.split(" ")))

In [None]:

# ratingList[k] holds the reviews rated k+1: a Series indexed by OTA whose
# values are the lists of those ratings, so len(...) of a value is a count.
ratingList = [
    reviews_df[reviews_df["rating"] == float(stars)].groupby('OTA')["rating"].apply(list)
    for stars in range(1, 6)
]

# Sanity check: total rated reviews for the OTA in first position of each group.
# .iloc makes the positional lookup explicit; a bare [0] on an OTA-labelled
# Series depends on the positional fallback that pandas has deprecated.
# NOTE(review): positions only line up across ratings if every OTA has at least
# one review for every rating value — confirm for the chosen column.
first_ota_total = 0
for per_rating in ratingList:
    first_ota_total += len(per_rating.iloc[0])
print(first_ota_total)

250


In [None]:
# Spot check: the rating-2 entries for the OTA at position 4 of that group.
# .iloc is explicit positional access; plain [4] on a label-indexed Series
# relies on a deprecated pandas fallback.
print(ratingList[1].iloc[4])

[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]


In [None]:
!pip install xlsxwriter
import xlsxwriter 
  
workbook = xlsxwriter.Workbook('temp.xlsx') 
worksheet = workbook.add_worksheet() 

sum = []
percent = []


for i in range(5):
  sumval = 0
  for j in range(5):

    sumval += len(ratingList[j][i]) 
  sum.append(sumval)

for i in range(5):
  percent = []
  for j in range(5):

    percent.append(round(((len(ratingList[j][i]) * 100) / sum[i] ) , 1 ))
    worksheet.write(i, j, round(((len(ratingList[j][i]) * 100) / sum[i] ) , 1 )) 
  worksheet.write(i, 5, sum[i])
  percent.append(sum[i])
  print(percent)
workbook.close()
  







Collecting xlsxwriter
[?25l  Downloading https://files.pythonhosted.org/packages/f2/16/da654cfbc0b05f2ad253c0f244b0c2a76c403bb774717b39c92653acb290/XlsxWriter-1.2.6-py2.py3-none-any.whl (141kB)
[K     |██▎                             | 10kB 17.8MB/s eta 0:00:01[K     |████▋                           | 20kB 7.1MB/s eta 0:00:01[K     |███████                         | 30kB 9.7MB/s eta 0:00:01[K     |█████████▎                      | 40kB 6.1MB/s eta 0:00:01[K     |███████████▋                    | 51kB 7.4MB/s eta 0:00:01[K     |██████████████                  | 61kB 8.7MB/s eta 0:00:01[K     |████████████████▎               | 71kB 9.8MB/s eta 0:00:01[K     |██████████████████▋             | 81kB 10.8MB/s eta 0:00:01[K     |████████████████████▉           | 92kB 11.9MB/s eta 0:00:01[K     |███████████████████████▏        | 102kB 9.7MB/s eta 0:00:01[K     |█████████████████████████▌      | 112kB 9.7MB/s eta 0:00:01[K     |███████████████████████████▉    | 122kB 9.

In [None]:

# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in newer Matplotlib
# releases; use whichever name this installation provides.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
UniqueRating=['1','2','3','4','5']
OTA = ["ClearTrip", "Goibibo","MMT", "Redbus", "Yatra"]

# One pie chart per OTA showing how its reviews split across ratings 1..5.
figures = []
axes_list = []
for ota_name in OTA:
    fig, ax = plt.subplots(1, 1)
    # .get(..., []) counts 0 when this OTA has no review with a given rating,
    # instead of raising KeyError.
    values = [len(ratingList[i].get(ota_name, [])) for i in range(5)]
    ax.set_title(ota_name)
    ax.pie(values, labels=UniqueRating, autopct='%1.1f%%')
    figures.append(fig)
    axes_list.append(ax)

# Keep the individual names available; the PDF export cell refers to fig1..fig5.
fig1, fig2, fig3, fig4, fig5 = figures
axes1, axes2, axes3, axes4, axes5 = axes_list

plt.show()

In [None]:
import matplotlib.backends.backend_pdf

# Write the five pie charts to "<RatingColumn>.pdf", one page per figure.
# Each figure is saved exactly once: looping over range(1, plt.gcf().number + 1)
# re-saved all five on every pass, and plt.gcf() itself opens a new empty figure
# when none is active. The `with` block closes the PDF even if a save fails.
with matplotlib.backends.backend_pdf.PdfPages(RatingColumn+".pdf") as pdf:
    for figure in (fig1, fig2, fig3, fig4, fig5):
        pdf.savefig(figure)

<Figure size 576x396 with 0 Axes>

In [None]:
# Scratch check: the integers 0..6 as a list.
print(list(range(7)))

[0, 1, 2, 3, 4, 5, 6]


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

# NLTK resources used by the text-cleaning helpers in this cell:
# WordNet (lemmatizer), the stop-word lists and the perceptron POS tagger.
import nltk
for resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import wordnet


# Review Rating , Service & Support , Information Depth . Content, User Friendly, Time to Load, Overall Rating
Cols = ["Review Rating" , "Service & Support" , "Information Depth" , "Content", "User Friendly ", "Time to Load", "Overall Rating"]

!pip install xlsxwriter
import xlsxwriter 

workbook = xlsxwriter.Workbook('Rating.xlsx')


# read data
reviews_df_main = pd.read_excel("RC.xlsx")

# ---------------------------------------------------------------------------
# Helpers. Defined once, ahead of the loop, rather than on every iteration.
# ---------------------------------------------------------------------------

def get_wordnet_pos(pos_tag):
    """Return the WordNet POS constant for a POS tag string.

    Tags starting with 'J', 'V', 'N', 'R' map to adjective, verb, noun and
    adverb respectively; any other tag is treated as a noun.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def get_noun(pos_tag):
    """Return wordnet.NOUN for tags starting with 'N'; otherwise None."""
    if pos_tag.startswith('N'):
        return wordnet.NOUN


def clean_text(text):
    """Normalise one review into a space-joined string of lemmas.

    Lower-cases, splits on single spaces, strips surrounding punctuation,
    drops tokens containing digits, English stop words and empty tokens,
    POS-tags and lemmatizes the rest, then drops one-character results.
    """
    # Set lookup is O(1) per token; one lemmatizer serves the whole review.
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # lower text
    text = text.lower()
    # tokenize text and remove punctuation
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words
    tokens = [word for word in tokens if word not in stop]
    # remove empty tokens
    tokens = [word for word in tokens if len(word) > 0]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize text
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # remove words with only one letter
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]
    # join all
    return " ".join(lemmas)


# The sentiment lexicon and analyzer are needed once, not once per rating column.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Row labels of every worksheet. Counts are looked up by these labels, so they
# must match the values of the OTA column in RC.xlsx.
OTA = ["ClearTrip", "Goibibo","MMT", "Redbus", "Yatra"]

# try/finally: the workbook is finalised even if one rating column fails.
try:
    for RatingColumn in Cols:
        reviews_df = pd.DataFrame({
            'review': reviews_df_main["Review"],
            'OTA': reviews_df_main['OTA'],
            'rating': reviews_df_main[RatingColumn],
        })

        # cleaning: drop incomplete rows first (a missing review is a float and
        # has no .replace), then replace newlines with spaces
        reviews_df = (
            reviews_df
            .dropna()
            .assign(review=lambda df: df["review"].astype(str).str.replace("\n", " ", regex=False))
        )

        # clean text data
        reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)

        # add sentiment analysis columns (scored on the raw review text)
        sentiment_scores = pd.DataFrame(
            [sid.polarity_scores(review) for review in reviews_df["review"]],
            index=reviews_df.index,
        )
        reviews_df = pd.concat([reviews_df, sentiment_scores], axis=1)

        # ratingList[k]: Series indexed by OTA holding the list of ratings equal to k+1
        ratingList = [
            reviews_df[reviews_df["rating"] == float(stars)].groupby('OTA')["rating"].apply(list)
            for stars in range(1, 6)
        ]

        # Sheet layout: header row = column name, 1..5, "Total";
        # then one row per OTA with the count of each rating and the row total.
        worksheet = workbook.add_worksheet(RatingColumn)
        worksheet.write(0, 0, RatingColumn)
        for stars in range(1, 6):
            worksheet.write(0, stars, stars)
        worksheet.write(0, 6, "Total")

        for row, ota_name in enumerate(OTA, start=1):
            worksheet.write(row, 0, ota_name)
            # Running total instead of a list named `sum`, which would shadow
            # the built-in for the rest of the notebook.
            total = 0
            for col, per_rating in enumerate(ratingList, start=1):
                # Label lookup with a default: an OTA with no review for this
                # rating counts as 0 and cannot shift other OTAs' rows.
                count = len(per_rating.get(ota_name, []))
                worksheet.write(row, col, count)
                total += count
            worksheet.write(row, 6, total)
finally:
    workbook.close()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[n