In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### 0. Understanding the Business Problem
Uber Inc in the US wants to know:

- the major complaints premium users have about their cab services,
- how these impact service ratings.

We, as (technical) consultants to Uber, have to:  
- [a] analyze text reviews of Uber cabs’ US services,  
- [b] determine whether, and which, features of these reviews impact overall ratings,  
- [c] pinpoint possible areas of improvement.

### 1. Pre-processing: 
- Examine the dataset. 
- Identify the columns of interest. 
- Drop special characters, html junk etc. 
- Perform any other preprocessing and text-cleaning activity you think fits this context.

In [2]:
# Location of the raw iTunes review export. The default is the original
# author's local path; set the UBER_REVIEWS_CSV environment variable to run
# the notebook on another machine without editing this cell.
REVIEWS_CSV = os.environ.get(
    "UBER_REVIEWS_CSV",
    r"G:\ISB AMPBA\9. Text Analytics\Assignment\uber_reviews_itune.csv",
)

# The file is read as cp1252 (Windows-1252), as in the original cell.
df = pd.read_csv(REVIEWS_CSV, encoding='cp1252')
df.head()

Unnamed: 0,Author_Name,Title,Author_URL,App_Version,Rating,Review,Date
0,#NEVERUBER,Dishonest and Disgusting,https://itunes.apple.com/us/reviews/id663331949,3.434.10005,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,$$Heaven,Free offer,https://itunes.apple.com/us/reviews/id810421958,3.434.10005,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,.Disappointed....,Inaccurate,https://itunes.apple.com/us/reviews/id49598333,3.439.10000,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,.i. andrea,bad,https://itunes.apple.com/us/reviews/id689880334,3.434.10005,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,-:deka:-,Double charged me for an order,https://itunes.apple.com/us/reviews/id124963835,3.434.10005,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


Columns of interest:  
1. Title - Brief summary about the review
2. Rating - Label for supervised learning
3. Review - To extract the sentiment of the complaint
4. Date - Extracting weekday vs. weekend may give better insight into the nature of the review

### Data Cleaning

In [3]:
# Keep only the columns of interest (Title, Rating, Review, Date) by
# discarding the author/app metadata; `df` itself is left untouched.
df1 = df.drop(columns=['Author_Name', 'Author_URL', 'App_Version'])
df1.head()

Unnamed: 0,Title,Rating,Review,Date
0,Dishonest and Disgusting,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,Free offer,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,Inaccurate,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,bad,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,Double charged me for an order,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


In [4]:
# Replace emoticon placeholders with their respective meaning.
# The placeholders contain a literal '+', which is a quantifier in a regular
# expression; it must be escaped, otherwise '<U+0001F621>' only matches
# '<U0001F621>' / '<UU0001F621>' and the real placeholders are never replaced.
to_replace = [r'<U\+0001F621>', r'<U\+0001F615>', r'<U\+0001F44E>']
replace_with = ['pouting face', 'confused face', 'thumbs down']
df1['Review'] = df1['Review'].replace(to_replace, replace_with, regex=True)

In [5]:
# Keep only the text that precedes the first '<' in each review.
# NOTE(review): everything after the first '<' is discarded, including any
# ordinary text that follows it -- confirm this truncation is intended.
df1['Review'] = df1['Review'].str.split('<', n=1).str.get(0)
df1.shape

(490, 4)

In [6]:
# Inspect the review stored under index label 149: it is an empty string,
# which motivates dropping empty docs in the next step.
df1.loc[149, 'Review']

''

In [7]:
# Drop empty docs: mark empty-string reviews as missing, then remove them.
# Assign the result back instead of calling `inplace=True` on the selected
# column -- a chained in-place call operates on an intermediate object and is
# not guaranteed to update `df1` (pandas warns about it, and under
# Copy-on-Write it has no effect).
df1['Review'] = df1['Review'].replace('', np.nan)
df1 = df1.dropna(subset=['Review'])
df1.shape

(489, 4)

### Tokenization and Stemming

Below we import NLTK's sentence and word tokenizers, and stemmer. Note the use of list comprehension to bundle both into one line of efficient code.

Note also the use of regex from *re* to detect and drop any non alphabetic characters from the corpus.

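Find below two straightforward user-defined functions: the first splits a single review into sentences and scores each sentence with VADER; the second applies the first to every review in the corpus.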

We will apply these funcs on each doc in the corpus subsequently.

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One analyzer instance, reused by every scoring call in the notebook.
analyzer = SentimentIntensityAnalyzer()

# Sanity check: look at the raw VADER output for the first review.
vs0 = analyzer.polarity_scores(df1['Review'].iloc[0])
vs0

{'neg': 0.0, 'neu': 0.962, 'pos': 0.038, 'compound': 0.1406}

In [9]:
# Score the first review again, this time via a named variable.
sent1 = df1['Review'].iloc[0]
analyzer.polarity_scores(sent1)

{'neg': 0.0, 'neu': 0.962, 'pos': 0.038, 'compound': 0.1406}

In [10]:
# Number of reviews currently held in df1.
df1['Review'].shape[0]

489

In [11]:
# Score the reviews stored under index labels 1 and 2 and tabulate the
# resulting VADER dicts, one row per review.
sent_list = [analyzer.polarity_scores(df1.Review[label]) for label in (1, 2)]
sent1 = sent_list[-1]  # as before, sent1 ends up holding the scores for label 2
pd.DataFrame(sent_list)

Unnamed: 0,neg,neu,pos,compound
0,0.136,0.864,0.0,-0.296
1,0.179,0.821,0.0,-0.34


In [12]:
# define unit func to process one doc
from nltk import sent_tokenize, word_tokenize
def vader_unit_func(doc0):
    """Score each sentence of a single document with VADER.

    The document is split with ``sent_tokenize`` and every sentence is passed
    to the module-level ``analyzer``.

    Parameters
    ----------
    doc0 : str
        Text of one document.

    Returns
    -------
    pandas.DataFrame
        One row per sentence: ``sent_index`` (position of the sentence in the
        document) first, then the score columns returned by
        ``analyzer.polarity_scores``, and the sentence text last under
        ``sentence``.
    """
    sentences = sent_tokenize(doc0)

    # one score dict per sentence, in document order
    scores = [analyzer.polarity_scores(sentence) for sentence in sentences]

    scored = pd.DataFrame(scores)
    scored.insert(0, 'sent_index', list(range(len(sentences))))  # leading position column
    scored.insert(scored.shape[1], 'sentence', sentences)        # trailing text column
    return scored

# test-run unit func on nokia[0]
%time doc0_df = vader_unit_func(df1.Review.iloc[0])
doc0_df

Wall time: 17 ms


Unnamed: 0,sent_index,neg,neu,pos,compound,sentence
0,0,0.0,1.0,0.0,0.0,"For half an hour, we tried EVERY UBER SERVICE ..."
1,1,0.0,0.876,0.124,0.1406,Uber FALSELY advertised drop-offs by certain t...
2,2,0.0,1.0,0.0,0.0,"Finally, we downloded Lyft and immediately got..."


In [13]:
# define wrapper func
def vader_wrap_func(corpus0):
    """Apply ``vader_unit_func`` to every document of a corpus.

    Parameters
    ----------
    corpus0 : pandas.Series, list, tuple or pandas.DataFrame
        The documents. A list/tuple is wrapped in a Series. For a DataFrame
        the ``'text'`` column is used when present, otherwise the first
        column. Each document is converted with ``str()`` before scoring.

    Returns
    -------
    pandas.DataFrame
        Sentence-level scores for all documents with columns ``doc_index``
        (position of the document in the corpus), ``sent_index``, ``neg``,
        ``neu``, ``pos``, ``compound`` and ``sentence``. The row index is the
        per-document index produced by ``vader_unit_func`` and therefore
        repeats across documents.
    """
    out_cols = ['doc_index', 'sent_index', 'neg', 'neu', 'pos', 'compound', 'sentence']

    # Normalise the input to a Series of documents. Positional indexing on a
    # DataFrame would return a whole row, and str() of that row is its repr,
    # not the document text.
    if isinstance(corpus0, pd.DataFrame):
        corpus0 = corpus0['text'] if 'text' in corpus0.columns else corpus0.iloc[:, 0]
    elif isinstance(corpus0, (list, tuple)):
        corpus0 = pd.Series(list(corpus0), dtype=object)

    # Score each doc and collect the per-doc frames; concatenate once at the
    # end rather than growing a DataFrame inside the loop.
    doc_frames = []
    for i1 in range(len(corpus0)):
        vs_doc_df = vader_unit_func(str(corpus0.iloc[i1]))  # applying unit-func
        vs_doc_df.insert(0, 'doc_index', i1)                # inserting doc index
        doc_frames.append(vs_doc_df)

    if not doc_frames:
        return pd.DataFrame(columns=out_cols)

    # reindex keeps a fixed column order even if some doc yielded no sentences
    return pd.concat(doc_frames, axis=0).reindex(columns=out_cols)

# test-drive wrapper func
%time sentiment_df = vader_wrap_func(df1.Review)    
sentiment_df

Wall time: 1.8 s


Unnamed: 0,doc_index,sent_index,neg,neu,pos,compound,sentence
0,0,0,0.000,1.000,0.000,0.0000,"For half an hour, we tried EVERY UBER SERVICE ..."
1,0,1,0.000,0.876,0.124,0.1406,Uber FALSELY advertised drop-offs by certain t...
2,0,2,0.000,1.000,0.000,0.0000,"Finally, we downloded Lyft and immediately got..."
0,1,0,0.136,0.864,0.000,-0.2960,If I’m not eligible for the offer Stop floodin...
0,2,0,0.179,0.821,0.000,-0.3400,Consistently inaccurate Uber Eats ETA and the ...
...,...,...,...,...,...,...,...
2,487,2,0.000,1.000,0.000,0.0000,"When I tried to use it, I got a message saying..."
3,487,3,0.228,0.772,0.000,-0.7096,"There is no place in the app to report issues,..."
4,487,4,0.000,0.924,0.076,0.3679,"For getting a ride, this is a decent app, but ..."
5,487,5,0.000,0.674,0.326,0.4404,"I’ll be running with Grub Hub, thanks."
