In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

### 0. Understanding the Business Problem
Uber Inc in the US wants to know:

- the major complaints premium users have about their cab services,
- how these impact service ratings.

We as (technical) consultants to Uber. have to:  
- [a] analyze text reviews of Uber cabs’ US services,  
- [b] relate whether and which different features of these reviews impact overall ratings  
- [c] pinpoint possible areas of improvement.

### 1. Pre-processing: 
- Examine the dataset. 
- ID the columns of interest. 
- Drop special characters, html junk etc. 
- Perform any other preprocessing and text-cleaning activity you think fits this context.

In [2]:
df = pd.read_csv(r"G:\ISB AMPBA\9. Introduction to Text Analytics\Assignment\uber_reviews_itune.csv",
                 encoding='cp1252')
df.head()

Unnamed: 0,Author_Name,Title,Author_URL,App_Version,Rating,Review,Date
0,#NEVERUBER,Dishonest and Disgusting,https://itunes.apple.com/us/reviews/id663331949,3.434.10005,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,$$Heaven,Free offer,https://itunes.apple.com/us/reviews/id810421958,3.434.10005,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,.Disappointed....,Inaccurate,https://itunes.apple.com/us/reviews/id49598333,3.439.10000,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,.i. andrea,bad,https://itunes.apple.com/us/reviews/id689880334,3.434.10005,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,-:deka:-,Double charged me for an order,https://itunes.apple.com/us/reviews/id124963835,3.434.10005,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


Columns of interest:  
1. Title - Brief summary about the review
2. Rating - Label for supervised learning
3. Review - To extract the sentiment of the complaint
4. Date - Extracting weekday or weekend may give better insight on nature of review

### Data Cleaning

In [3]:
df1 = df.drop(['Author_Name','Author_URL','App_Version'],axis=1)
df1.head()

Unnamed: 0,Title,Rating,Review,Date
0,Dishonest and Disgusting,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,Free offer,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,Inaccurate,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,bad,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,Double charged me for an order,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


In [4]:
to_replace=['<U+0001F621>','<U+0001F615>','<U+0001F44E>']
replace_with=['pouting face','confused face','thumbs down']
df1.Review=df1.Review.replace(to_replace, replace_with, regex=True)

In [5]:
df1.Review = df1.Review.str.split('<').str[0]
df1.shape

(490, 4)

In [9]:
# drop empty rows or docs
df1.Review[149]

''

In [11]:
df1['Review'].replace('', np.nan, inplace=True)
df1.dropna(subset=['Review'], inplace=True)
df1.shape

(489, 4)

### Tokenization and Stemming

Below we import NLTK's sentence and word tokenizer, and stemmer. Note the use of list comprehension to bundle both into one  line of efficient code.

Note also the use of regex from *re* to detect and drop any non alphabetic characters from the corpus.

Find below two straightforward user defined funcs to tokenize (and stem).

We will apply these funcs on each doc in the corpus subsequently.

In [13]:
# load nltk's English stopwords as variable called 'stopwords'
import nltk, re, requests
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

## here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) using regex
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [16]:
# Use above funcs to iterate over the list of synopses to create two vocabularies: one stemmed and one only tokenized. 
totalvocab_stemmed = []
totalvocab_tokenized = []
reviews = df1.Review.tolist()

t0 = time.time()
for i in reviews:
    
    # doing both toknz & stemming
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    # doing toknz only
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
t1 = time.time()
print(round(t1-t0, 3))    # 0.2 s

2.343


In [21]:
## create a pandas DataFrame with the stemmed vocabulary as the index and the tokenized words as the column
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized})
vocab_frame

Unnamed: 0,words
0,for
1,half
2,an
3,hour
4,we
...,...
30606,i
30607,m
30608,done
30609,using


In [18]:
## Tf-idf and document similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# defining parms for the tfidf-tokenizer here
tfidf_vectorizer = TfidfVectorizer(max_df=1, # max proportion of docs word is present in
                                   max_features=200000,
                                   min_df=0, 
                                   stop_words='english',
                                   use_idf=True, 
                                   tokenizer=tokenize_and_stem, 
                                   ngram_range=(1,3))

# note magic cmd %time
%time tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)    # 6.05 secs

print(tfidf_matrix.shape)    # dimns of the tfidf matrix



Wall time: 1.91 s
(489, 23674)


In [19]:
terms = tfidf_vectorizer.get_feature_names()
terms[:20]

["'d",
 "'d enrol",
 "'d enrol reoccur",
 "'m illinoi",
 "'m illinoi inland",
 "'m late",
 "'m late person",
 "'m pretti",
 "'m pretti late",
 "'m rich",
 "'m rich use",
 "'m sick",
 "'m sick fake",
 "'m talk",
 "'m talk just",
 "'m told",
 "'m told email",
 "'s lyft",
 "'s lyft larg",
 "'s minut"]

In [20]:
print(type(tfidf_matrix))

tfidf_matrix.todense()[:5,:5]

<class 'scipy.sparse.csr.csr_matrix'>


matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [23]:
tfidf_matrix.shape

(489, 23674)

In [25]:
tfidf_matrix.todense()[:10,:10]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])