In [28]:
# !pip install vaderSentiment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import timea
import re

ModuleNotFoundError: No module named 'timea'

### 0. Understanding the Business Problem
Uber Inc in the US wants to know:

- the major complaints premium users have about their cab services,
- how these impact service ratings.

We as (technical) consultants to Uber. have to:  
- [a] analyze text reviews of Uber cabs’ US services,  
- [b] relate whether and which different features of these reviews impact overall ratings  
- [c] pinpoint possible areas of improvement.

### 1. Pre-processing: 
- Examine the dataset. 
- ID the columns of interest. 
- Drop special characters, html junk etc. 
- Perform any other preprocessing and text-cleaning activity you think fits this context.

In [2]:
file_path = r"uber_reviews_itune.csv"

In [3]:
df = pd.read_csv(file_path,
                 encoding='cp1252')
df.head()

Unnamed: 0,Author_Name,Title,Author_URL,App_Version,Rating,Review,Date
0,#NEVERUBER,Dishonest and Disgusting,https://itunes.apple.com/us/reviews/id663331949,3.434.10005,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,$$Heaven,Free offer,https://itunes.apple.com/us/reviews/id810421958,3.434.10005,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,.Disappointed....,Inaccurate,https://itunes.apple.com/us/reviews/id49598333,3.439.10000,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,.i. andrea,bad,https://itunes.apple.com/us/reviews/id689880334,3.434.10005,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,-:deka:-,Double charged me for an order,https://itunes.apple.com/us/reviews/id124963835,3.434.10005,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


Columns of interest:  
1. Title - Brief summary about the review
2. Rating - Label for supervised learning
3. Review - To extract the sentiment of the complaint
4. Date - Extracting weekday or weekend may give better insight on nature of review

### Data Cleaning

In [4]:
df1 = df.drop(['Author_Name','Author_URL','App_Version'],axis=1)
df1.head()

Unnamed: 0,Title,Rating,Review,Date
0,Dishonest and Disgusting,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,Free offer,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,Inaccurate,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,bad,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,Double charged me for an order,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


In [5]:
# Replacing emoticon with its respective meaning
to_replace=['<U+0001F621>','<U+0001F615>','<U+0001F44E>']
replace_with=['pouting face','confused face','thumbs down']
df1.Review=df1.Review.replace(to_replace, replace_with, regex=True)

In [6]:
df1.dtypes

Title     object
Rating     int64
Review    object
Date      object
dtype: object

In [9]:
df1[df1.Review.str.contains(r'\<U*')].Review

1      If I’m not eligible for the offer Stop floodin...
37     i like uber i really do but lyft has better pr...
89     Yesterday I contacted Uber service and I paid ...
132    ok so i ise uber to get to work 7/10 times a w...
145    how am I supposed to used this if I can’t put ...
149    <U+062A><U+062C><U+0627><U+0631><U+0628><U+064...
190    This app has a lot of bugs , it seems like all...
208    Why are there no riders all of a sudden hmmm <...
214    Uber charged me for a ride a didn’t take for $...
237    How do I delete my account, all I wanted to do...
357    If I could give it no stars I would twice i ha...
367    Seen a billboard that said “If you tolerate ra...
439    I literally requested a ride for only $9 on 01...
Name: Review, dtype: object

In [10]:
df1.Review = df1.Review.str.split('<').str[0]
df1.shape

(490, 4)

In [11]:
df1['Review'].replace('', np.nan, inplace=True)
df1.dropna(subset=['Review'], inplace=True)
df1.shape

(489, 4)

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# define unit func to process one doc
from nltk import sent_tokenize, word_tokenize

def vader_unit_func(doc0,column_name):
    sents_list0 = sent_tokenize(doc0)
    vs_doc0 = []
    sent_ind = []
    for i in range(len(sents_list0)):
        vs_sent0 = analyzer.polarity_scores(sents_list0[i])
        vs_doc0.append(vs_sent0)
        sent_ind.append(i)
        
    # obtain output as DF    
    doc0_df = pd.DataFrame(vs_doc0)
    doc0_df.columns = [x+column_name for x in doc0_df.columns]
    doc0_df.insert(0, 'sent_index', sent_ind)  # insert sent index
    doc0_df.insert(doc0_df.shape[1], 'sentence', sents_list0)
    return(doc0_df)

# define wrapper func
def vader_wrap_func(corpus0,column_name):
    
    # use ifinstance() to check & convert input to DF
    if isinstance(corpus0, list):
        corpus0 = pd.DataFrame({'text':corpus0})
    
    # define empty DF to concat unit func output to
    vs_df = pd.DataFrame()    
    
    # apply unit-func to each doc & loop over all docs
    for i1 in range(len(corpus0)):
        doc0 = str(corpus0.iloc[i1])
        vs_doc_df = vader_unit_func(doc0,column_name)  # applying unit-func
        vs_doc_df.insert(0, 'doc_index', i1)  # inserting doc index
        vs_df = pd.concat([vs_df, vs_doc_df], axis=0)
        
    return(vs_df)

In [14]:
review_sentiment = vader_wrap_func(df1.Review,'_review').groupby('doc_index').sum()

In [22]:
review_sentiment

Unnamed: 0_level_0,sent_index,neg_review,neu_review,pos_review,compound_review
doc_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3,0.000,2.876,0.124,0.1406
1,0,0.136,0.864,0.000,-0.2960
2,0,0.179,0.821,0.000,-0.3400
3,10,1.167,3.592,0.241,-0.1617
4,21,0.908,5.614,0.478,-0.4906
...,...,...,...,...,...
484,0,0.145,0.855,0.000,-0.2960
485,3,0.377,2.381,0.242,-0.3160
486,15,0.672,4.864,0.464,-0.0993
487,15,0.334,5.264,0.402,-0.2625


In [17]:
review_sentiment.isna().sum()

sent_index         0
neg_review         0
neu_review         0
pos_review         0
compound_review    0
dtype: int64

In [18]:
title_sentiment = vader_wrap_func(df1.Title,'_title').groupby('doc_index').sum()

In [20]:
title_sentiment

Unnamed: 0_level_0,sent_index,neg_title,neu_title,pos_title,compound_title
doc_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0.877,0.123,0.000,-0.7964
1,0,0.000,0.233,0.767,0.5106
2,0,0.000,1.000,0.000,0.0000
3,0,1.000,0.000,0.000,-0.5423
4,0,0.265,0.735,0.000,-0.2023
...,...,...,...,...,...
484,0,0.000,1.000,0.000,0.0000
485,0,1.000,0.000,0.000,-0.7783
486,0,0.000,1.000,0.000,0.0000
487,0,0.304,0.696,0.000,-0.5423


In [19]:
title_sentiment.isna().sum()

sent_index        0
neg_title         0
neu_title         0
pos_title         0
compound_title    0
dtype: int64

In [23]:
# test-drive wrapper func
review_sentiment = vader_wrap_func(df1.Review,'_review').groupby('doc_index').sum()
title_sentiment = vader_wrap_func(df1.Title,'_title').groupby('doc_index').sum()
df1 = pd.concat([df1,review_sentiment,title_sentiment],axis=1)
print(df1.shape)
df1.head()

(490, 14)


Unnamed: 0,Title,Rating,Review,Date,sent_index,neg_review,neu_review,pos_review,compound_review,sent_index.1,neg_title,neu_title,pos_title,compound_title
0,Dishonest and Disgusting,1.0,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14,3.0,0.0,2.876,0.124,0.1406,0.0,0.877,0.123,0.0,-0.7964
1,Free offer,2.0,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17,0.0,0.136,0.864,0.0,-0.296,0.0,0.0,0.233,0.767,0.5106
2,Inaccurate,2.0,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38,0.0,0.179,0.821,0.0,-0.34,0.0,0.0,1.0,0.0,0.0
3,bad,1.0,i had my rides canceled back to back. they the...,08-12-2020 01:01,10.0,1.167,3.592,0.241,-0.1617,0.0,1.0,0.0,0.0,-0.5423
4,Double charged me for an order,1.0,Two of the same orders was added by accident. ...,15-12-2020 04:02,21.0,0.908,5.614,0.478,-0.4906,0.0,0.265,0.735,0.0,-0.2023


In [26]:
df1.dropna(inplace=True)

In [27]:
df1.isna().sum()

Title              0
Rating             0
Review             0
Date               0
sent_index         0
neg_review         0
neu_review         0
pos_review         0
compound_review    0
sent_index         0
neg_title          0
neu_title          0
pos_title          0
compound_title     0
dtype: int64

In [28]:
df1.Date.head()

0    29-12-2020 01:14
1    01-01-2021 23:17
2    15-01-2021 23:38
3    08-12-2020 01:01
4    15-12-2020 04:02
Name: Date, dtype: object

In [29]:
# Converting Date into datetime format
df1['Date'] =  pd.to_datetime(df1['Date'], format='%d-%m-%Y %H:%M')
df1.Date.head()

0   2020-12-29 01:14:00
1   2021-01-01 23:17:00
2   2021-01-15 23:38:00
3   2020-12-08 01:01:00
4   2020-12-15 04:02:00
Name: Date, dtype: datetime64[ns]

In [30]:
df1['Isweekend'] = np.where(df1.Date.dt.dayofweek>4,1,0)
df1['Late_night'] = np.where(df1.Date.dt.hour<4,1,0)
df1['Early_mrng'] = np.where(df1.Date.dt.hour.between(4,8),1,0)
df1['Morning'] = np.where(df1.Date.dt.hour.between(8,12),1,0)
df1['Noon'] = np.where(df1.Date.dt.hour.between(12,16),1,0)
df1['Eve'] = np.where(df1.Date.dt.hour.between(16,20),1,0)
df1['Night'] = np.where(df1.Date.dt.hour>20,1,0)
df1.head()

Unnamed: 0,Title,Rating,Review,Date,sent_index,neg_review,neu_review,pos_review,compound_review,sent_index.1,...,neu_title,pos_title,compound_title,Isweekend,Late_night,Early_mrng,Morning,Noon,Eve,Night
0,Dishonest and Disgusting,1.0,"For half an hour, we tried EVERY UBER SERVICE ...",2020-12-29 01:14:00,3.0,0.0,2.876,0.124,0.1406,0.0,...,0.123,0.0,-0.7964,0,1,0,0,0,0,0
1,Free offer,2.0,If I’m not eligible for the offer Stop floodin...,2021-01-01 23:17:00,0.0,0.136,0.864,0.0,-0.296,0.0,...,0.233,0.767,0.5106,0,0,0,0,0,0,1
2,Inaccurate,2.0,Consistently inaccurate Uber Eats ETA and the ...,2021-01-15 23:38:00,0.0,0.179,0.821,0.0,-0.34,0.0,...,1.0,0.0,0.0,0,0,0,0,0,0,1
3,bad,1.0,i had my rides canceled back to back. they the...,2020-12-08 01:01:00,10.0,1.167,3.592,0.241,-0.1617,0.0,...,0.0,0.0,-0.5423,0,1,0,0,0,0,0
4,Double charged me for an order,1.0,Two of the same orders was added by accident. ...,2020-12-15 04:02:00,21.0,0.908,5.614,0.478,-0.4906,0.0,...,0.735,0.0,-0.2023,0,0,1,0,0,0,0


In [31]:
df1=df1.drop(['sent_index','Title','Review','Date'],axis=1)
df1.head()

Unnamed: 0,Rating,neg_review,neu_review,pos_review,compound_review,neg_title,neu_title,pos_title,compound_title,Isweekend,Late_night,Early_mrng,Morning,Noon,Eve,Night
0,1.0,0.0,2.876,0.124,0.1406,0.877,0.123,0.0,-0.7964,0,1,0,0,0,0,0
1,2.0,0.136,0.864,0.0,-0.296,0.0,0.233,0.767,0.5106,0,0,0,0,0,0,1
2,2.0,0.179,0.821,0.0,-0.34,0.0,1.0,0.0,0.0,0,0,0,0,0,0,1
3,1.0,1.167,3.592,0.241,-0.1617,1.0,0.0,0.0,-0.5423,0,1,0,0,0,0,0
4,1.0,0.908,5.614,0.478,-0.4906,0.265,0.735,0.0,-0.2023,0,0,1,0,0,0,0


In [32]:
df1.isna().sum()

Rating             0
neg_review         0
neu_review         0
pos_review         0
compound_review    0
neg_title          0
neu_title          0
pos_title          0
compound_title     0
Isweekend          0
Late_night         0
Early_mrng         0
Morning            0
Noon               0
Eve                0
Night              0
dtype: int64

### Preliminary Regression Model


In [33]:
y = df1.Rating
X = df1.drop('Rating', axis=1)
y.shape, X.shape

((488,), (488, 15))

In [34]:
X.isnull().sum()

neg_review         0
neu_review         0
pos_review         0
compound_review    0
neg_title          0
neu_title          0
pos_title          0
compound_title     0
Isweekend          0
Late_night         0
Early_mrng         0
Morning            0
Noon               0
Eve                0
Night              0
dtype: int64

In [35]:
import statsmodels.api as sm
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rating,R-squared:,0.029
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.928
Date:,"Sat, 09 Jul 2022",Prob (F-statistic):,0.533
Time:,21:49:35,Log-Likelihood:,-690.3
No. Observations:,488,AIC:,1413.0
Df Residuals:,472,BIC:,1480.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5678,0.312,5.025,0.000,0.955,2.181
neg_review,-0.0408,0.170,-0.240,0.811,-0.375,0.293
neu_review,-0.0174,0.020,-0.858,0.391,-0.057,0.022
pos_review,0.1281,0.204,0.627,0.531,-0.273,0.529
compound_review,0.0018,0.103,0.018,0.986,-0.201,0.204
neg_title,0.0291,0.387,0.075,0.940,-0.732,0.790
neu_title,-0.0235,0.232,-0.101,0.919,-0.478,0.431
pos_title,0.0192,0.413,0.046,0.963,-0.793,0.831
compound_title,0.2911,0.365,0.798,0.425,-0.425,1.008

0,1,2,3
Omnibus:,205.607,Durbin-Watson:,2.117
Prob(Omnibus):,0.0,Jarque-Bera (JB):,630.736
Skew:,2.077,Prob(JB):,1.09e-137
Kurtosis:,6.71,Cond. No.,52.5


## Next Steps:
1. Convert non-English reviews to English or use non-english dictionary
2. Scale the emoticons replacement
3. Make sentiment analysis of Title - Done
4. From Date, extract weekend, weekday, morning, afternoon, evening, night - Done
5. Make preliminary regression model with y variable as Ratings
6. ?Use OLS
7. Feature Engineering - columns on specific word count
8. Shiny App