In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

### 0. Understanding the Business Problem
Uber Inc in the US wants to know:

- the major complaints premium users have about their cab services,
- how these impact service ratings.

We as (technical) consultants to Uber. have to:  
- [a] analyze text reviews of Uber cabs’ US services,  
- [b] relate whether and which different features of these reviews impact overall ratings  
- [c] pinpoint possible areas of improvement.

### 1. Pre-processing: 
- Examine the dataset. 
- ID the columns of interest. 
- Drop special characters, html junk etc. 
- Perform any other preprocessing and text-cleaning activity you think fits this context.

In [2]:
df = pd.read_csv(r"G:\ISB AMPBA\9. Text Analytics\Assignment\uber_reviews_itune.csv",
                 encoding='cp1252')
df.head()

Unnamed: 0,Author_Name,Title,Author_URL,App_Version,Rating,Review,Date
0,#NEVERUBER,Dishonest and Disgusting,https://itunes.apple.com/us/reviews/id663331949,3.434.10005,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,$$Heaven,Free offer,https://itunes.apple.com/us/reviews/id810421958,3.434.10005,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,.Disappointed....,Inaccurate,https://itunes.apple.com/us/reviews/id49598333,3.439.10000,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,.i. andrea,bad,https://itunes.apple.com/us/reviews/id689880334,3.434.10005,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,-:deka:-,Double charged me for an order,https://itunes.apple.com/us/reviews/id124963835,3.434.10005,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


Columns of interest:  
1. Title - Brief summary about the review
2. Rating - Label for supervised learning
3. Review - To extract the sentiment of the complaint
4. Date - Extracting weekday or weekend may give better insight on nature of review

### Data Cleaning

In [3]:
df1 = df.drop(['Author_Name','Author_URL','App_Version'],axis=1)
df1.head()

Unnamed: 0,Title,Rating,Review,Date
0,Dishonest and Disgusting,1,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14
1,Free offer,2,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17
2,Inaccurate,2,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38
3,bad,1,i had my rides canceled back to back. they the...,08-12-2020 01:01
4,Double charged me for an order,1,Two of the same orders was added by accident. ...,15-12-2020 04:02


In [4]:
# Replacing emoticon with its respective meaning
df_emojis = pd.read_csv(r"G:\ISB AMPBA\9. Text Analytics\Assignment\emoji_description.csv")
df_emojis.head()

Unnamed: 0,Code,CLDR Short Name
0,<U+1F600>,grinning face
1,<U+1F603>,grinning face with big eyes
2,<U+1F604>,grinning face with smiling eyes
3,<U+1F601>,beaming face with smiling eyes
4,<U+1F606>,grinning squinting face


In [5]:
# Replacing emoticon with its respective meaning
to_replace=['<U+0001F621>','<U+0001F615>','<U+0001F44E>']
replace_with=['pouting face','confused face','thumbs down']
df1.Review=df1.Review.replace(to_replace, replace_with, regex=True)

In [6]:
df1[df1.Review.str.contains('<')]['Review'][1]

'If I’m not eligible for the offer Stop flooding my email with this false information <U+0001F621><U+0001F621><U+0001F621>'

In [7]:
df1.Review = df1.Review.str.split('<').str[0]
df1.shape

(490, 4)

In [8]:
df1['Review'].replace('', np.nan, inplace=True)
df1.dropna(subset=['Review'], inplace=True)
df1.shape

(489, 4)

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# define unit func to process one doc
from nltk import sent_tokenize, word_tokenize
def vader_unit_func(doc0,column_name):
    sents_list0 = sent_tokenize(doc0)
    vs_doc0 = []
    sent_ind = []
    for i in range(len(sents_list0)):
        vs_sent0 = analyzer.polarity_scores(sents_list0[i])
        vs_doc0.append(vs_sent0)
        sent_ind.append(i)
        
    # obtain output as DF    
    doc0_df = pd.DataFrame(vs_doc0)
    doc0_df.columns = [x+column_name for x in doc0_df.columns]
    doc0_df.insert(0, 'sent_index', sent_ind)  # insert sent index
    doc0_df.insert(doc0_df.shape[1], 'sentence', sents_list0)
    return(doc0_df)

# define wrapper func
def vader_wrap_func(corpus0,column_name):
    
    # use ifinstance() to check & convert input to DF
    if isinstance(corpus0, list):
        corpus0 = pd.DataFrame({'text':corpus0})
    
    # define empty DF to concat unit func output to
    vs_df = pd.DataFrame()    
    
    # apply unit-func to each doc & loop over all docs
    for i1 in range(len(corpus0)):
        doc0 = str(corpus0.iloc[i1])
        vs_doc_df = vader_unit_func(doc0,column_name)  # applying unit-func
        vs_doc_df.insert(0, 'doc_index', i1)  # inserting doc index
        vs_df = pd.concat([vs_df, vs_doc_df], axis=0)
        
    return(vs_df)

In [10]:
# test-drive wrapper func
review_sentiment = vader_wrap_func(df1.Review,'_review').groupby('doc_index').sum()
title_sentiment = vader_wrap_func(df1.Title,'_title').groupby('doc_index').sum()
df1 = pd.concat([df1,review_sentiment,title_sentiment],axis=1)
print(df1.shape)
df1.head()

(490, 14)


Unnamed: 0,Title,Rating,Review,Date,sent_index,neg_review,neu_review,pos_review,compound_review,sent_index.1,neg_title,neu_title,pos_title,compound_title
0,Dishonest and Disgusting,1.0,"For half an hour, we tried EVERY UBER SERVICE ...",29-12-2020 01:14,3.0,0.0,2.876,0.124,0.1406,0.0,0.877,0.123,0.0,-0.7964
1,Free offer,2.0,If I’m not eligible for the offer Stop floodin...,01-01-2021 23:17,0.0,0.136,0.864,0.0,-0.296,0.0,0.0,0.233,0.767,0.5106
2,Inaccurate,2.0,Consistently inaccurate Uber Eats ETA and the ...,15-01-2021 23:38,0.0,0.179,0.821,0.0,-0.34,0.0,0.0,1.0,0.0,0.0
3,bad,1.0,i had my rides canceled back to back. they the...,08-12-2020 01:01,10.0,1.167,3.592,0.241,-0.1617,0.0,1.0,0.0,0.0,-0.5423
4,Double charged me for an order,1.0,Two of the same orders was added by accident. ...,15-12-2020 04:02,21.0,0.908,5.614,0.478,-0.4906,0.0,0.265,0.735,0.0,-0.2023


In [11]:
# Converting Date into datetime format
df1['Date'] =  pd.to_datetime(df1['Date'], format='%d-%m-%Y %H:%M')
df1.Date.head()

0   2020-12-29 01:14:00
1   2021-01-01 23:17:00
2   2021-01-15 23:38:00
3   2020-12-08 01:01:00
4   2020-12-15 04:02:00
Name: Date, dtype: datetime64[ns]

In [12]:
df1['Isweekend'] = np.where(df1.Date.dt.dayofweek>4,1,0)
df1['Late_night'] = np.where(df1.Date.dt.hour<4,1,0)
df1['Early_mrng'] = np.where(df1.Date.dt.hour.between(4,8),1,0)
df1['Morning'] = np.where(df1.Date.dt.hour.between(8,12),1,0)
df1['Noon'] = np.where(df1.Date.dt.hour.between(12,16),1,0)
df1['Eve'] = np.where(df1.Date.dt.hour.between(16,20),1,0)
df1['Night'] = np.where(df1.Date.dt.hour>20,1,0)
df1.head()

Unnamed: 0,Title,Rating,Review,Date,sent_index,neg_review,neu_review,pos_review,compound_review,sent_index.1,...,neu_title,pos_title,compound_title,Isweekend,Late_night,Early_mrng,Morning,Noon,Eve,Night
0,Dishonest and Disgusting,1.0,"For half an hour, we tried EVERY UBER SERVICE ...",2020-12-29 01:14:00,3.0,0.0,2.876,0.124,0.1406,0.0,...,0.123,0.0,-0.7964,0,1,0,0,0,0,0
1,Free offer,2.0,If I’m not eligible for the offer Stop floodin...,2021-01-01 23:17:00,0.0,0.136,0.864,0.0,-0.296,0.0,...,0.233,0.767,0.5106,0,0,0,0,0,0,1
2,Inaccurate,2.0,Consistently inaccurate Uber Eats ETA and the ...,2021-01-15 23:38:00,0.0,0.179,0.821,0.0,-0.34,0.0,...,1.0,0.0,0.0,0,0,0,0,0,0,1
3,bad,1.0,i had my rides canceled back to back. they the...,2020-12-08 01:01:00,10.0,1.167,3.592,0.241,-0.1617,0.0,...,0.0,0.0,-0.5423,0,1,0,0,0,0,0
4,Double charged me for an order,1.0,Two of the same orders was added by accident. ...,2020-12-15 04:02:00,21.0,0.908,5.614,0.478,-0.4906,0.0,...,0.735,0.0,-0.2023,0,0,1,0,0,0,0


In [13]:
df1=df1.drop(['sent_index','Title','Review','Date'],axis=1)
df1.head()

Unnamed: 0,Rating,neg_review,neu_review,pos_review,compound_review,neg_title,neu_title,pos_title,compound_title,Isweekend,Late_night,Early_mrng,Morning,Noon,Eve,Night
0,1.0,0.0,2.876,0.124,0.1406,0.877,0.123,0.0,-0.7964,0,1,0,0,0,0,0
1,2.0,0.136,0.864,0.0,-0.296,0.0,0.233,0.767,0.5106,0,0,0,0,0,0,1
2,2.0,0.179,0.821,0.0,-0.34,0.0,1.0,0.0,0.0,0,0,0,0,0,0,1
3,1.0,1.167,3.592,0.241,-0.1617,1.0,0.0,0.0,-0.5423,0,1,0,0,0,0,0
4,1.0,0.908,5.614,0.478,-0.4906,0.265,0.735,0.0,-0.2023,0,0,1,0,0,0,0


In [14]:
# Removing null values
df1.dropna(inplace=True)

### Preliminary Regression Model


In [15]:
y = df1.Rating
X = df1.drop('Rating', axis=1)
y.shape, X.shape

((488,), (488, 15))

In [16]:
X.isnull().sum()

neg_review         0
neu_review         0
pos_review         0
compound_review    0
neg_title          0
neu_title          0
pos_title          0
compound_title     0
Isweekend          0
Late_night         0
Early_mrng         0
Morning            0
Noon               0
Eve                0
Night              0
dtype: int64

In [17]:
import statsmodels.api as sm
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Rating,R-squared:,0.029
Model:,OLS,Adj. R-squared:,-0.002
Method:,Least Squares,F-statistic:,0.928
Date:,"Sun, 10 Jul 2022",Prob (F-statistic):,0.533
Time:,19:37:15,Log-Likelihood:,-690.3
No. Observations:,488,AIC:,1413.0
Df Residuals:,472,BIC:,1480.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5678,0.312,5.025,0.000,0.955,2.181
neg_review,-0.0408,0.170,-0.240,0.811,-0.375,0.293
neu_review,-0.0174,0.020,-0.858,0.391,-0.057,0.022
pos_review,0.1281,0.204,0.627,0.531,-0.273,0.529
compound_review,0.0018,0.103,0.018,0.986,-0.201,0.204
neg_title,0.0291,0.387,0.075,0.940,-0.732,0.790
neu_title,-0.0235,0.232,-0.101,0.919,-0.478,0.431
pos_title,0.0192,0.413,0.046,0.963,-0.793,0.831
compound_title,0.2911,0.365,0.798,0.425,-0.425,1.008

0,1,2,3
Omnibus:,205.607,Durbin-Watson:,2.117
Prob(Omnibus):,0.0,Jarque-Bera (JB):,630.736
Skew:,2.077,Prob(JB):,1.09e-137
Kurtosis:,6.71,Cond. No.,52.5


In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((341, 16), (147, 16), (341,), (147,))

### Calculating VIF

In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
   
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'].sort_values()

    return(vif)

In [20]:
calc_vif(X)

Unnamed: 0,variables,VIF
0,const,46.362385
1,neg_review,2.759723
2,neu_review,1.674147
3,pos_review,3.118308
4,compound_review,3.450706
5,neg_title,7.475174
6,neu_title,3.801279
7,pos_title,3.237319
8,compound_title,6.595158
9,Isweekend,1.035633


Generally, a VIF above 4 or tolerance below 0.25 indicates that multicollinearity might exist, and further investigation is required.   
When VIF is higher than 10 or tolerance is lower than 0.1, there is significant multicollinearity that needs to be corrected.  
  
Since all the above variables have VIF below 4 and above 0.25, we can be assured that there is no multicollinearity.

In [21]:
# We will save the model performance metrics in a DataFrame

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
Model = []
RMSE = []
R_sq = []
cv = KFold(5)

#Creating a Function to append the cross validation scores of the algorithms
def input_scores(name, model, x, y):
    Model.append(name)
    RMSE.append(np.sqrt((-1) * cross_val_score(model, x, y, cv=cv, 
                                               scoring='neg_mean_squared_error').mean()))
    R_sq.append(cross_val_score(model, x, y, cv=cv, scoring='r2').mean())

In [22]:
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor)

names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression',
         'K Neighbors Regressor', 'Decision Tree Regressor', 
         'Random Forest Regressor', 'Gradient Boosting Regressor',
         'Adaboost Regressor','XGBRegressor']

models = [LinearRegression(), Ridge(), Lasso(),
          KNeighborsRegressor(), DecisionTreeRegressor(),
          RandomForestRegressor(), GradientBoostingRegressor(), 
          AdaBoostRegressor(),XGBRegressor()]

#Running all algorithms
for name, model in zip(names, models):
    input_scores(name, model, X_train, y_train)

Reference: https://www.kaggle.com/swatisinghalmav/best-of-8-regression-models-to-predict-strength

In [23]:
evaluation = pd.DataFrame({'Model': Model,'RMSE': RMSE,'R Squared': R_sq})
print("FOLLOWING ARE THE TRAINING SCORES: ")
evaluation

FOLLOWING ARE THE TRAINING SCORES: 


Unnamed: 0,Model,RMSE,R Squared
0,Linear Regression,1.059442,-0.110524
1,Ridge Regression,1.052398,-0.093347
2,Lasso Regression,1.013998,-0.009029
3,K Neighbors Regressor,1.149682,-0.322161
4,Decision Tree Regressor,1.420689,-1.006599
5,Random Forest Regressor,1.11792,-0.23038
6,Gradient Boosting Regressor,1.179087,-0.392682
7,Adaboost Regressor,1.153296,-0.30064
8,XGBRegressor,1.216718,-0.498903


## Next Steps:
1. Convert non-English reviews to English or use non-english dictionary
2. Scale the emoticons replacement
3. Make sentiment analysis of Title - Done
4. From Date, extract weekend, weekday, morning, afternoon, evening, night - Done
5. Make preliminary regression model with y variable as Ratings - Done
6. ?Use OLS - Done
7. Feature Engineering - columns on specific word count
8. Shiny App