# **Summary**: the purpose of this project is to check whether reviews in the AppStore have any correlation (or predictive influence) with the App's overall score (1 to 5 stars). 
# 
# **Motivation**: if a significant relationship can be established, App creators will know that reviews are an important source of data and that, perhaps, more focus should be put into interacting with users and analyzing feedback, thereby making more efficient use of time and money.
# 
# **Method**: the datasets used come from fellow Kaggle user Lavanya Gupta under the title "Google Play Store Apps - Web scraped data of 10k Play Store apps for analysing the Android market."

In [None]:
import pandas as pd

# Load the two CSV files: the app-level dataset and the user-review dataset.
APPS_CSV = '../input/googleplaystore-data/googleplaystore.csv'
REVIEWS_CSV = '../input/googleplaystore-data/googleplaystore_user_reviews.csv'

data1 = pd.read_csv(APPS_CSV)
data2 = pd.read_csv(REVIEWS_CSV)


In [None]:
# Dataset 1: preview the first five rows.

data1.head(5)

In [None]:
# Dataset 2: preview the first five rows.
# Dataset 2 presents 3 columns (Sentiment, Sentiment_Polarity, Sentiment_Subjectivity) which show the magnitude
# of a review and give it a positiveness score. The plan is to redo this analysis with the textblob library to verify the results.

data2.head(5)

In [None]:
#using textblob
#After running this code block you'll notice that we get the exact same polarity and subjectivity as the original dataset creator, which
# suggests that similar methods were used. Safe to say, the sentiment analysis looks good.
from textblob import TextBlob

# Work on an independent copy of the reviews with incomplete rows removed.
# The explicit .copy() matters: assigning new columns to the direct result of
# dropna() can trigger pandas' SettingWithCopyWarning, because pandas may
# treat that result as derived from data2.
data = data2.dropna().copy()


def getSubjectivity(text):
    """Return the subjectivity score TextBlob computes for ``text``."""
    return TextBlob(text).sentiment.subjectivity


def getPolarity(text):
    """Return the polarity score TextBlob computes for ``text``."""
    return TextBlob(text).sentiment.polarity


# Score every review text so the values can be compared with the
# Sentiment_Polarity / Sentiment_Subjectivity columns shipped in the dataset.
data['Polarity'] = data['Translated_Review'].apply(getPolarity)
data['Subjectivity'] = data['Translated_Review'].apply(getSubjectivity)

print(data.head(5))

In [None]:
# The two datasets will eventually be combined, so this is a good moment to
# check whether they are compatible for a merge.
#
# Dataset #2 has many more rows, which is to be expected: at first glance
# dataset #1 contains a list of unique app names and their attributes, while
# dataset #2 contains multiple reviews per app name and therefore more
# occurrences.

for frame_shape in (data1.shape, data2.shape):
    print(frame_shape)

# At this point, I had to make a decision.
# In order to combine the datasets into one model-ready dataset, I had to find a way to make them compatible. I decided to "compress" dataset #2 by taking the average sentiment score per unique app name, thus obtaining one value per app that can be added to dataset #1.

In [None]:
# Discard every row that has at least one missing value, in both datasets.
data1, data2 = data1.dropna(), data2.dropna()

In [None]:
# Build a new dataset holding the average sentiment polarity per unique app name.
#
# It appears that out of the ~11,000 data points found in dataset 1, reviews
# exist for only 865 of them. This greatly reduces the dataset.

mean_polarity = data2.groupby('App')['Sentiment_Polarity'].mean()
temp_data = mean_polarity.rename('Sentiment').reset_index()
print(temp_data)

In [None]:
# Attach dataset 1's attributes to each app's mean sentiment, matching on the
# app name, then discard any row left with a missing value.
data = pd.merge(temp_data, data1, how='left', on='App').dropna()
data.head(10)

In [None]:
# Keep only the first occurrence of each fully identical row.
data = data[~data.duplicated()]
data.head(5)

In [None]:
# New shape.
# Notice that the new number of rows (1078) is bigger than the 865 from a few blocks ago, even after removing missing-value rows and duplicates.
# This may be because some apps, even though they have the same name, might be present in different categories.

data.shape

In [None]:
# Remove the features that are not needed for the analysis.
unneeded_columns = ['Genres', 'Last Updated', 'Current Ver', 'Android Ver']
data = data.drop(columns=unneeded_columns)
data.head(5)

In [None]:
# Simplify 33 categories into 7 (based on personal opinion),
# e.g. game, sports, and comics all become 'ENTERTAINMENT'.
#
# Every source category is listed exactly once (the previous literal repeated
# the 'HEALTH_AND_FITNESS' key). Categories that are not listed here keep
# their original value.
category_groups = {
    # -> ENTERTAINMENT
    'GAME': 'ENTERTAINMENT',
    'SPORTS': 'ENTERTAINMENT',
    'COMICS': 'ENTERTAINMENT',
    'FOOD_AND_DRINK': 'ENTERTAINMENT',
    'VIDEO_PLAYERS': 'ENTERTAINMENT',
    # -> EDUCATIONAL
    'HEALTH_AND_FITNESS': 'EDUCATIONAL',
    'MEDICAL': 'EDUCATIONAL',
    'FINANCE': 'EDUCATIONAL',
    'EDUCATION': 'EDUCATIONAL',
    'BUSINESS': 'EDUCATIONAL',
    # -> INFORMATIONAL
    'NEWS_AND_MAGAZINES': 'INFORMATIONAL',
    'WEATHER': 'INFORMATIONAL',
    'MAPS_AND_NAVIGATION': 'INFORMATIONAL',
    'HOUSE_AND_HOME': 'INFORMATIONAL',
    'PARENTING': 'INFORMATIONAL',
    # -> SOCIAL
    'COMMUNICATION': 'SOCIAL',
    'DATING': 'SOCIAL',
    'FAMILY': 'SOCIAL',
    'EVENTS': 'SOCIAL',
    'TRAVEL_AND_LOCAL': 'SOCIAL',
    # -> ARTnBEAUTY
    'BEAUTY': 'ARTnBEAUTY',
    'PHOTOGRAPHY': 'ARTnBEAUTY',
    'ART_AND_DESIGN': 'ARTnBEAUTY',
    # -> LIFESTYLE
    'SHOPPING': 'LIFESTYLE',
    'PRODUCTIVITY': 'LIFESTYLE',
    'PERSONALIZATION': 'LIFESTYLE',
    # -> TOOLS
    'BOOKS_AND_REFERENCE': 'TOOLS',
    'AUTO_AND_VEHICLES': 'TOOLS',
    'LIBRARIES_AND_DEMO': 'TOOLS',
}

# Only the 'Category' column is rewritten.
data = data.replace({'Category': category_groups})
data.head(5)

In [None]:
# Cast the number of reviews to int.

data['Reviews'] = data['Reviews'].map(int)
data.head(5)

In [None]:
# Transform the size of the app into a float.


def _size_to_megabytes(raw_size):
    """Convert one raw ``Size`` value into a float.

    * a value containing 'Varies with device' becomes NaN;
    * a value with an 'M' suffix has the suffix removed;
    * a value with a 'k' suffix has the suffix removed and is divided by
      1000, putting it on the same scale as the 'M' values;
    * anything else is passed to ``float`` unchanged.

    Commas are always stripped first. The previous implementation only
    stripped them when the value still contained 'M', which was never the
    case because the 'M' had already been removed one step earlier.

    Raises:
        ValueError: if the remaining text cannot be parsed as a float.
    """
    text = str(raw_size).replace(',', '')
    if 'Varies with device' in text:
        return float('nan')
    if 'M' in text:
        return float(text.replace('M', ''))
    if 'k' in text:
        return float(text.replace('k', '')) / 1000
    return float(text)


# NOTE: drrp is the same object as data, so the converted column is visible
# through both names.
drrp = data
drrp['Size'] = drrp['Size'].apply(_size_to_megabytes)

# Rows with any missing value are removed, which includes the rows whose size
# became NaN above.
drrp = drrp.dropna()
drrp.head(5)

In [None]:
# Transform the number of installs into an int.

# Copy first: drrp is the result of dropna(), and assigning a column on such a
# frame can trigger pandas' SettingWithCopyWarning.
df = drrp.copy()

# Remove every '+' and ',' in a single pass (e.g. '1,000+' -> '1000'), then
# cast the cleaned text to int.
df['Installs'] = (
    df['Installs']
    .astype(str)
    .str.replace('[+,]', '', regex=True)
    .astype(int)
)

df.head(5)

In [None]:
# Type and Price are more complicated to handle, so let's first check whether there is any point in considering them.
# Only 1.5% of the values are paid, so it might not be a bad idea to remove those attributes for simplicity.
df['Type'].value_counts()

In [None]:
# Drop Type and Price because there are too few paid values, and drop the app
# name because it is no longer important.
df = df.drop(columns=['Type', 'Price', 'App'])

In [None]:
# Cleaned dataset: preview the first five rows.
df.head(5)

In [None]:
# Correlation matrix on the numerical data.
# Not very promising results: there doesn't seem to be any attribute that
# greatly correlates with Rating. However, Sentiment does have the better
# score, which is good. There is no great correlation between the other
# attributes either, other than size and reviews, with .42 being the highest.
numeric_columns = ['Rating', 'Sentiment', 'Reviews', 'Size', 'Installs']
data = df
data.loc[:, numeric_columns].corr()

In [None]:
# Build indicator (dummy) columns for the category and the content rating.
dummycategory = pd.get_dummies(data['Category'])
dummycontentage = pd.get_dummies(data['Content Rating'])

# Swap the original category / content rating columns for the indicator
# columns created above.
dummydata = data.drop(columns=['Category', 'Content Rating'])
frames = [dummydata, dummycategory, dummycontentage]
dummydatas = pd.concat(frames, axis='columns')

dummydatas.head(5)

In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import  math
from random import randint
import re
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib
# NOTE(review): the two statements below only look up an attribute and discard
# the result; nothing is called or plotted.
matplotlib.axes.Axes.pie
matplotlib.pyplot.pie

In [None]:
# Regression model 1: tests all the attributes, without Sentiment, on Rating.

# NOTE: this rebinds data2 (until now the user-review dataset) to the encoded
# frame.
data2 = dummydatas

# Dependent variable and independent variables (Sentiment deliberately left out).
Y = data2['Rating']
X_nsent = data2.drop(labels=['Rating', 'Sentiment'], axis=1)

# Split the data into 80% training and 20% test. random_state pins the shuffle
# so the reported metrics are reproducible instead of changing on every run.
X_nsent_train, X_nsent_test, Y_train, Y_test = train_test_split(
    X_nsent, Y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model and score it on the held-out rows.
model = linear_model.LinearRegression()
model.fit(X_nsent_train, Y_train)

Y_pred = model.predict(X_nsent_test)

print('Coefficients ', model.coef_)
print('Intercept ', model.intercept_)
print('Mean squared error (MSE): %.2f' % mean_squared_error(Y_test, Y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(Y_test, Y_pred))

In [None]:
# Regression model 2: tests all the attributes, Sentiment included, on Rating.

# NOTE: this rebinds data2 (originally the user-review dataset) to the encoded
# frame.
data2 = dummydatas

# Dependent variable (kept as a one-column DataFrame) and independent variables.
Y = pd.DataFrame(data2['Rating'])
X_sent = data2.drop(labels=['Rating'], axis=1)

# Split the data into 80% training and 20% test. random_state pins the shuffle
# so the reported metrics are reproducible instead of changing on every run.
X_sent_train, X_sent_test, Y_train, Y_test = train_test_split(
    X_sent, Y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model and score it on the held-out rows.
model = linear_model.LinearRegression()
model.fit(X_sent_train, Y_train)

Y_pred = model.predict(X_sent_test)

print('Coefficients ', model.coef_)
print('Intercept ', model.intercept_)
print('Mean squared error (MSE): %.2f' % mean_squared_error(Y_test, Y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(Y_test, Y_pred))

In [None]:
# Regression model 3: tests Sentiment alone on Rating.

# NOTE: this rebinds data2 (originally the user-review dataset) to the encoded
# frame.
data2 = dummydatas

# Dependent and independent variable, each kept as a one-column DataFrame.
Y = pd.DataFrame(data2['Rating'])
X = pd.DataFrame(data2['Sentiment'])

# Split the data into 80% training and 20% test. random_state pins the shuffle
# so the reported metrics are reproducible instead of changing on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares model and score it on the held-out rows.
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print('Coefficients ', model.coef_)
print('Intercept ', model.intercept_)
print('Mean squared error (MSE): %.2f' % mean_squared_error(Y_test, Y_pred))
print('Coefficient of determination (R^2): %.2f' % r2_score(Y_test, Y_pred))

# Best model seems to include all attributes, including sentiment, as denoted by the coefficient of determination.