In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import webbrowser
import os

In [2]:
# Read the raw user-review CSV from its fixed local path.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
REVIEWS_CSV_PATH = "D:/Google Play Store Intern/User Reviews.csv"
review_data = pd.read_csv(REVIEWS_CSV_PATH)

In [3]:
review_data.head() # preview the first five rows (the default for DataFrame.head)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462


In [4]:
review_data.shape  # (number of rows, number of columns)

(64295, 5)

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
review_data.describe()

Unnamed: 0,Sentiment_Polarity,Sentiment_Subjectivity
count,37432.0,37432.0
mean,0.182146,0.492704
std,0.351301,0.259949
min,-1.0,0.0
25%,0.0,0.357143
50%,0.15,0.514286
75%,0.4,0.65
max,1.0,1.0


In [7]:
review_data.describe(include = ['object']) # summary (count, unique, top, freq) of the object-dtype columns only

Unnamed: 0,App,Translated_Review,Sentiment
count,64295,37427,37432
unique,1074,27994,3
top,Bowmasters,Good,Positive
freq,320,247,23998


In [8]:
# Missing-value overview. Each figure is bound to a name so that no
# intermediate result is computed and then silently discarded (a notebook
# cell only displays its last expression).
null_counts = review_data.isnull().sum()  # number of nulls in each column
total_null_pct = (null_counts.sum() / (review_data.shape[0] * review_data.shape[1])) * 100  # percentage of all cells that are null
null_pct_by_column = (null_counts / review_data.shape[0]) * 100  # percentage of nulls in each column
null_pct_by_column  # last expression: the only value this cell displays

App                        0.000000
Translated_Review         41.788631
Sentiment                 41.780854
Sentiment_Polarity        41.780854
Sentiment_Subjectivity    41.780854
dtype: float64

In [9]:
# Remove, in place, every row that has a null in any column
review_data.dropna(axis=0, how='any', inplace=True)

In [10]:
# Force both sentiment metric columns to a numeric dtype; any value that
# cannot be parsed becomes NaN (errors='coerce').
# NOTE(review): the info() output above already reports these columns as
# float64, so this is expected to be a no-op safeguard.
for metric_column in ('Sentiment_Polarity', 'Sentiment_Subjectivity'):
    review_data[metric_column] = pd.to_numeric(review_data[metric_column], errors='coerce')

In [11]:
# Make sure the VADER lexicon is installed before building the analyzer:
# SentimentIntensityAnalyzer() raises LookupError when the resource is missing,
# which would break a run on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)

# Create the analyzer instance from SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Analyze a sample sentence; polarity_scores returns a dict with the
# 'neg', 'neu', 'pos' and 'compound' scores
review = 'this app is very awesome. i love it a lot!'
scores = analyzer.polarity_scores(review)
print(scores)


{'neg': 0.0, 'neu': 0.396, 'pos': 0.604, 'compound': 0.8797}


In [12]:
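# 🎯 How to use the compound score
# The compound score is a normalized, weighted composite score between -1 (most negative) and +1 (most positive).
# Cut-offs used here to read the score (note: these are stricter than the
# +/-0.05 thresholds suggested in the VADER documentation):

# Compound score range     Sentiment
# >= 0.5                   Positive
# > -0.5 and < 0.5         Neutral
# <= -0.5                  Negative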

In [13]:
# Score one clearly negative and one fairly neutral sample sentence
for review in ('this app is very bad. i hate it a lot!',
               'this app is normal. okey thanks'):
    scores = analyzer.polarity_scores(review)
    print(scores)


{'neg': 0.573, 'neu': 0.427, 'pos': 0.0, 'compound': -0.8427}
{'neg': 0.0, 'neu': 0.633, 'pos': 0.367, 'compound': 0.4404}


In [14]:
def _compound_score(text):
    """Return the VADER compound score of ``text`` after converting it to ``str``."""
    return analyzer.polarity_scores(str(text))['compound']


# One compound score per review, stored alongside the original columns
review_data['Sentiment_Score'] = review_data['Translated_Review'].apply(_compound_score)

In [15]:
# Count reviews per app and sentiment label: one row per app, one column per
# label, 0 where an app has no review with that label.
review_count_columns = {
    'Positive': 'Positive_Review',
    'Negative': 'Negative_Review',
    'Neutral': 'Neutral_Review',
}
sentiment_pivot = pd.pivot_table(
    review_data,
    index='App',
    columns='Sentiment',
    values='Translated_Review',
    aggfunc='count',
    fill_value=0,
)
sentiment_counts = sentiment_pivot.rename(columns=review_count_columns)

In [16]:
# Per-app summary: number of reviews plus the mean polarity, subjectivity
# and VADER compound score. Named aggregation sets the output column names directly.
sentiment_agg = review_data.groupby('App').agg(
    Total_Review=('Translated_Review', 'count'),
    Avg_Polarity=('Sentiment_Polarity', 'mean'),
    Avg_Subjectivity=('Sentiment_Subjectivity', 'mean'),
    Avg_Sentiment_Score=('Sentiment_Score', 'mean'),
)

In [17]:
# Combine the per-app averages with the per-app sentiment counts (both are
# indexed by 'App'), then turn the 'App' index back into a regular column.
merged_by_app = sentiment_agg.merge(sentiment_counts, left_index=True, right_index=True)
clean_review_data = merged_by_app.reset_index()

In [18]:
# Write the per-app summary to CSV without the row index.
# NOTE(review): absolute, machine-specific output path.
output_file_path = "D:/Google Play Store Intern/Clean User Reviews.csv"
clean_review_data.to_csv(path_or_buf=output_file_path, index=False)