### Libraries and Data Loading

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import nltk
import os

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
#Loading Data
apps_df = pd.read_csv("../Training/Datasets/Play Store Data.csv")
review_df = pd.read_csv("../Training/Datasets/User Reviews.csv")

### Data wrangling of apps_df

In [3]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
apps_df.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [6]:
apps_df = apps_df.dropna(subset='Rating')

In [7]:
for col in apps_df.columns:
    apps_df[col].fillna(apps_df[col].mode()[0], inplace=True)

apps_df.drop_duplicates(inplace=True)

apps_df=apps_df[apps_df['Rating'] <= 5]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[col].fillna(apps_df[col].mode()[0], inplace=True)


In [8]:
apps_df['Rating'] = apps_df['Rating'].astype(float) # converts rating to float

In [9]:
apps_df['Installs'] = apps_df['Installs'].str.replace('+', '').str.replace(',', '').astype(int) # converts installs to int by removing +, these will be the minimum number of installs.

In [10]:
apps_df['Price'] = apps_df['Price'].str.replace('$', '').astype(float) # converts price to float

In [11]:
apps_df['Reviews'] = apps_df['Reviews'].astype(int) # converts reviews to int

In [12]:
apps_df["Log Installs"] = np.log(apps_df["Installs"])
apps_df['Log Reviews'] = np.log(apps_df['Reviews'])

In [13]:
def convert_size(size):
    if "M" in size:
        return float(size.replace("M", "").replace("m", ""))
    elif "K" in size:
        return float(size.replace("K", "").replace("k", "")) / 1024
    else:
        return np.nan
    
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [14]:
apps_df = apps_df.convert_dtypes() # automatically converts data types

In [15]:
apps_df.drop_duplicates()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.21034,5.068904
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363,6.874198
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948,11.379508
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281384
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925,6.874198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608,1.94591
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193,3.637586
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.60517,1.386294
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755,4.736198


In [16]:
apps_df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price,Log Installs,Log Reviews
count,8892.0,8892.0,7167.0,8892.0,8892.0,8892.0,8892.0
mean,4.187877,472776.367184,23.547956,16489648.148673,0.963155,12.179472,8.234893
std,0.522377,2905051.723592,23.460103,86376000.190279,16.189341,3.837372,3.8802
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,4.0,164.0,5.7,10000.0,0.0,9.21034,5.099866
50%,4.3,4714.5,15.0,500000.0,0.0,13.122363,8.458398
75%,4.5,71266.75,34.0,5000000.0,0.0,15.424948,11.174185
max,5.0,78158306.0,100.0,1000000000.0,400.0,20.723266,18.174247


In [17]:
apps_df.isna().sum()

App                  0
Category             0
Rating               0
Reviews              0
Size              1725
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          0
Android Ver          0
Log Installs         0
Log Reviews          0
dtype: int64

In [18]:
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 10840
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8892 non-null   string 
 1   Category        8892 non-null   string 
 2   Rating          8892 non-null   Float64
 3   Reviews         8892 non-null   Int64  
 4   Size            7167 non-null   Float64
 5   Installs        8892 non-null   Int64  
 6   Type            8892 non-null   string 
 7   Price           8892 non-null   Float64
 8   Content Rating  8892 non-null   string 
 9   Genres          8892 non-null   string 
 10  Last Updated    8892 non-null   string 
 11  Current Ver     8892 non-null   string 
 12  Android Ver     8892 non-null   string 
 13  Log Installs    8892 non-null   Float64
 14  Log Reviews     8892 non-null   Float64
dtypes: Float64(5), Int64(2), string(8)
memory usage: 1.1 MB


### Data Wrangling of reviews_df

In [19]:
review_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [20]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [21]:
review_df.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [22]:
review_df = review_df.dropna()

In [23]:
review_df = review_df.convert_dtypes()

In [24]:
review_df.drop_duplicates()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225,0.447222
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.2875,0.25
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.8,1.0


In [25]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37427 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     37427 non-null  string 
 1   Translated_Review       37427 non-null  string 
 2   Sentiment               37427 non-null  string 
 3   Sentiment_Polarity      37427 non-null  Float64
 4   Sentiment_Subjectivity  37427 non-null  Float64
dtypes: Float64(2), string(3)
memory usage: 1.8 MB


In [26]:
review_df.isna().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64