## Libraries and Data Loading

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import nltk
import os

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
#Loading Data
apps_df = pd.read_csv("../Training/Datasets/Play Store Data.csv")
review_df = pd.read_csv("../Training/Datasets/User Reviews.csv")

## Data wrangling of apps_df

In [3]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
apps_df.isna().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [6]:
apps_df = apps_df.dropna(subset='Rating')

In [7]:
for col in apps_df.columns:
    apps_df[col].fillna(apps_df[col].mode()[0], inplace=True)

apps_df.drop_duplicates(inplace=True)

apps_df=apps_df[apps_df['Rating'] <= 5]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[col].fillna(apps_df[col].mode()[0], inplace=True)


In [8]:
apps_df['Rating'] = apps_df['Rating'].astype(float) # converts rating to float

In [9]:
apps_df['Installs'] = apps_df['Installs'].str.replace('+', '').str.replace(',', '').astype(int) # converts installs to int by removing +, these will be the minimum number of installs.

In [10]:
apps_df['Price'] = apps_df['Price'].str.replace('$', '').astype(float) # converts price to float

In [11]:
apps_df['Reviews'] = apps_df['Reviews'].astype(int) # converts reviews to int

In [12]:
apps_df["Log Installs"] = np.log(apps_df["Installs"])
apps_df['Log Reviews'] = np.log(apps_df['Reviews'])

In [13]:
def convert_size(size):
    if "M" in size:
        return float(size.replace("M", "").replace("m", ""))
    elif "K" in size:
        return float(size.replace("K", "").replace("k", "")) / 1024
    else:
        return np.nan
    
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [14]:
apps_df = apps_df.convert_dtypes() # automatically converts data types

In [15]:
apps_df["Last Updated"] = pd.to_datetime(apps_df["Last Updated"], errors="coerce")

In [16]:
apps_df["Year"] = apps_df["Last Updated"].dt.year

In [17]:
apps_df.drop_duplicates()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.94591,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,2017
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.60517,1.386294,2018
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.907755,4.736198,2015


In [18]:
apps_df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price,Last Updated,Log Installs,Log Reviews,Year
count,8892.0,8892.0,7167.0,8892.0,8892.0,8892,8892.0,8892.0,8892.0
mean,4.187877,472776.367184,23.547956,16489648.148673,0.963155,2017-11-21 21:09:28.421052416,12.179472,8.234893,2017.401484
min,1.0,1.0,1.0,1.0,0.0,2010-05-21 00:00:00,0.0,0.0,2010.0
25%,4.0,164.0,5.7,10000.0,0.0,2017-09-21 00:00:00,9.21034,5.099866,2017.0
50%,4.3,4714.5,15.0,500000.0,0.0,2018-05-28 00:00:00,13.122363,8.458398,2018.0
75%,4.5,71266.75,34.0,5000000.0,0.0,2018-07-23 00:00:00,15.424948,11.174185,2018.0
max,5.0,78158306.0,100.0,1000000000.0,400.0,2018-08-08 00:00:00,20.723266,18.174247,2018.0
std,0.522377,2905051.723592,23.460103,86376000.190279,16.189341,,3.837372,3.8802,1.116673


In [19]:
apps_df.isna().sum()

App                  0
Category             0
Rating               0
Reviews              0
Size              1725
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          0
Android Ver          0
Log Installs         0
Log Reviews          0
Year                 0
dtype: int64

In [20]:
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             8892 non-null   string        
 1   Category        8892 non-null   string        
 2   Rating          8892 non-null   Float64       
 3   Reviews         8892 non-null   Int64         
 4   Size            7167 non-null   Float64       
 5   Installs        8892 non-null   Int64         
 6   Type            8892 non-null   string        
 7   Price           8892 non-null   Float64       
 8   Content Rating  8892 non-null   string        
 9   Genres          8892 non-null   string        
 10  Last Updated    8892 non-null   datetime64[ns]
 11  Current Ver     8892 non-null   string        
 12  Android Ver     8892 non-null   string        
 13  Log Installs    8892 non-null   Float64       
 14  Log Reviews     8892 non-null   Float64       
 15  Year    

## Data Wrangling of reviews_df

In [21]:
review_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [22]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [23]:
review_df.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [24]:
review_df = review_df.dropna()

In [25]:
review_df = review_df.convert_dtypes()

In [26]:
review_df.drop_duplicates()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225,0.447222
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.2875,0.25
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.8,1.0


In [27]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37427 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     37427 non-null  string 
 1   Translated_Review       37427 non-null  string 
 2   Sentiment               37427 non-null  string 
 3   Sentiment_Polarity      37427 non-null  Float64
 4   Sentiment_Subjectivity  37427 non-null  Float64
dtypes: Float64(2), string(3)
memory usage: 1.8 MB


In [28]:
review_df.isna().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

## Sentiment Analysis

In [29]:
sia = SentimentIntensityAnalyzer()

In [30]:
review_df["Sentiment_Score"] = review_df["Translated_Review"].apply(lambda x: sia.polarity_scores(x)["compound"])
review_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369
...,...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,-0.6486
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225,0.447222,0.7430
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.2875,0.25,-0.7269
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.8,1.0,0.7783


In [31]:
# For more accurate Sentiments

def label_sentiments(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"
    
review_df["Custom_Sentiment"] = review_df["Sentiment_Score"].apply(label_sentiments)
review_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score,Custom_Sentiment
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531,Positive
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597,Positive
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249,Positive
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369,Positive
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369,Positive
...,...,...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,-0.6486,Negative
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225,0.447222,0.7430,Positive
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.2875,0.25,-0.7269,Negative
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.8,1.0,0.7783,Positive


# Internship Tasks

## Task 1: Visualize the sentiment distribution (positive, neutral, negative) of user reviews using a stacked bar chart, segmented by rating groups (e.g., 1-2 stars, 3-4 stars, 4-5 stars). Include only apps with more than 1,000 reviews and group by the top 5 categories.

In [32]:
apps_df1 = apps_df.copy(deep=True)
review_df1 = review_df.copy(deep=True)

In [33]:
# Rating groups function - 1-2 stars, 2-3 stars, 3-4 stars, 4-5 stars.
def rating_group(rating):
    if 1 <= rating < 2:
        return "1-2 Stars"
    elif 2 <= rating < 3:
        return "2-3 Stars"
    elif 3 <= rating < 4:
        return "3-4 Stars"
    elif 4 <= rating <= 5:
        return "4-5 Stars"
    else:
        return None
    
apps_df1['Rating_group'] = apps_df1['Rating'].apply(rating_group)
apps_df1

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year,Rating_group
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,2018,4-5 Stars
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,3-4 Stars
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,2018,4-5 Stars
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,2018,4-5 Stars
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,2018,4-5 Stars
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.94591,2017,4-5 Stars
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,2017,4-5 Stars
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.60517,1.386294,2018,4-5 Stars
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.907755,4.736198,2015,4-5 Stars


In [34]:
merged_df = pd.merge(apps_df1, review_df1, on="App", how="inner")
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Log Installs,Log Reviews,Year,Rating_group,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score,Custom_Sentiment
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,13.122363,6.874198,2018,3-4 Stars,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0,-0.2500,Negative
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,13.122363,6.874198,2018,3-4 Stars,It bad >:(,Negative,-0.725,0.833333,-0.8020,Negative
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,13.122363,6.874198,2018,3-4 Stars,like,Neutral,0.0,0.0,0.3612,Positive
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,13.122363,6.874198,2018,3-4 Stars,I love colors inspyering,Positive,0.5,0.6,0.6369,Positive
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,13.122363,6.874198,2018,3-4 Stars,I hate,Negative,-0.8,0.9,-0.5719,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,...,13.815511,10.51816,2018,4-5 Stars,Nice broser slow browsing speed... make 8mbps ...,Positive,0.1,0.492308,0.6908,Positive
59120,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,...,13.815511,10.51816,2018,4-5 Stars,The thing I found missing simple bookmark draw...,Positive,0.225,0.426786,0.3612,Positive
59121,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,...,13.815511,10.51816,2018,4-5 Stars,Great Relief unwanted pop ups showing up. What...,Positive,0.65,0.625,0.7430,Positive
59122,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,...,13.815511,10.51816,2018,4-5 Stars,Hoped found new go-to; LOVE Firefox PC. Aside ...,Positive,0.345455,0.484848,0.7456,Positive


In [35]:
top_5_df = apps_df1.groupby('Category')['Reviews'].sum().nlargest(5)
top_5_df

Category
GAME             1415533878
COMMUNICATION     601273091
SOCIAL            533576498
FAMILY            396768720
TOOLS             273184168
Name: Reviews, dtype: Int64

In [36]:
fig1 = px.bar(
    top_5_df,
    x = top_5_df.index,
    y = top_5_df.values,
    title = "Top 5 Categories by Total Reviews",
)

pio.show(fig1)

In [37]:
# Filter for apps with more than 1,000 reviews and in top 5 categories
top_categories1 = top_5_df.index.tolist()
filtered_apps1 = apps_df1[(apps_df1['Reviews'] > 1000) & (apps_df1['Category'].isin(top_categories1))]

filtered_apps1

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year,Rating_group
335,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,,1000000000,Free,0.0,Everyone,Communication,2018-08-01,Varies with device,Varies with device,20.723266,17.852276,2018,4-5 Stars
336,WhatsApp Messenger,COMMUNICATION,4.4,69119316,,1000000000,Free,0.0,Everyone,Communication,2018-08-03,Varies with device,Varies with device,20.723266,18.051345,2018,4-5 Stars
337,Messenger for SMS,COMMUNICATION,4.3,125257,17.0,10000000,Free,0.0,Teen,Communication,2018-06-06,1.8.9,4.1 and up,16.118096,11.738123,2018,4-5 Stars
338,Google Chrome: Fast & Secure,COMMUNICATION,4.3,9642995,,1000000000,Free,0.0,Everyone,Communication,2018-08-01,Varies with device,Varies with device,20.723266,16.081742,2018,4-5 Stars
339,Messenger Lite: Free Calls & Messages,COMMUNICATION,4.4,1429035,,100000000,Free,0.0,Everyone,Communication,2018-07-25,37.0.0.7.163,2.3 and up,18.420681,14.17251,2018,4-5 Stars
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10799,Fr Daoud Lamei,SOCIAL,4.7,2036,6.8,100000,Free,0.0,Everyone,Social,2018-05-20,1.72,4.0.3 and up,11.512925,7.618742,2018,4-5 Stars
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496,81.0,1000000,Free,0.0,Teen,Action,2018-08-07,1.5.447,4.0 and up,13.815511,10.941925,2018,4-5 Stars
10804,Poker Pro.Fr,GAME,4.2,5442,17.0,100000,Free,0.0,Teen,Card,2018-05-22,4.1.3,2.3 and up,11.512925,8.601902,2018,4-5 Stars
10809,Castle Clash: RPG War and Strategy FR,FAMILY,4.7,376223,24.0,1000000,Free,0.0,Everyone,Strategy,2018-07-18,1.4.2,4.1 and up,13.815511,12.837937,2018,4-5 Stars


In [38]:
# Merge with review data
filtered_merged1 = pd.merge(filtered_apps1[['App', 'Category', 'Rating_group']], review_df1, on='App', how='inner')
filtered_merged1

Unnamed: 0,App,Category,Rating_group,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score,Custom_Sentiment
0,Gmail,COMMUNICATION,4-5 Stars,gets crashed I try open email clicking notific...,Neutral,0.0,0.5,-0.4019,Negative
1,Gmail,COMMUNICATION,4-5 Stars,Trying delete mails.. delete button top drive ...,Positive,0.309524,0.571429,0.5994,Positive
2,Gmail,COMMUNICATION,4-5 Stars,The folder creation option. It useful could ma...,Positive,0.233939,0.324242,0.8360,Positive
3,Gmail,COMMUNICATION,4-5 Stars,Long time user G-mail. Has always worked extre...,Positive,0.400998,0.664526,0.9617,Positive
4,Gmail,COMMUNICATION,4-5 Stars,The frustrating part replying often defaults G...,Positive,0.3,0.95,0.3182,Positive
...,...,...,...,...,...,...,...,...,...
27273,Firefox Focus: The privacy browser,COMMUNICATION,4-5 Stars,Nice broser slow browsing speed... make 8mbps ...,Positive,0.1,0.492308,0.6908,Positive
27274,Firefox Focus: The privacy browser,COMMUNICATION,4-5 Stars,The thing I found missing simple bookmark draw...,Positive,0.225,0.426786,0.3612,Positive
27275,Firefox Focus: The privacy browser,COMMUNICATION,4-5 Stars,Great Relief unwanted pop ups showing up. What...,Positive,0.65,0.625,0.7430,Positive
27276,Firefox Focus: The privacy browser,COMMUNICATION,4-5 Stars,Hoped found new go-to; LOVE Firefox PC. Aside ...,Positive,0.345455,0.484848,0.7456,Positive


Using inner join to merge the filtered apps with the reviews df, we only get apps with rating 3.0 or higher.

In [39]:
# Group by Category, Rating_group, and Custom_Sentiment
sentiment_dist = (
    filtered_merged1
    .groupby(['Category', 'Rating_group', 'Custom_Sentiment'])
    .size()
    .reset_index(name='Count')
)
sentiment_dist

Unnamed: 0,Category,Rating_group,Custom_Sentiment,Count
0,COMMUNICATION,3-4 Stars,Negative,26
1,COMMUNICATION,3-4 Stars,Neutral,7
2,COMMUNICATION,3-4 Stars,Positive,32
3,COMMUNICATION,4-5 Stars,Negative,343
4,COMMUNICATION,4-5 Stars,Neutral,248
5,COMMUNICATION,4-5 Stars,Positive,1154
6,FAMILY,3-4 Stars,Negative,59
7,FAMILY,3-4 Stars,Neutral,25
8,FAMILY,3-4 Stars,Positive,99
9,FAMILY,4-5 Stars,Negative,803


In [40]:
# Pivot for stacked bar chart
sentiment_pivot = sentiment_dist.pivot_table(
    index=['Category', 'Rating_group'],
    columns='Custom_Sentiment',
    values='Count',
    fill_value=0
).reset_index()

sentiment_pivot

Custom_Sentiment,Category,Rating_group,Negative,Neutral,Positive
0,COMMUNICATION,3-4 Stars,26.0,7.0,32.0
1,COMMUNICATION,4-5 Stars,343.0,248.0,1154.0
2,FAMILY,3-4 Stars,59.0,25.0,99.0
3,FAMILY,4-5 Stars,803.0,392.0,3975.0
4,GAME,3-4 Stars,18.0,9.0,95.0
5,GAME,4-5 Stars,4019.0,666.0,12379.0
6,SOCIAL,3-4 Stars,53.0,29.0,106.0
7,SOCIAL,4-5 Stars,326.0,131.0,490.0
8,TOOLS,3-4 Stars,53.0,55.0,129.0
9,TOOLS,4-5 Stars,303.0,215.0,1039.0


In [41]:
# Plotly stacked bar chart
fig2 = px.bar(
    sentiment_pivot,
    x='Rating_group',
    y=['Positive', 'Neutral', 'Negative'],
    color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'},
    facet_col='Category',
    title='Sentiment Distribution by Rating Group (Top 5 Categories, >1,000 Reviews)',
    labels={'value': 'Review Count', 'Rating_group': 'Rating Group'},
    barmode='stack'
)
pio.show(fig2)

## Task 2: Create an interactive Choropleth map using Plotly to visualize global installs by Category. Apply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million. The app category should not start with the characters “A,” “C,” “G,” or “S.” This graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself.

In [42]:
apps_df2 = apps_df.copy(deep=True)
review_df2 = review_df.copy(deep=True)

In [43]:
from datetime import datetime
import pytz

In [44]:
# Get current time in IST
ist = pytz.timezone('Asia/Kolkata')
now_ist = datetime.now(ist)
start_time = now_ist.replace(hour=18, minute=0, second=0, microsecond=0)
end_time = now_ist.replace(hour=20, minute=0, second=0, microsecond=0)

#### Creating Top 5 Categories first and removing apps starting with 'A', 'C', 'G', 'S' second.

In [45]:
top_5_df = apps_df2.groupby('Category')['Reviews'].sum().nlargest(5)
top_5_df

Category
GAME             1415533878
COMMUNICATION     601273091
SOCIAL            533576498
FAMILY            396768720
TOOLS             273184168
Name: Reviews, dtype: Int64

In [46]:
# filtering apps based on the top 5 categories
# and excluding categories starting with A, C, G, S
top_categories = top_5_df.index.tolist()

filtered_cats2 = [cat for cat in top_categories if not cat.startswith(('A', 'C', 'G', 'S'))]
filtered_df2 = apps_df2[apps_df2['Category'].isin(filtered_cats2)]
filtered_df2

# This removes categories starting with A, C, G, S from the top 5 categories. The output results in a DataFrame with apps from the 2 categories remaining.

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year
2014,YouTube Kids,FAMILY,4.5,470694,,50000000,Free,0.0,Everyone,Entertainment;Music & Video,2018-08-03,3.43.3,4.1 and up,17.727534,13.061963,2018
2015,Candy Bomb,FAMILY,4.4,42145,20.0,10000000,Free,0.0,Everyone,Casual;Brain Games,2018-07-04,2.9.3181,4.0.3 and up,16.118096,10.648871,2018
2016,ROBLOX,FAMILY,4.5,4449910,67.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up,18.420681,15.308394,2018
2017,Jewels Crush- Match 3 Puzzle,FAMILY,4.4,14774,19.0,1000000,Free,0.0,Everyone,Casual;Brain Games,2018-07-23,1.9.3901,4.0.3 and up,13.815511,9.600624,2018
2018,Coloring & Learn,FAMILY,4.4,12753,51.0,5000000,Free,0.0,Everyone,Educational;Creativity,2018-07-17,1.49,4.0.3 and up,15.424948,9.453522,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10820,Fr. Daoud Lamei,FAMILY,5.0,22,8.6,1000,Free,0.0,Teen,Education,2018-06-27,3.8.0,4.1 and up,6.907755,3.091042,2018
10827,Fr Agnel Ambarnath,FAMILY,4.2,117,13.0,5000,Free,0.0,Everyone,Education,2018-06-13,2.0.20,4.0.3 and up,8.517193,4.762174,2018
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.94591,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,2017


In [47]:
# Aggregate installs by category
installs_by_cat2 = filtered_df2.groupby('Category')['Installs'].sum().reset_index()
installs_by_cat2

Unnamed: 0,Category,Installs
0,FAMILY,10041130590
1,TOOLS,11450724500


In [48]:
# Highlight categories with installs > 1 million
installs_by_cat2['Highlight'] = installs_by_cat2['Installs'] > 1_000_000

installs_by_cat2

Unnamed: 0,Category,Installs,Highlight
0,FAMILY,10041130590,True
1,TOOLS,11450724500,True


In [49]:
# Since we don't have country info, use a bar chart as a proxy for choropleth
fig = px.bar(
    installs_by_cat2,
    x='Category',
    y='Installs',
    color='Highlight',
    color_discrete_map={True: 'crimson', False: 'steelblue'},
    title='Global Installs by Category (Filtered, Highlight >1M Installs)',
    labels={'Installs': 'Total Installs', 'Category': 'App Category'}
)
pio.show(fig)

In [50]:
if start_time <= now_ist <= end_time:
    fig = px.bar(
        installs_by_cat2,
        x='Category',
        y='Installs',
        color='Highlight',
        color_discrete_map={True: 'crimson', False: 'steelblue'},
        title='Global Installs by Category (Filtered, Highlight >1M Installs)',
        labels={'Installs': 'Total Installs', 'Category': 'App Category'}
    )
    pio.show(fig)
else:
    print("Choropleth map is only available between 6 PM and 8 PM IST.")

Choropleth map is only available between 6 PM and 8 PM IST.


In [51]:
# Randomly assigning countries to the installs_by_cat DataFrame
import random

# List of some country ISO Alpha-3 codes for demonstration
country_codes = [
    'USA', 'IND', 'BRA', 'RUS', 'CHN', 'FRA', 'DEU', 'GBR', 'AUS', 'CAN',
    'JPN', 'KOR', 'ITA', 'ESP', 'TUR', 'MEX', 'IDN', 'SAU', 'ZAF', 'EGY'
]

# Randomly assign a country to each row in installs_by_cat
installs_by_cat2['Country'] = random.choices(country_codes, k=len(installs_by_cat2))

installs_by_cat2

Unnamed: 0,Category,Installs,Highlight,Country
0,FAMILY,10041130590,True,USA
1,TOOLS,11450724500,True,CHN


In [52]:
# Plotly choropleth map

fig_choropleth = px.choropleth(
    installs_by_cat2,
    locations='Country',
    color='Installs',
    hover_name='Category',
    color_continuous_scale='thermal',
    title='Mock Global Installs by Category (Random Countries)',
    labels={'Installs': 'Total Installs'}
)
pio.show(fig_choropleth)


In [53]:
# Plotly choropleth map with time constraints
if start_time <= now_ist <= end_time:
    fig_choropleth = px.choropleth(
        installs_by_cat2,
        locations='Country',
        color='Installs',
        hover_name='Category',
        color_continuous_scale='thermal',
        title='Mock Global Installs by Category (Random Countries)',
        labels={'Installs': 'Total Installs'}
    )
    pio.show(fig_choropleth)

else:
    print("Choropleth map is only available between 6 PM and 8 PM IST.")

Choropleth map is only available between 6 PM and 8 PM IST.


#### Removing categoires with 'A', 'C', 'G', 'S' first, then selecting the top 5 categoires.

In [54]:
cats_df = apps_df2.groupby('Category')['Reviews'].sum()
cats_df

Category
ART_AND_DESIGN            1714372
AUTO_AND_VEHICLES         1163630
BEAUTY                     395133
BOOKS_AND_REFERENCE      21872818
BUSINESS                 12357705
COMICS                    3381945
COMMUNICATION           601273091
DATING                    5545164
EDUCATION                23164724
ENTERTAINMENT            47570716
EVENTS                     160590
FAMILY                  396768720
FINANCE                  16999569
FOOD_AND_DRINK            7671255
GAME                   1415533878
HEALTH_AND_FITNESS       30845065
HOUSE_AND_HOME            2794391
LIBRARIES_AND_DEMO        1033535
LIFESTYLE                12819560
MAPS_AND_NAVIGATION      30659060
MEDICAL                   1396427
NEWS_AND_MAGAZINES       38244909
PARENTING                  953609
PERSONALIZATION          75192838
PHOTOGRAPHY             204297367
PRODUCTIVITY            102554240
SHOPPING                 94930973
SOCIAL                  533576498
SPORTS                   65322107
TOOLS

In [55]:
# filtering apps based on the top 5 categories
# and excluding categories starting with A, C, G, S
top_categories = cats_df.index.tolist()

filtered_cats3 = [cat for cat in top_categories if not cat.startswith(('A', 'C', 'G', 'S'))]
filtered_df3 = apps_df2[apps_df2['Category'].isin(filtered_cats3)]
filtered_df3

# This removes categories starting with A, C, G, S from the top 5 categories. The output results in a DataFrame with apps from the 2 categories remaining.

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year
98,Hush - Beauty for Everyone,BEAUTY,4.7,18900,17.0,500000,Free,0.0,Everyone,Beauty,2018-08-02,6.10.1,5.0 and up,13.122363,9.846917,2018
99,"ipsy: Makeup, Beauty, and Tips",BEAUTY,4.9,49790,14.0,1000000,Free,0.0,Everyone,Beauty,2017-11-09,2.3.0,4.1 and up,13.815511,10.815569,2017
100,Natural recipes for your beauty,BEAUTY,4.7,1150,9.8,100000,Free,0.0,Everyone,Beauty,2018-05-15,4.0,4.1 and up,11.512925,7.047517,2018
101,"BestCam Selfie-selfie, beauty camera, photo ed...",BEAUTY,3.9,1739,21.0,500000,Free,0.0,Everyone,Beauty,2018-07-12,1.0.6,4.0.3 and up,13.122363,7.461066,2018
102,Mirror - Zoom & Exposure -,BEAUTY,3.9,32090,,1000000,Free,0.0,Everyone,Beauty,2016-10-24,Varies with device,Varies with device,13.815511,10.3763,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.94591,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,2017
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.60517,1.386294,2018
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.907755,4.736198,2015


In [56]:
top_5_df_filtered = filtered_df3.groupby('Category')['Reviews'].sum().nlargest(5)
top_5_df_filtered

Category
FAMILY           396768720
TOOLS            273184168
PHOTOGRAPHY      204297367
VIDEO_PLAYERS    110379692
PRODUCTIVITY     102554240
Name: Reviews, dtype: Int64

In [57]:
# Aggregate installs by category
installs_by_cat3 = filtered_df3.groupby('Category')['Installs'].sum().reset_index()
installs_by_cat3

Unnamed: 0,Category,Installs
0,BEAUTY,26916200
1,BOOKS_AND_REFERENCE,1916291655
2,BUSINESS,863518120
3,DATING,206522410
4,EDUCATION,533852000
5,ENTERTAINMENT,2455660000
6,EVENTS,15949410
7,FAMILY,10041130590
8,FINANCE,770312400
9,FOOD_AND_DRINK,257777750


In [58]:
# Highlight categories with installs > 1 million
installs_by_cat3['Highlight'] = installs_by_cat3['Installs'] > 1_000_000

top_5_installs_by_cat3 = installs_by_cat3.nlargest(5, 'Installs')
top_5_installs_by_cat3

Unnamed: 0,Category,Installs,Highlight
20,PRODUCTIVITY,12463070180,True
21,TOOLS,11450724500,True
7,FAMILY,10041130590,True
19,PHOTOGRAPHY,9721243130,True
22,TRAVEL_AND_LOCAL,6361859300,True


In [59]:
# Since we don't have country info, use a bar chart as a proxy for choropleth
fig = px.bar(
    top_5_installs_by_cat3,
    x='Category',
    y='Installs',
    color='Highlight',
    color_discrete_map={True: 'crimson', False: 'steelblue'},
    title='Global Installs by Category (Filtered, Highlight >1M Installs)',
    labels={'Installs': 'Total Installs', 'Category': 'App Category'}
)
pio.show(fig)

In [60]:
# Randomly assigning countries to the installs_by_cat DataFrame
import random

# List of some country ISO Alpha-3 codes for demonstration
country_codes = [
    'USA', 'IND', 'BRA', 'RUS', 'CHN', 'FRA', 'DEU', 'GBR', 'AUS', 'CAN',
    'JPN', 'KOR', 'ITA', 'ESP', 'TUR', 'MEX', 'IDN', 'SAU', 'ZAF', 'EGY'
]

# Randomly assign a country to each row in installs_by_cat
top_5_installs_by_cat3['Country'] = random.choices(country_codes, k=len(top_5_installs_by_cat3))

top_5_installs_by_cat3

Unnamed: 0,Category,Installs,Highlight,Country
20,PRODUCTIVITY,12463070180,True,DEU
21,TOOLS,11450724500,True,IDN
7,FAMILY,10041130590,True,USA
19,PHOTOGRAPHY,9721243130,True,DEU
22,TRAVEL_AND_LOCAL,6361859300,True,ESP


In [61]:
# Plotly choropleth map
fig_choropleth = px.choropleth(
    top_5_installs_by_cat3,
    locations='Country',
    color='Installs',
    hover_name='Category',
    color_continuous_scale='thermal',
    title='Mock Global Installs by Category (Random Countries)',
    labels={'Installs': 'Total Installs'}
)
pio.show(fig_choropleth)

In [62]:
# Plotly choropleth map with time constraints
if start_time <= now_ist <= end_time:
    fig_choropleth = px.choropleth(
        top_5_installs_by_cat3,
        locations='Country',
        color='Installs',
        hover_name='Category',
        color_continuous_scale='thermal',
        title='Mock Global Installs by Category (Random Countries)',
        labels={'Installs': 'Total Installs'}
    )
    pio.show(fig_choropleth)

else:
    print("Choropleth map is only available between 6 PM and 8 PM IST.")

Choropleth map is only available between 6 PM and 8 PM IST.


## Task 3: Plot a bubble chart to analyze the relationship between app size (in MB) and average rating, with the bubble size representing the number of installs. Include a filter to show only apps with a rating higher than 3.5 and that belong to the Game, Beauty ,business , commics , commication , Dating , Entertainment , social and event categories. Reviews should be greater than 500 and the app name should not contain letter "S" and sentiment subjectivity should be more than 0.5 and highlight the Game Category chart in Pink color. We have to translate the Beauty category in Hindi and Business category in Tamil and Dating category in German while showing it on Graphs. Installs should be more than 50k as well as this graph should work only between 5 PM IST to 7 PM IST apart from that time we should not show this graph in dashboard itself.

In [63]:
apps_df3 = apps_df.copy(deep=True)
review_df3 = review_df.copy(deep=True)

In [64]:
# Define the required categories (case-insensitive match)
categories = [
    "GAME", "BEAUTY", "BUSINESS", "COMICS", "COMMUNICATION",
    "DATING", "ENTERTAINMENT", "SOCIAL", "EVENTS"
]

In [65]:
# Prepare translation mapping for categories
category_translation = {
    "BEAUTY": "सौंदर्य",        # Hindi
    "BUSINESS": "வணிகம்",      # Tamil
    "DATING": "Datierung"    # German
}

In [66]:
# Merge apps_df with review_df to get sentiment subjectivity
merged_bubble = pd.merge(
    apps_df,
    review_df[['App', 'Sentiment_Subjectivity']],
    on='App',
    how='inner'
)
merged_bubble

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,1.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,0.0
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,0.6
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,2018,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,2018-07-06,5.2,5.0 and up,13.815511,10.51816,2018,0.492308
59120,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,2018-07-06,5.2,5.0 and up,13.815511,10.51816,2018,0.426786
59121,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,2018-07-06,5.2,5.0 and up,13.815511,10.51816,2018,0.625
59122,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0,1000000,Free,0.0,Everyone,Communication,2018-07-06,5.2,5.0 and up,13.815511,10.51816,2018,0.484848


In [67]:
# Apply all filters
filtered_bubble = merged_bubble[
    (merged_bubble['Rating'] > 3.5) &
    (merged_bubble['Category'].str.upper().isin(categories)) &
    (merged_bubble['Reviews'] > 500) &
    (~merged_bubble['App'].str.contains('s', case=False, na=False)) &
    (merged_bubble['Sentiment_Subjectivity'] > 0.5) &
    (merged_bubble['Installs'] > 50000)
].copy()

In [68]:
# Translate categories for plotting
def translate_category(cat):
    cat_upper = cat.upper()
    if cat_upper in category_translation:
        return category_translation[cat_upper]
    return cat

filtered_bubble['Category_Display'] = filtered_bubble['Category'].apply(translate_category)

In [69]:
# Set color: pink for Game, default for others
color_map = {
    "GAME": "pink"
}
filtered_bubble['Color'] = filtered_bubble['Category'].apply(
    lambda x: color_map.get(x.upper(), None)
)

In [70]:
filtered_bubble

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log Installs,Log Reviews,Year,Sentiment_Subjectivity,Category_Display,Color
1790,Google Primer,BUSINESS,4.4,62272,18.0,10000000,Free,0.0,Everyone,Business,2018-06-26,3.550.2,4.1 and up,16.118096,11.039267,2018,0.75,வணிகம்,
1791,Google Primer,BUSINESS,4.4,62272,18.0,10000000,Free,0.0,Everyone,Business,2018-06-26,3.550.2,4.1 and up,16.118096,11.039267,2018,0.6,வணிகம்,
1827,Box,BUSINESS,4.2,159872,,10000000,Free,0.0,Everyone,Business,2018-07-31,Varies with device,Varies with device,16.118096,11.982129,2018,0.758333,வணிகம்,
1830,Box,BUSINESS,4.2,159872,,10000000,Free,0.0,Everyone,Business,2018-07-31,Varies with device,Varies with device,16.118096,11.982129,2018,0.611111,வணிகம்,
1834,Box,BUSINESS,4.2,159872,,10000000,Free,0.0,Everyone,Business,2018-07-31,Varies with device,Varies with device,16.118096,11.982129,2018,0.616667,வணிகம்,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59023,Garena Free Fire,GAME,4.5,5534114,53.0,100000000,Free,0.0,Teen,Action,2018-08-03,1.21.0,4.0.3 and up,18.420681,15.526442,2018,0.716667,GAME,pink
59024,Garena Free Fire,GAME,4.5,5534114,53.0,100000000,Free,0.0,Teen,Action,2018-08-03,1.21.0,4.0.3 and up,18.420681,15.526442,2018,1.0,GAME,pink
59025,Garena Free Fire,GAME,4.5,5534114,53.0,100000000,Free,0.0,Teen,Action,2018-08-03,1.21.0,4.0.3 and up,18.420681,15.526442,2018,0.625,GAME,pink
59028,Garena Free Fire,GAME,4.5,5534114,53.0,100000000,Free,0.0,Teen,Action,2018-08-03,1.21.0,4.0.3 and up,18.420681,15.526442,2018,0.666667,GAME,pink


In [71]:
# Plot bubble chart

fig_bubble = px.scatter(
    filtered_bubble,
    x='Size',
    y='Rating',
    size='Installs',
    color='Category_Display',
    color_discrete_map={'GAME': 'pink', 'सौंदर्य': '#636efa', 'வணிகம்': '#ef553b', 'Datierung': '#00cc96'},
    hover_data=['App', 'Reviews', 'Installs'],
    title='App Size vs. Average Rating (Filtered, Bubble Size = Installs)',
    labels={'Size': 'App Size (MB)', 'Rating': 'Average Rating', 'Category_Display': 'Category',},
    size_max=500,
)

pio.show(fig_bubble)

In [72]:
# Define the time window for the chart (5 PM to 7 PM IST)
bubble_start_time = now_ist.replace(hour=17, minute=0, second=0, microsecond=0)
bubble_end_time = now_ist.replace(hour=19, minute=0, second=0, microsecond=0)

if bubble_start_time <= now_ist <= bubble_end_time:
    fig_bubble = px.scatter(
    filtered_bubble,
    x='Size',
    y='Rating',
    size='Installs',
    color='Category_Display',
    color_discrete_map={'GAME': 'pink', 'सौंदर्य': '#636efa', 'வணிகம்': '#ef553b', 'Datierung': '#00cc96'},
    hover_data=['App', 'Reviews', 'Installs'],
    title='App Size vs. Average Rating (Filtered, Bubble Size = Installs)',
    labels={'Size': 'App Size (MB)', 'Rating': 'Average Rating', 'Category_Display': 'Category',},
    size_max=500,
    )

    pio.show(fig_bubble)
    
else:
    print("Bubble chart is only available between 5 PM and 7 PM IST.")

Bubble chart is only available between 5 PM and 7 PM IST.
