In [1]:
# Step 1: Importing Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go   
import plotly.io as pio
from plotly.subplots import make_subplots
from datetime import datetime
import pytz
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import webbrowser
import os

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\souja\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
##Create a dual-axis chart comparing the average installs and revenue for free vs. paid apps within the top 3 app categories. Apply filters to exclude
#apps with fewer than 10,000 installs and revenue below $10,000 and android version should be more than 4.0 as well as size should be more than 15M 
#and content rating should be Everyone and app name should not have more than 30 characters including space and special character .this graph should
#work only between 1 PM IST to 2 PM IST apart from that time we should not show this graph in dashboard itself.

In [4]:
apps_df=pd.read_csv('Play Store Data.csv')
reviews_df=pd.read_csv('User Reviews.csv')

In [5]:
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

In [6]:
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,,,,
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122657,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122658,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122659,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122660,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,


In [7]:
#Step 2: Data Cleaning

In [8]:
merged_df['Price'] = merged_df['Price'].replace(r'[\$,]', '', regex=True).astype(float)

In [9]:
merged_df['Installs'] = (
    merged_df['Installs'].astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)

In [10]:
merged_df['Android Ver'] = (
    merged_df['Android Ver'].astype(str)
    .str.extract(r'(\d+\.?\d*)')   # extract first number like 4.0 or 4.1.2
    .astype(float)
)

In [11]:
def convert_size(size):
    if pd.isna(size):
        return np.nan
    size = str(size).strip()
    if 'M' in size:         # size in MB
        return float(size.replace('M', ''))
    elif 'k' in size:       # size in KB
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

merged_df['Size'] = merged_df['Size'].apply(convert_size)

In [12]:
merged_df['Revenue'] = merged_df['Price'] * merged_df['Installs']


In [13]:
merged_df = merged_df[
    (merged_df['Installs'] >= 10000) &
    (
        ((merged_df['Type'] == 'Paid') & (merged_df['Revenue'] >= 10000)) |
        (merged_df['Type'] == 'Free')
    ) &
    (merged_df['Android Ver'] > 4.0) &
    (merged_df['Size'] > 15) &
    (merged_df['Content Rating'] == 'Everyone') &
    (merged_df['App'].str.len() <= 30)
].copy()

In [14]:
#Step 3:Data Transmission

In [15]:
grouped_df = merged_df.groupby(['Category', 'Type']).agg(
    Avg_Installs=('Installs', 'mean'),
    Avg_Revenue=('Revenue', 'mean')
).reset_index()

In [16]:
grouped_df['Avg_Installs'] = grouped_df['Avg_Installs'].round(0)
grouped_df['Avg_Revenue'] = grouped_df['Avg_Revenue'].round(2)


In [17]:
top_categories = (
    grouped_df.groupby('Category')['Avg_Installs'].sum()
    .sort_values(ascending=False)
    .head(3)
    .index
)

In [18]:
top3 = grouped_df[grouped_df['Category'].isin(top_categories)].copy()

print(top3)
print(top3['Type'].value_counts())

   Category  Type  Avg_Installs  Avg_Revenue
8    FAMILY  Free    68362185.0          0.0
11     GAME  Free   165061728.0          0.0
24   SPORTS  Free    20594595.0          0.0
25   SPORTS  Paid       50000.0    1499500.0
Type
Free    3
Paid    1
Name: count, dtype: int64


In [19]:
#Step 4:Sentimental Analysis 

In [20]:
sia = SentimentIntensityAnalyzer()

In [21]:
merged_df['Sentiment_Score'] = merged_df['Translated_Review'].fillna('').apply(lambda x: sia.polarity_scores(x)['compound'])


In [22]:
def label_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

merged_df['Sentiment'] = merged_df['Sentiment_Score'].apply(label_sentiment)


In [23]:
sentiment_summary = merged_df.groupby(['Category', 'Type']).agg(
    Avg_Sentiment=('Sentiment_Score', 'mean')
).reset_index()

print(sentiment_summary.head())

              Category  Type  Avg_Sentiment
0    AUTO_AND_VEHICLES  Free       0.276250
1               BEAUTY  Free       0.000000
2  BOOKS_AND_REFERENCE  Free       0.294462
3             BUSINESS  Free       0.065426
4               COMICS  Free       0.000000


In [24]:
#Step 5:Plotting graph

In [25]:
html_files_paths="./"
if not os.path.exists(html_files_paths):
    os.makedirs(html_files_path)

In [26]:
plot_containers=""

In [27]:
def save_plot_as_html(fig,filename,insight):
    global plot_containers
    filepath= os.path.join(html_files_paths,filename)
    html_content=pio.to_html(fig,full_html=False,include_plotlyjs='inline')
    plot_containers+= f"""
    <div class ="plot-container" id="{filename}" onclick ="openPlot('{filename}')">
       <div class="plot">{html_content}</div> 
       <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath,full_html=False,include_plotlyjs="inline")


In [28]:
pivot_df = top3.pivot(index='Category', columns='Type', values=['Avg_Installs','Avg_Revenue']).fillna(0)
pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]
pivot_df = pivot_df.reset_index()

In [29]:
ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)

if 13 <= now.hour < 14:

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(go.Bar(x=pivot_df['Category'], y=pivot_df['Avg_Installs_Free'], name='Free Installs', marker_color='blue'), secondary_y=False)
    fig.add_trace(go.Bar(x=pivot_df['Category'], y=pivot_df['Avg_Installs_Paid'], name='Paid Installs', marker_color='green'), secondary_y=False)

    fig.add_trace(go.Scatter(x=pivot_df['Category'], y=pivot_df['Avg_Revenue_Free'], mode='lines+markers', name='Free Revenue', marker=dict(color='yellow', size=10)), secondary_y=True)
    fig.add_trace(go.Scatter(x=pivot_df['Category'], y=pivot_df['Avg_Revenue_Paid'], mode='lines+markers', name='Paid Revenue', marker=dict(color='red', size=10)), secondary_y=True)

    fig.update_layout(
        title="Average Installs vs Revenue (Free vs Paid) - Top 3 Categories",
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        barmode='group'
    )
    fig.update_yaxes(title_text="Average Installs", type='log', secondary_y=False)
    fig.update_yaxes(title_text="Average Revenue", secondary_y=True)

    fig.show()

else:
    print("Graph is only available between 1 PM and 2 PM IST.")


Graph is only available between 1 PM and 2 PM IST.


In [30]:
plot_containers_split=plot_containers.split('</div')

In [31]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [32]:
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
     <meta charset="UTF-8">
     <meta name=viewport" content="width=device-width,initial-scale-1.0">
     <title>Google Play Store Review Analytics</title>
     <style>
        body {{ 
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
         }}
         .header{{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img{{
            margin: 0 10px;
            height: 50px;
        }}
        .container{{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container{{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights{{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights{{ 
            display: block;
        }}
        </style>
        <script>
             function openPlot(filename){{
                 window.open(filename,'_blank');
                 }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://www.keyweo.com/wp-content/uploads/2022/04/google-logo-history.jpg" alt="Google logo">
            <h1>Google Play Store Review Analytics</h1>
            <img src="https://www.sociocs.com/images/badge-google-play.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [33]:
plot_width=400
plot_height=300
plot_bg_color='black'
text_color='white'
title_font={'size':16}
axis_font={'size':12}


In [34]:
final_html=dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

In [35]:
dashboard_path=os.path.join(html_files_paths,"web page.html")

In [36]:
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [37]:
webbrowser.open('file://'+os.path.realpath(dashboard_path))

True