In [1]:
#Step 1:Importing Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from datetime import datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pytz
import nltk
import webbrowser
import os
import re                      

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\souja\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
#You are required to create a stacked area chart to visualize the cumulative number of installs over time for each app category, with each category 
#represented as a separate color band in the chart. Apply the following filters before plotting: include only apps with an average rating of at least
#4.2, app names that do not contain any numbers, app categories that start with the letter “T” or “P,” reviews greater than 1,000, and app sizes between
#20 MB and 80 MB. In the chart legend, translate “Travel & Local” into French, “Productivity” into Spanish, and “Photography” into Japanese. Highlight
#by increasing the color intensity for any month where total installs increased by more than 25% month-over-month for any category. This visualization
#must only be displayed between 4 PM IST and 6 PM IST, and it should not appear on the dashboard outside this time window.

In [4]:
#Step 2:Data Cleaning

In [5]:
apps_df=pd.read_csv('Play Store Data.csv')
reviews_df=pd.read_csv('User Reviews.csv')

In [6]:
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

In [7]:
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,,,,
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122657,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122658,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122659,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,
122660,FP Notebook,MEDICAL,4.5,410,60M,"50,000+",Free,0,Everyone,Medical,"March 24, 2018",2.1.0.372,4.4 and up,,,,


In [8]:
merged_df.dtypes

App                        object
Category                   object
Rating                    float64
Reviews                    object
Size                       object
Installs                   object
Type                       object
Price                      object
Content Rating             object
Genres                     object
Last Updated               object
Current Ver                object
Android Ver                object
Translated_Review          object
Sentiment                  object
Sentiment_Polarity        float64
Sentiment_Subjectivity    float64
dtype: object

In [9]:
merged_df = merged_df[merged_df['Rating'] >= 4.2]

In [10]:
merged_df = merged_df[~merged_df['App'].str.contains(r'\d', regex=True)]

In [11]:
merged_df['Reviews'] = pd.to_numeric(merged_df['Reviews'], errors='coerce')
merged_df = merged_df[merged_df['Reviews'] > 1000]

In [12]:
merged_df['Size'] = merged_df['Size'].replace('Varies with device', np.nan)

def convert_size(size_str):
    if pd.isna(size_str):
        return np.nan
    if 'M' in size_str:
        return float(size_str.replace('M', ''))
    elif 'k' in size_str:
        return float(size_str.replace('k', '')) / 1024  # Convert kB to MB
    else:
        return np.nan

merged_df['Size_MB'] = merged_df['Size'].apply(convert_size)
merged_df = merged_df[(merged_df['Size_MB'] >= 20) & (merged_df['Size_MB'] <= 80)]

In [13]:
merged_df['Category'] = (
    merged_df['Category']
    .str.strip()
    .str.replace('_', ' ')
)

In [14]:
merged_df = merged_df[merged_df['Category'].str.startswith(('T', 'P'))]

In [15]:
category_translation = {
    "Travel And Local": "Voyage et local",   # French
    "Productivity": "Productividad",         # Spanish
    "Photography": "写真",                     # Japanese
    "Tools": "Tools",                        # leave untranslated (or give custom)
    "Personalization": "Personalización",    # Spanish
    "Parenting": "Parentalité"               # French (or choose another)
}

merged_df['Category_Translated'] = merged_df['Category'].map(category_translation)

In [16]:
missing = merged_df[merged_df['Category_Translated'].isna()]['Category'].unique()
print("Missing Translations:", missing)

Missing Translations: ['PHOTOGRAPHY' 'TRAVEL AND LOCAL' 'TOOLS' 'PERSONALIZATION' 'PRODUCTIVITY'
 'PARENTING']


In [17]:
#Step 3:Data Transmission

In [18]:
merged_df['Last_Updated_DT'] = pd.to_datetime(merged_df['Last Updated'], errors='coerce')

In [19]:
merged_df['Year_Month'] = merged_df['Last_Updated_DT'].dt.to_period('M').dt.to_timestamp()


In [20]:
merged_df['Installs'] = (
    merged_df['Installs'].astype(str)
    .str.replace('[+,]', '', regex=True)
    .astype(float)
)

In [21]:
grouped_df = (
    merged_df.groupby(['Year_Month', 'Category_Translated'])['Installs']
    .sum()
    .reset_index()
)

In [22]:
grouped_df = grouped_df.sort_values(by=['Category_Translated', 'Year_Month'])

In [23]:
grouped_df['Cumulative_Installs'] = (
    grouped_df.groupby('Category_Translated')['Installs'].cumsum()
)

In [24]:
grouped_df['Prev_Month_Installs'] = (
    grouped_df.groupby('Category_Translated')['Installs'].shift(1)
)

In [25]:
grouped_df['MoM_Change_Percent'] = (
    (grouped_df['Installs'] - grouped_df['Prev_Month_Installs']) /
    grouped_df['Prev_Month_Installs']
) * 100

In [26]:
grouped_df['Highlight'] = grouped_df['MoM_Change_Percent'] > 25

In [27]:
print(grouped_df.columns.tolist())
print(grouped_df.head())

['Year_Month', 'Category_Translated', 'Installs', 'Cumulative_Installs', 'Prev_Month_Installs', 'MoM_Change_Percent', 'Highlight']
Empty DataFrame
Columns: [Year_Month, Category_Translated, Installs, Cumulative_Installs, Prev_Month_Installs, MoM_Change_Percent, Highlight]
Index: []


In [28]:
#Step 5:Plotting Graph

In [29]:
html_files_paths="./"
if not os.path.exists(html_files_paths):
    os.makedirs(html_files_path)

In [30]:
plot_containers=""

In [31]:
def save_plot_as_html(fig,filename,insight):
    global plot_containers
    filepath= os.path.join(html_files_paths,filename)
    html_content=pio.to_html(fig,full_html=False,include_plotlyjs='inline')
    plot_containers+= f"""
    <div class ="plot-container" id="{filename}" onclick ="openPlot('{filename}')">
       <div class="plot">{html_content}</div> 
       <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath,full_html=False,include_plotlyjs="inline")

In [32]:
ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)

if 16 <= now.hour < 18:

    fig5 = px.area(
        grouped_df,
        x='Year_Month',
        y='Cumulative_Installs',
        color='Category_Translated',
        line_group='Category_Translated',
        labels={'Cumulative_Installs': 'Cumulative Installs', 'Year_Month': 'Month'},
        title='Cumulative App Installs Over Time by Category'
    )
    
    for i, row in grouped_df.iterrows():
        if row['Highlight']:
            fig5.add_vrect(
                x0=str(row['Year_Month']),
                x1=str(row['Year_Month']),
                fillcolor="yellow",
                opacity=0.3,
                layer="below",
                line_width=0
            )
    
    fig5.update_layout(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white'
    )

    fig5.update_yaxes(title="Cumulative Installs", tickformat="~s")

    fig5.show()
    save_plot_as_html(
        fig5,
        "Cumulative_Installs_Over_Time.html",
        "Stacked area chart showing cumulative installs per category over time. Months with >25% growth are highlighted."
    )

else:
    print("Graph is only available between 4 PM and 6 PM IST.")

Graph is only available between 4 PM and 6 PM IST.


In [33]:
plot_containers_split=plot_containers.split('</div')

In [34]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [35]:
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
     <meta charset="UTF-8">
     <meta name=viewport" content="width=device-width,initial-scale-1.0">
     <title>Google Play Store Review Analytics</title>
     <style>
        body {{ 
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
         }}
         .header{{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img{{
            margin: 0 10px;
            height: 50px;
        }}
        .container{{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container{{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights{{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights{{ 
            display: block;
        }}
        </style>
        <script>
             function openPlot(filename){{
                 window.open(filename,'_blank');
                 }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://www.keyweo.com/wp-content/uploads/2022/04/google-logo-history.jpg" alt="Google logo">
            <h1>Google Play Store Review Analytics</h1>
            <img src="https://www.sociocs.com/images/badge-google-play.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [36]:
plot_width=400
plot_height=300
plot_bg_color='black'
text_color='white'
title_font={'size':16}
axis_font={'size':12}
