In [8]:
import os
import webbrowser
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [9]:
# Read the cleaned Play Store app table from disk.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
app_data = pd.read_csv(
    filepath_or_buffer="D:/Google Play Store Intern/Clean Play Store Data.csv"
)

In [21]:
# Read the cleaned user-review table from disk.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
review_data = pd.read_csv(
    filepath_or_buffer="D:/Google Play Store Intern/Clean User Reviews.csv"
)

In [11]:
# Sanity check: display the (rows, columns) shape of both loaded frames.
# Both loading cells must have been run before this one.
app_data.shape, review_data.shape

(9659, 16)

In [13]:
# Output directory for the per-figure HTML files and the dashboard page.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
html_file_path = 'C:\\Users\\PUNEET\\Desktop\\ml\\Google Play Store\\plotly_graph1'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race;
# it still raises if the path exists but is not a directory.
os.makedirs(html_file_path, exist_ok=True)

In [14]:
# Save each plotly Figure to a html file
plot_containers = ''
# Container markup keyed by filename. Re-running a figure cell replaces that
# figure's entry instead of appending a duplicate block to the dashboard.
_plot_sections = {}
def save_Plot_as_html(fig, filename, insight):
    """Write `fig` to its own HTML file and register it for the dashboard.

    Parameters
    ----------
    fig : plotly figure
        Figure to render; it is passed to `pio.to_html` and `fig.write_html`.
    filename : str
        File name (joined onto `html_file_path`) for the standalone HTML
        fragment. It is also used as the container's element id and as the
        argument of the `openPlot(...)` click handler.
    insight : str
        Text placed in the container's "insights" div. It is inserted into
        the markup as-is (not HTML-escaped).

    Side effects
    ------------
    Rebuilds the module-level `plot_containers` string from all registered
    sections (in first-registration order) and writes the figure to disk.
    """
    global plot_containers
    filepath = os.path.join(html_file_path, filename)
    html_content = pio.to_html(fig,full_html = False, include_plotlyjs = 'inline')
    # Store (or replace) this plot's container, then rebuild the combined markup
    _plot_sections[filename] = f'''
    <div class = "plot-container" id = "{filename}" onclick = "openPlot('{filename}')">
        <div class = "plot">{html_content}</div>
        <div class = "insights">{insight}</div>
    </div>
    '''
    plot_containers = ''.join(_plot_sections.values())
    fig.write_html(filepath, full_html = False, include_plotlyjs = 'inline')
    

In [15]:
# Shared styling constants for the dashboard.
# No trailing commas here: `plot_width = 500,` binds the 1-tuple (500,), which the
# dashboard template would render as the invalid CSS value "(500,)px".
plot_width = 500
plot_height = 400
plot_bg_color = 'black'
text_color = '#0E2148'
title_font = {'size':16}
axis_font = {'size':12}

In [16]:
# Figure 1 -> Bar chart of the ten categories with the most rows in app_data
Category_count = app_data['Category'].value_counts().nlargest(10)
# update_layout returns the figure it was called on, so the themed chart is bound to fig1
fig1 = px.bar(
    x=Category_count.index,
    y=Category_count.values,
    color=Category_count.index,
    color_discrete_sequence=px.colors.qualitative.Plotly,
    labels=dict(x='Category', y='Count'),
    title='Top Categories on PlayStore',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 12}},
    yaxis={'title_font': {'size': 12}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)

save_Plot_as_html(
    fig1,
    'Category graph 1.html',
    'The top category on the play store are dominated by tools, entertainment and productivity apps',
)

In [17]:
# Figure 2 -> Pie chart of how many rows fall under each value of the 'Type' column
Type_count = app_data['Type'].value_counts()
# update_layout returns the figure it was called on, so the themed chart is bound to fig2
fig2 = px.pie(
    names=Type_count.index,
    values=Type_count.values,
    color_discrete_sequence=px.colors.qualitative.Plotly,
    title='App type distribution',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
save_Plot_as_html(
    fig2,
    'Type graph 2.html',
    'Mostly free apps on playstore are free, Indicating a strategy to attracting users first and nonetize through ads or in apps purcheses',
)

In [18]:
# Figure 3 -> Histogram of the 'Rating' column (at most 20 bins)
# update_layout returns the figure it was called on, so the themed chart is bound to fig3
fig3 = px.histogram(
    app_data,
    x='Rating',
    nbins=20,
    color_discrete_sequence=['#3E4CAC'],
    title='Rating distribution',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 12}},
    yaxis={'title_font': {'size': 12}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
save_Plot_as_html(
    fig3,
    'Rating graph 3.html',
    'rating are skewed towards higher values, suggesting that most apps are rated favorably by users',
)

In [24]:
# Figure 4 -> Bar chart: number of review_data rows per distinct Avg_Sentiment_Score
# NOTE(review): if Avg_Sentiment_Score is a continuous float column, value_counts()
# gives one bar per distinct score and px may treat `color` as a continuous scale
# (ignoring color_discrete_sequence) — confirm the dtype; a histogram may fit better.
Sentiment_count = review_data['Avg_Sentiment_Score'].value_counts()
# update_layout returns the figure it was called on, so the themed chart is bound to fig4
fig4 = px.bar(
    x=Sentiment_count.index,
    y=Sentiment_count.values,
    color=Sentiment_count.index,
    color_discrete_sequence=['#3E4CAC'],
    labels=dict(x='Sentiment Score', y='Count'),
    title='Sentiment Distribution',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 12}},
    yaxis={'title_font': {'size': 12}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)

save_Plot_as_html(
    fig4,
    'Sentiment graph 4.html',
    'Sentiment in review show a mix of positive and negative feedback, with a slight lean towards positive sentiments',
)

In [25]:
# Figure 5 -> Bar chart of the ten categories with the largest summed 'Installs'
installs_by_Category = (
    app_data
    .groupby('Category')['Installs']
    .sum()
    .nlargest(10)
)
# update_layout returns the figure it was called on, so the themed chart is bound to fig5
fig5 = px.bar(
    x=installs_by_Category.index,
    y=installs_by_Category.values,
    color=installs_by_Category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    labels=dict(x='Category', y='install'),
    title='Top App Installs by Category',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 12}},
    yaxis={'title_font': {'size': 12}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)

save_Plot_as_html(
    fig5,
    'Installs graph 5.html',
    'The Category with the most install are games and social app, reflecting their Broad Appeal and daily usage',
)

In [26]:
# Figure 6: line chart of how many rows fall in each 'Update_Year', in year order
Updates_per_year= app_data['Update_Year'].value_counts().sort_index()
fig6 = px.line(
    x=Updates_per_year.index,
    y=Updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of App Updates Over the Years',
    color_discrete_sequence=['#3E4CAC'],
    width=500,
    height=400
)

# Apply the same theme (background colours, fonts, tight margins) that every other
# dashboard figure uses; without it this chart keeps plotly's default look and
# stands out from the rest of the page.
fig6.update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

save_Plot_as_html(fig6,'app_update graph 6.html', 'apps Updates has been over increasing over the Year, it showing the developer actively Maintaining and improving their apps')

In [27]:
# Figure 7 -> Bar chart of the ten categories with the largest summed 'Revenue'
Revenue_by_category = (
    app_data
    .groupby('Category')['Revenue']
    .sum()
    .nlargest(10)
)
# update_layout returns the figure it was called on, so the themed chart is bound to fig7
fig7 = px.bar(
    x=Revenue_by_category.index,
    y=Revenue_by_category.values,
    color=Revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    labels=dict(x='App Categories', y='Revenue'),
    title='Revenue By Categories',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 14}},
    yaxis={'title_font': {'size': 14}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
save_Plot_as_html(
    fig7,
    'Revenue graph 7.html',
    'Family and lifestyle App Category generate more revenue, indicating their monetizing potential',
)

In [28]:
# Figure 8 -> Bar chart of the ten most frequent genres.
# 'Genres' cells are split on ';' so each part of a multi-genre entry is counted separately.
genre_count = (
    app_data['Genres']
    .str.split(';', expand=True)
    .stack()
    .value_counts()
    .nlargest(10)
)
# update_layout returns the figure it was called on, so the themed chart is bound to fig8
fig8 = px.bar(
    x=genre_count.index,
    y=genre_count.values,
    color=genre_count.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    labels=dict(x='Genre', y='Count'),
    title='Genres Count',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 14}},
    yaxis={'title_font': {'size': 14}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
save_Plot_as_html(
    fig8,
    'Genre graph 8.html',
    'Tools and education genre are the most common, reflecting users preference for engaging and easy to use',
)

In [29]:
# Figure 9: scatter of 'Rating' against 'Last Updated', coloured by app 'Type'
fig9 = px.scatter(
    app_data,
    x = 'Last Updated',
    y = 'Rating',
    color = 'Type',
    # When a data frame is passed, `labels` is keyed by column name. The previous
    # {'x': 'Genre', 'y': 'Count'} mapping was a copy-paste leftover: its keys matched
    # no column (so it had no effect) and its values described a different chart.
    labels = {'Last Updated': 'Last Updated', 'Rating': 'Rating'},
    title = 'Impact of last update on rating',
    color_discrete_sequence = px.colors.qualitative.Vivid,
    width = 500,
    height = 400
)
# Shared dashboard theme: background colours, font sizes and tight margins
fig9.update_layout(
    plot_bgcolor = '#7F8CAA',
    paper_bgcolor = '#898AC4',
    font_color = '#0E2148',
    title_font = {'size':16},
    xaxis = dict(title_font = {'size':14}),
    yaxis = dict(title_font = {'size':14}),
    margin = dict(l=10,r=10,t=30,b=10)
)
save_Plot_as_html(fig9,'Rating graph 9.html', 'The Scatter plot show the correlation between Rating and Last update, Suggesting the more frequent updates for better user experience')

In [30]:
# Figure 10 -> Box plot of 'Rating' for each value of 'Type', one colour per type
# update_layout returns the figure it was called on, so the themed chart is bound to fig10
fig10 = px.box(
    app_data,
    x='Type',
    y='Rating',
    color='Type',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title='Rating for Paid vs Free App',
    width=500,
    height=400,
).update_layout(
    plot_bgcolor='#7F8CAA',
    paper_bgcolor='#898AC4',
    font_color='#0E2148',
    title_font=dict(size=16),
    xaxis={'title_font': {'size': 14}},
    yaxis={'title_font': {'size': 14}},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
save_Plot_as_html(
    fig10,
    'Paid Vs Free graph 10.html',
    'The Paid Apps Generally have Higher rating compare to Free Apps, Suggesting That Users expect Higher quality for their pay for apps',
)

In [31]:
# Break the accumulated container markup into pieces at every closing </div> tag.
# NOTE(review): the later visible cells build the dashboard from plot_containers
# directly, not from this split — confirm whether this is still needed.
plot_containers_Split = plot_containers.split('</div>')

In [32]:
# When the split produced more than one piece, keep the second-to-last piece and
# re-append the closing tag the split removed; otherwise fall back to the whole
# container markup.
final_plot = (
    plot_containers_Split[-2] + '</div>'
    if len(plot_containers_Split) > 1
    else plot_containers
)

In [33]:
# HTML template for the dashboard page that shows every saved chart.
# The template is filled with str.format, so:
#   - {plots}        receives the accumulated plot container markup,
#   - {plot_width} / {plot_height} set the CSS size (in px) of each .plot-container,
#   - literal CSS/JS braces are written doubled ({{ and }}) to survive formatting.
# The openPlot() script is the click handler referenced by each plot container;
# the .insights box is hidden until its container is hovered.
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Google Play Store Review Analytics</title>
    <style>
        body{{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            padding: 0;
            margin: 0;
        }}
        .header{{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
  </head>
  <body>
      <div class = "header">
          <img src="https://upload.wikimedia.org/wikipedia/commons/2/2f/Google_2015_logo.svg" alt="Google Logo" height="50">
          <h1>Google Play Store Review Analytics</h1>
          <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Logo" height="50">
      </div>
      <div class = "container">
          {plots}
      </div>
  </body>
</html>
"""

In [34]:
# Fill the dashboard template: the plot markup goes into {plots}, and the two size
# values are interpolated directly in front of the "px" unit in the CSS.
final_html = dashboard_html.format(
    plots=plot_containers,
    plot_width=plot_width,
    plot_height=plot_height,
)

In [35]:
# Full path of the dashboard page, placed in the same directory as the per-figure HTML files.
dashboard_path = os.path.join(html_file_path, "web page.html")

In [36]:
# Persist the rendered dashboard page as UTF-8 text (overwrites any existing file).
with open(dashboard_path, mode="w", encoding="utf-8") as dashboard_file:
    dashboard_file.write(final_html)

In [38]:
# Open the dashboard in the default browser.
# Path.as_uri() builds a well-formed, percent-encoded file URI (spaces become %20,
# Windows drives become 'file:///C:/...'), which plain 'file://' + path
# concatenation does not guarantee.
webbrowser.open(Path(dashboard_path).resolve().as_uri())

True