In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import nltk
import webbrowser
import os

In [2]:
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')
apps_df.head(), reviews_df.head()


(                                                 App        Category  Rating  \
 0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
 1                                Coloring book moana  ART_AND_DESIGN     3.9   
 2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
 3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
 4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   
 
   Reviews  Size     Installs  Type Price Content Rating  \
 0     159   19M      10,000+  Free     0       Everyone   
 1     967   14M     500,000+  Free     0       Everyone   
 2   87510  8.7M   5,000,000+  Free     0       Everyone   
 3  215644   25M  50,000,000+  Free     0           Teen   
 4     967  2.8M     100,000+  Free     0       Everyone   
 
                       Genres      Last Updated         Current Ver  \
 0               Art & Design   January 7, 2018               1.0.0   
 1  Art 

In [3]:
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating'] <= 5]
reviews_df.dropna(subset=['Translated_Review'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)


In [4]:
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
filtered_apps = apps_df[apps_df['Reviews']>1000]

In [5]:
top_categories = filtered_apps['Category'].value_counts().head(5).index
filtered_apps = filtered_apps[filtered_apps['Category'].isin(top_categories)]

In [6]:
merged_df = pd.merge(filtered_apps, reviews_df, on='App', how='inner')
merged_df = merged_df.dropna(subset=['Sentiment'])
merged_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up,"If get free lives refill, continue accumulate ...",Positive,0.374411,0.556987
1,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up,My original rating 01/2015 5 Stars still holdi...,Positive,0.25,0.475
2,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up,"This good time passing game. However, I like l...",Positive,0.200926,0.437963
3,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up,"Fun first, spending two weeks level makes want...",Positive,0.183333,0.296825
4,Candy Crush Saga,GAME,4.4,22426677,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up,Please get rid amount pop ups love things holy...,Positive,0.319444,0.6


In [7]:
def categorize_rating(rating):
    if rating >= 4:
        return '4-5 stars'
    elif rating >= 3:
        return '3-4 stars'
    else:
        return '1-2 stars'
    

In [8]:
merged_df['Rating Group']= merged_df['Rating'].apply(categorize_rating)

In [9]:
sentiment_counts = merged_df.groupby(['Rating Group','Sentiment']).size().unstack().fillna(0)
sentiment_counts

Sentiment,Negative,Neutral,Positive
Rating Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3-4 stars,169,125,351
4-5 stars,9034,1949,17356


In [10]:
sentiment_plot_df = sentiment_counts.reset_index().melt(id_vars=["Rating Group"], var_name="Sentiment", value_name="Count")

In [11]:
fig = px.bar(
    sentiment_plot_df,
    x="Rating Group",
    y="Count",
    color="Sentiment",
    title="Sentiment Distribution by Rating Group",
    color_discrete_map={"Positive": "green", "Neutral": "gray", "Negative": "red"},
    text_auto=True,
    barmode="stack"
)

In [12]:
chart_path = "sentiment_distribution.html"
fig.write_html(chart_path)

In [13]:
display_html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sentiment Distribution Chart</title>
    <script>
        function openChart() {
            window.open("sentiment_distribution.html", "_blank");
        }
    </script>
</head>
<body>
    <h2>Sentiment Distribution Chart</h2>
    <button onclick="openChart()">Open Chart</button>
</body>
</html>
'''

In [14]:
with open("display_chart.html", "w") as file:
    file.write(display_html)

print("Files generated successfully: display_chart.html & sentiment_distribution.html")

Files generated successfully: display_chart.html & sentiment_distribution.html
