Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime
import pytz
import random

Load and Clean Data

In [2]:
df = pd.read_csv(r'C:\Users\vishal\Desktop\Dataset\Play Store Data.csv') 
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
df = df.dropna(subset=['Installs', 'Category'])
df['Installs'] = df['Installs'].str.replace('[+,]', '', regex=True)
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

Time Visibility Function

In [4]:
# Function to check if current time is between 6 PM and 8 PM IST
def is_visible_time(start_hour=18, end_hour=20):
    ist = pytz.timezone('Asia/Kolkata')
    current_time = datetime.now(ist).hour
    return start_hour <= current_time < end_hour

Apply Filters

In [5]:
# Filter categories not starting with A, C, G, or S
df = df[~df['Category'].str.startswith(tuple("ACGS"))]

# Group by Category for total installs
category_installs = df.groupby('Category')['Installs'].sum().reset_index()

# Get top 5 categories by installs
top5 = category_installs.sort_values(by='Installs', ascending=False).head(5)
top_categories = top5['Category'].tolist()

# Filter dataframe to only top 5 categories
filtered_df = df[df['Category'].isin(top_categories)]

Assign Countries & Group

In [6]:
# Assign dummy countries to simulate global spread
countries = ['United States', 'India', 'Germany', 'Brazil', 'Australia']
filtered_df['Country'] = filtered_df['Category'].apply(lambda x: random.choice(countries))
# Group by category and country
grouped = filtered_df.groupby(['Category', 'Country'])['Installs'].sum().reset_index()
grouped['Highlight'] = grouped['Installs'] > 1_000_000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Country'] = filtered_df['Category'].apply(lambda x: random.choice(countries))


Display Choropleth (Only between 6 PM and 8 PM IST)

In [7]:
# Display only during 6 PM to 8 PM IST
if is_visible_time():
    fig = px.choropleth(
        grouped,
        locations="Country",
        locationmode="country names",
        color="Installs",
        hover_name="Category",
        animation_frame="Category",
        color_continuous_scale="Turbo",
        title="Top 5 Global App Categories (Filtered) by Installs"
    )

    fig.update_layout(
        geo=dict(showframe=True, showcoastlines=True),
        title_font_size=18
    )
    
    fig.show()
else:
    print("Choropleth Map is only visible between 6 PM and 8 PM IST.")
