In [25]:
pip install notebook


Note: you may need to restart the kernel to use updated packages.


In [27]:
#Task 1
# Grouped bar chart of average rating and total reviews for the ten categories
# with the most installs, restricted to apps rated >= 4.0, sized >= 10 MB and
# last updated in January. The chart is only rendered between 15:00 and 17:00 IST.
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

df = pd.read_csv(r"C:\Users\rraks\OneDrive\Desktop\datasets\googleplaystore.csv")

df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Normalise Size to megabytes. The unit suffix is inspected *before* it is
# stripped: values ending in 'k' are kilobytes and are divided by 1024, values
# ending in 'M' are kept as-is. Anything else (e.g. 'Varies with device')
# fails numeric parsing and becomes NaN.
size_text = df['Size'].astype(str).str.strip()
is_kb = size_text.str.endswith('k')
size_value = pd.to_numeric(size_text.str.replace(r'[Mk]$', '', regex=True), errors='coerce')
df['Size'] = size_value.where(~is_kb, size_value / 1024)

# regex=False: '+' is a regex metacharacter, so strip it as a literal
# regardless of the pandas version's default for `regex`.
df['Installs'] = df['Installs'].str.replace('+', '', regex=False).str.replace(',', '', regex=False)
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# App-level filter: well rated, at least 10 MB, last updated in January (any year).
df_filtered = df[
    (df['Rating'] >= 4.0) &
    (df['Size'] >= 10) &
    (df['Last Updated'].dt.month == 1)
]

# Ten categories with the highest total installs among the filtered apps.
top_categories = (
    df_filtered.groupby('Category')['Installs']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .index
)

df_top = df_filtered[df_filtered['Category'].isin(top_categories)]

stats = df_top.groupby('Category').agg({
    'Rating': 'mean',
    'Reviews': 'sum'
}).reset_index()

ist = pytz.timezone("Asia/Kolkata")
current_time_ist = datetime.now(ist)
current_hour = current_time_ist.hour

if 15 <= current_hour < 17:
    x = stats['Category']
    x_indices = range(len(x))

    # Ratings are bounded by the 0-5 scale while review totals are sums over
    # many apps; a shared y-axis would flatten the rating bars, so each series
    # gets its own axis.
    fig, ax_rating = plt.subplots(figsize=(12, 6))
    ax_reviews = ax_rating.twinx()

    rating_bars = ax_rating.bar(
        [i - 0.2 for i in x_indices], stats['Rating'],
        width=0.4, color='tab:blue', label='Average Rating'
    )
    review_bars = ax_reviews.bar(
        [i + 0.2 for i in x_indices], stats['Reviews'],
        width=0.4, color='tab:orange', label='Total Reviews'
    )

    ax_rating.set_xticks(list(x_indices))
    ax_rating.set_xticklabels(x, rotation=45, ha='right')
    ax_rating.set_xlabel("App Category")
    ax_rating.set_ylabel("Average Rating")
    ax_reviews.set_ylabel("Total Reviews")
    ax_rating.set_title("Average Rating and Total Reviews for Top 10 Categories")
    # One combined legend for the bars drawn on both axes.
    ax_rating.legend(handles=[rating_bars, review_bars], loc='upper left')
    fig.tight_layout()
    plt.show()
else:
    print("Graph not shown due to time restriction.")


Graph not shown due to time restriction.


In [29]:
#Task 2
# Choropleth of total installs for the five most-installed categories whose
# name does not start with A, C, G or S. Relies on `df` (with numeric
# 'Installs') from the Task 1 cell. Rendered only between 18:00 and 20:00 IST.

import pandas as pd
import plotly.express as px
from datetime import datetime
import pytz

# na=False: a missing Category yields False rather than NaN, so the `~`
# negation below always operates on a clean boolean mask.
df_cat = df[~df['Category'].str.startswith(('A', 'C', 'G', 'S'), na=False)]

cat_installs = df_cat.groupby('Category')['Installs'].sum().reset_index()

# .copy() so the column assignments below write to an independent frame.
top5 = cat_installs.sort_values(by='Installs', ascending=False).head(5).copy()

# 1 when the category's total installs exceed one million, else 0.
top5['Highlight'] = (top5['Installs'] > 1_000_000).astype(int)

# NOTE(review): no country column is read from `df`; the countries below are
# paired with the top categories purely by position, so the geography shown
# on the map is illustrative only - confirm this is intended.
country_map = {
    'India': 'IND',
    'United States': 'USA',
    'Brazil': 'BRA',
    'Russia': 'RUS',
    'Australia': 'AUS'
}

top5['Country'] = list(country_map.keys())[:len(top5)]
top5['Code'] = list(country_map.values())[:len(top5)]

ist = pytz.timezone("Asia/Kolkata")
current_time_ist = datetime.now(ist)
hour = current_time_ist.hour

if 18 <= hour < 20:
    fig = px.choropleth(
        top5,
        locations="Code",
        color="Installs",
        hover_name="Category",
        title="Choropleth Map: Global Installs by Top 5 Categories",
        color_continuous_scale="Blues"
    )
    fig.update_layout(geo=dict(showframe=False, showcoastlines=True), title_x=0.5)
    fig.show()
else:
    print("Graph not available due to time restriction (6 PM – 8 PM IST).")


Graph not available due to time restriction (6 PM – 8 PM IST).


In [31]:
#Task 3
# Dual-axis chart of average installs (bars) and average revenue (line) per app
# 'Type' within the three most-installed categories, after the filters below.
# Relies on `df` from the Task 1 cell. Rendered only between 13:00 and 14:00 IST.
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

# Every cleaning step goes through astype(str) first so the cell can be re-run
# on columns that have already been converted to numbers.
df['Price'] = df['Price'].astype(str).str.replace('$', '', regex=False)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0)

df['Installs'] = df['Installs'].astype(str)
df['Installs'] = df['Installs'].str.replace('+', '', regex=False)
df['Installs'] = df['Installs'].str.replace(',', '', regex=False)
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

df['Size'] = pd.to_numeric(df['Size'], errors='coerce')

# Android Ver: keep only the leading "major[.minor]" number. Calling
# pd.to_numeric on the raw text would coerce any value carrying trailing text
# (e.g. "4.1 and up") to NaN, and NaN never satisfies the `> 4.0` filter below.
# A three-part version such as "4.0.3" is read as 4.0.
android_text = (
    df['Android Ver'].astype(str)
    .str.replace('Varies with device', '0', regex=False)
)
df['Android Ver'] = pd.to_numeric(
    android_text.str.extract(r'^\s*(\d+(?:\.\d+)?)', expand=False),
    errors='coerce'
)

df['Revenue'] = df['Price'] * df['Installs']

# NOTE(review): an app whose Price is 0 has Revenue 0 and therefore cannot
# pass the Revenue >= 10000 condition, so the 'Type' groups further down can
# only contain priced apps - confirm this matches the intended comparison.
df_filtered = df[
    (df['Installs'] >= 10000) &
    (df['Revenue'] >= 10000) &
    (df['Android Ver'] > 4.0) &
    (df['Size'] >= 15) &
    (df['Content Rating'] == 'Everyone') &
    (df['App'].str.len() <= 30)
]

# Three categories with the highest total installs among the filtered apps.
top3 = (
    df_filtered.groupby('Category')['Installs']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .index
)

df_top3 = df_filtered[df_filtered['Category'].isin(top3)]

stats = df_top3.groupby('Type').agg({
    'Installs': 'mean',
    'Revenue': 'mean'
}).reset_index()

ist = pytz.timezone("Asia/Kolkata")
current_time_ist = datetime.now(ist)
hour = current_time_ist.hour

if 13 <= hour < 14:
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax1.bar(stats['Type'], stats['Installs'], width=0.4)
    ax1.set_ylabel('Average Installs')
    # Revenue is drawn on a second y-axis sharing the same x positions.
    ax2 = ax1.twinx()
    ax2.plot(stats['Type'], stats['Revenue'], linewidth=2, marker='o')
    ax2.set_ylabel('Average Revenue')
    plt.title('Average Installs vs Revenue for Free vs Paid Apps (Top 3 Categories)')
    plt.show()
else:
    print("Graph not available due to time restriction (1 PM – 2 PM IST).")


Graph not available due to time restriction (1 PM – 2 PM IST).


In [34]:
# Task 4
# Time series of monthly total installs per category, shading every
# month-to-month segment whose installs grew by more than 20%.
# Relies on `df`, `plt`, `pytz` and `datetime` from the earlier cells.
# Rendered only between 18:00 and 21:00 IST.
import matplotlib.dates as mdates

# Drop apps whose name starts with x, y or z in either case.
df4 = df[~df['App'].str.startswith(('x', 'y', 'z', 'X', 'Y', 'Z'), na=False)]
# na=False keeps the mask strictly boolean when a Category is missing.
df4 = df4[df4['Category'].str.startswith(('E','C','B'), na=False)]
# Drop apps whose name contains the letter 's' in either case.
df4 = df4[~df4['App'].str.contains('S', case=False, na=False)]
df4 = df4[df4['Reviews'] > 500]

df4 = df4.dropna(subset=['Last Updated'])
# Collapse each update date to the first day of its month.
df4['YearMonth'] = df4['Last Updated'].dt.to_period('M').dt.to_timestamp()

monthly = df4.groupby(['YearMonth','Category'])['Installs'].sum().reset_index()
pivot = monthly.pivot(index='YearMonth', columns='Category', values='Installs').fillna(0)
# Months with no data were filled with 0 above, so growth out of an empty
# month evaluates to +inf here and counts as exceeding the threshold.
pct_change = pivot.pct_change().fillna(0)

threshold = 0.20
highlight_months = (pct_change > threshold)

translations = {'Beauty':'ब्यूटी','Business':'வணிகம்','Dating':'Partnersuche'}
# Look the category up in title case so the translation applies whether the
# column holds 'Beauty' or 'BEAUTY'; unmatched names are kept unchanged.
pivot = pivot.rename(columns={k:translations.get(str(k).title(),k) for k in pivot.columns})
highlight_months = highlight_months.rename(columns={k:translations.get(str(k).title(),k) for k in highlight_months.columns})

ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)
hour = now.hour

if 18 <= hour < 21:
    plt.figure(figsize=(14,7))
    # The x values are the same for every category, so compute them once.
    x = pivot.index.to_pydatetime()
    for col in pivot.columns:
        (line,) = plt.plot(pivot.index, pivot[col], label=col)
        mask = highlight_months[col].reindex(pivot.index, fill_value=False).values
        if mask.any():
            y = pivot[col].values
            for i in range(1,len(x)):
                if mask[i]:
                    # Shade the segment ending at month i in the line's own
                    # colour so each shaded region is attributable to its category.
                    plt.fill_between(x[i-1:i+1], y[i-1:i+1], alpha=0.3, color=line.get_color())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.xticks(rotation=45)
    plt.xlabel('Month')
    plt.ylabel('Total Installs')
    plt.title('Total Installs Over Time by Category')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Graph not available due to time restriction (6 PM – 9 PM IST).")


Graph not available due to time restriction (6 PM – 9 PM IST).


In [None]:
# Task 5
# Bubble chart of app size vs rating (bubble area = installs) for a fixed set
# of categories, after the review/rating/install/subjectivity filters below.
# Relies on `df` from the earlier cells. Rendered only between 17:00 and 19:00 IST.
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime
import pytz

cats = ['Game','Beauty','Business','Comics','Communication','Dating','Entertainment','Social','Event']
# Compare in upper case so the selection works whether the column stores
# 'Game' or 'GAME'.
cats_upper = {c.upper() for c in cats}
df5 = df[df['Category'].astype(str).str.upper().isin(cats_upper)]

df5 = df5[~df5['App'].str.contains('S', case=False, na=False)]
# .copy() so the column assignment below writes to an independent frame.
df5 = df5[df5['Reviews'] > 500].copy()
if 'Sentiment_Subjectivity' not in df5.columns:
    # NOTE(review): when the column is absent it is filled with seeded random
    # numbers in [0, 1). These are placeholders, not measured sentiment, and the
    # `> 0.5` filter below then keeps an arbitrary subset of rows - replace with
    # real subjectivity scores if they are available.
    df5['Sentiment_Subjectivity'] = np.random.RandomState(42).rand(len(df5))
df5 = df5[pd.to_numeric(df5['Rating'], errors='coerce') > 3.5]
df5 = df5[df5['Installs'] > 50000]
df5 = df5[df5['Sentiment_Subjectivity'] > 0.5].copy()

# Display names: translated for three categories, original name otherwise.
# The lookup uses the title-cased name so 'BEAUTY' and 'Beauty' both match.
category_translations = {
    'Beauty':'ब्यूटी',
    'Business':'வணிகம்',
    'Dating':'Partnersuche'
}
df5['Category_Display'] = df5['Category'].map(
    lambda c: category_translations.get(str(c).title(), c)
)

fig = px.scatter(
    df5,
    x='Size',
    y='Rating',
    size='Installs',
    color='Category_Display',
    hover_data=['App','Installs','Reviews'],
    size_max=60
)

# Override the automatic colour of the Game trace with pink.
for trace in fig.data:
    if str(trace.name).lower() == 'game':
        trace.marker.color = 'pink'


ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)
hour = now.hour

if 17 <= hour < 19:
    fig.update_layout(title='Size vs Rating (Bubble size = Installs)')
    fig.show()
else:
    print("Graph not available due to time restriction (5 PM – 7 PM IST).")


Graph not available due to time restriction (5 PM – 7 PM IST).


In [38]:
# Task 6
# Stacked area chart of monthly installs per category (categories starting
# with T or P), outlining segments whose installs grew by more than 25%
# month over month. Relies on `df`, `plt`, `mdates`, `pytz` and `datetime`
# from the earlier cells. Rendered only between 16:00 and 18:00 IST.
import numpy as np  # used below; imported here so the cell does not depend on the Task 5 cell having run

df6 = df[pd.to_numeric(df['Rating'], errors='coerce') >= 4.2]
# Drop apps whose name contains a digit.
df6 = df6[~df6['App'].str.contains(r'\d', regex=True, na=False)]
# na=False keeps the mask strictly boolean when a Category is missing.
df6 = df6[df6['Category'].str.startswith(('T','P'), na=False)]
df6 = df6[df6['Reviews'] > 1000]
df6 = df6[(df6['Size'] >= 20) & (df6['Size'] <= 80)]

df6 = df6.dropna(subset=['Last Updated'])
# Collapse each update date to the first day of its month.
df6['YearMonth'] = df6['Last Updated'].dt.to_period('M').dt.to_timestamp()

monthly6 = df6.groupby(['YearMonth','Category'])['Installs'].sum().reset_index()
pivot6 = monthly6.pivot(index='YearMonth', columns='Category', values='Installs').fillna(0)
pct_change6 = pivot6.pct_change().fillna(0)

display_names = {'Travel & Local':'Voyage & Local','Productivity':'Productividad','Photography':'写真'}
# Normalise each column name before the lookup so both 'Travel & Local' and a
# 'TRAVEL_AND_LOCAL'-style spelling resolve to the same key; unmatched names
# are kept unchanged.
rename_map = {
    c: display_names.get(str(c).replace('_AND_', ' & ').replace('_', ' ').title(), c)
    for c in pivot6.columns
}
pivot6 = pivot6.rename(columns=rename_map)
# Apply the same renaming to the growth table; otherwise the per-column lookup
# in the plotting loop would miss every renamed category and skip its highlights.
pct_change6 = pct_change6.rename(columns=rename_map)

ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)
hour = now.hour

if 16 <= hour < 18:
    categories = pivot6.columns
    x = pivot6.index.to_pydatetime()
    plt.figure(figsize=(14,8))
    # Running top edge of the stack; each category is drawn on top of it.
    # NOTE(review): the plotted values are per-month sums stacked across
    # categories; nothing is accumulated over time - confirm this is the
    # intended meaning of "Cumulative" in the labels below.
    cum = np.zeros(len(x))
    for col in categories:
        y = pivot6[col].values
        plt.fill_between(x, cum, cum + y, label=col)
        cum = cum + y
        # Plain boolean array so the integer index below is positional.
        if col in pct_change6.columns:
            highlights = (pct_change6[col] > 0.25).to_numpy()
        else:
            highlights = np.zeros(len(x), dtype=bool)
        for i in range(1,len(x)):
            if highlights[i]:
                # Outline this category's band over the segment ending at month i.
                plt.fill_between(x[i-1:i+1], cum[i-1:i+1]-y[i-1:i+1], cum[i-1:i+1], facecolor='none', edgecolor='k', linewidth=0.5, alpha=0.3)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.xticks(rotation=45)
    plt.xlabel('Month')
    plt.ylabel('Cumulative Installs')
    plt.title('Stacked Area Chart of Cumulative Installs by Category')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Graph not available due to time restriction (4 PM – 6 PM IST).")


Graph not available due to time restriction (4 PM – 6 PM IST).
