In [None]:
# Cell 1: Setup
# Installs the third-party packages used by the Groq cells into the kernel's
# environment: the Groq SDK (quiet, upgraded to the latest release) and
# python-dotenv (used to load the API key from a .env file).
%pip install -q -U groq
%pip install python-dotenv

# NOTE(review): `google.genai` is imported below, but this cell does not install
# it and none of the cells visible in this notebook reference `genai` or `types`.
# On an environment without that package the import raises ImportError and the
# rest of the notebook cannot run - confirm whether these two imports are still needed.
import pandas as pd
from google import genai
from google.genai import types
import json
import os

# Reached only if every install and import above succeeded.
print("Libraries reinstalled and imported successfully!")

In [None]:
# Cell 2: Securely Connect Groq
# Builds the shared `client` object that the later Groq cells call.
import os
from dotenv import load_dotenv
from groq import Groq

# Load key/value pairs from a local .env file (if one exists) into the
# process environment, so the key never has to be typed into the notebook.
load_dotenv()

# Read the key back from the environment.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail fast when the key is missing. Only printing a message here would leave
# `client` undefined (or stale from a previous run of this cell), and the
# notebook would then break later with an unrelated-looking NameError.
if not GROQ_API_KEY:
    raise RuntimeError("❌ Error: GROQ_API_KEY not found. Check your .env file!")

client = Groq(api_key=GROQ_API_KEY)
print("✅ Groq Client is ready and key is hidden!")

In [None]:
# Cell 3: Data Ingestion
# Reads the Play Store export and leaves `df` with three columns:
# the original Category, Released parsed as a datetime, and the derived Year.
print("Loading dataset...please wait.")

# Only the two columns needed downstream are read; skipping the rest saves RAM.
df = pd.read_csv('Google-Playstore.csv', usecols=['Released', 'Category'])

# Cleaning, step by step:
#   1. drop rows missing either field,
#   2. parse the release date (anything unparseable becomes NaT),
#   3. drop the rows whose date could not be parsed,
#   4. derive the release year.
df = df.dropna(subset=['Released', 'Category'])
df = df.assign(Released=pd.to_datetime(df['Released'], errors='coerce'))
df = df.dropna(subset=['Released'])
df = df.assign(Year=df['Released'].dt.year)

print(f"Success! Loaded {len(df):,} apps.")
df.head()

In [None]:
# Cell 4: AI Market Clustering
# Asks the LLM to assign every Play Store category to one of six market
# clusters, then writes the result into df['Cluster'].
import json

# Single source of truth for the cluster names: used to build the prompt and
# to validate what the model sends back.
MARKET_CLUSTERS = ['Entertainment', 'Productivity', 'Lifestyle', 'Social', 'Education', 'Tools']

# Getting unique categories from the dataframe
unique_categories = df['Category'].unique().tolist()

prompt = f"""
I have these Google Play Store categories: {unique_categories}
Group them into exactly {len(MARKET_CLUSTERS)} broad 'Market Clusters': {', '.join(MARKET_CLUSTERS)}.
Return ONLY a JSON dictionary where the KEY is the original category and the VALUE is the cluster name.
Example format: {{"Dating": "Social", "Finance": "Productivity"}}
"""

print("Asking Groq to cluster categories... (This is ultra-fast)")

try:
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a data assistant that only outputs valid JSON. Do not include any intro or outro text."
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama-3.3-70b-versatile",
        response_format={"type": "json_object"}
    )

    # Converting the AI response string into a Python dictionary
    raw_map = json.loads(chat_completion.choices[0].message.content)

    # JSON mode guarantees valid JSON, not the requested shape. A wrapped or
    # nested object would silently map every category to 'Other', so reject it.
    if not isinstance(raw_map, dict) or not all(isinstance(v, str) for v in raw_map.values()):
        raise ValueError("Expected a flat JSON object of category -> cluster name.")

    # Cluster names the model invented (anything outside the six requested) are
    # folded into 'Other', the same bucket used for categories it left out.
    unexpected = sorted({v.strip() for v in raw_map.values()} - set(MARKET_CLUSTERS))
    if unexpected:
        print(f"Note: unrequested cluster names mapped to 'Other': {unexpected}")
    cluster_map = {
        category: (name.strip() if name.strip() in MARKET_CLUSTERS else 'Other')
        for category, name in raw_map.items()
    }

    # Applying the AI's mapping to the dataframe
    df['Cluster'] = df['Category'].map(cluster_map).fillna('Other')

    print("AI Clustering complete!")
    # Show a sample to verify
    print(df[['Category', 'Cluster']].drop_duplicates().head(10))

except Exception as e:
    print(f"Error during AI clustering: {e}")
    print("Check your Groq API key or rate limits.")
    # Re-raise so "Run All" stops here: the following cells need df['Cluster'],
    # and continuing would either fail with a KeyError or reuse a stale column.
    raise

In [None]:
# Cell 5: The "Race" Logic
# Turns per-app rows into a running total of apps per cluster per year and
# saves it in long format (date, name, category, value).

# Step 1 - tally the apps released in each (Year, Cluster) pair
yearly_growth = (
    df.groupby(['Year', 'Cluster'])
    .size()
    .reset_index(name='New_Apps')
)

# Step 2 - one row per Year, one column per Cluster; missing pairs count as 0
pivot_df = yearly_growth.pivot(index='Year', columns='Cluster', values='New_Apps').fillna(0)

# Step 3 - running total down the years
cumsum_df = pivot_df.cumsum()

# Step 4 - back to long format for D3.js/Flourish, adding the date string
# (January 1st of each year) and a category column that mirrors the name
final_df = cumsum_df.reset_index().melt(id_vars='Year', var_name='name', value_name='value')
final_df = final_df.assign(
    date=pd.to_datetime(final_df['Year'], format='%Y').dt.strftime('%Y-%m-%d'),
    category=final_df['name'],
)

# Only the modern era (2008 onward) is kept; earlier years still contribute
# to the running totals because the filter is applied after the cumsum.
final_df = final_df.loc[final_df['Year'] >= 2008]

final_df[['date', 'name', 'category', 'value']].to_csv('playstore_race_ready.csv', index=False)
print("File 'playstore_race_ready.csv' is ready for your animation!")

In [None]:
# Cell 6: AI Storyteller
# Asks the LLM for a one-sentence headline about the leading clusters in the
# final year of the chart.

# Use the last year actually present in the data instead of a hard-coded one,
# so the headline always matches where the chart ends.
if final_df.empty:
    raise ValueError("final_df is empty - nothing to write a headline about.")
latest_year = int(final_df['Year'].max())

# Finding the top 3 clusters (by cumulative app count) in the most recent year
top_3_latest = final_df[final_df['Year'] == latest_year].nlargest(3, 'value')['name'].tolist()
# Legacy name kept so any other cell that still refers to it keeps working.
top_3_2024 = top_3_latest

# Build the prompt from the computed values rather than fixed text.
prompt = (
    f"In {latest_year}, the top 3 Google Play clusters were "
    f"[{', '.join(map(str, top_3_latest))}]. "
    f"Write a 1-sentence 'Breaking News' headline for a chart ending in {latest_year}"
)

print("Asking Groq for a headline...")
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a data journalist. Write short, punchy headlines."
        },
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama-3.3-70b-versatile",
    temperature=0.7,
)

headline = chat_completion.choices[0].message.content
print(f"\nAI Caption: {headline}")

In [None]:
# Cell 7: Final Pivot for Flourish
# Reshapes the long race data into a wide table: one row per (name, category)
# and one column per date.
wide_by_date = final_df.pivot(
    index=['name', 'category'],
    columns='date',
    values='value',
)

# Turn name/category back into ordinary columns so they are written to the CSV
flourish_df = wide_by_date.reset_index()

# Written to its own file, separate from the long-format export
flourish_df.to_csv('playstore_FLOURISH_FINAL.csv', index=False)

print("Format fixed! Download 'playstore_FLOURISH_FINAL.csv' for Flourish.")
flourish_df.head()