# In [None]:
# Google Play Store Apps Analysis

import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Read the CSV at data/googleplaystore.csv into the working DataFrame.
df = pd.read_csv("data/googleplaystore.csv")

# Sanity check: print the (rows, columns) shape, then the first rows.
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
preview = df.head()
print(preview)

# ----------------------------
# Data Cleaning
# ----------------------------
# Remove rows that are exact duplicates of an earlier row.
df.drop_duplicates(inplace=True)

# Drop rows with missing values in the columns the analysis below relies on.
df.dropna(subset=['Category', 'Rating', 'Installs', 'Type', 'Price'], inplace=True)

# Strip the formatting characters from Installs ("+" and ",") and Price ("$").
# '+' and '$' are regex metacharacters, so regex=False is passed explicitly to
# keep the replacement literal whatever the installed pandas defaults to.
# astype(str) keeps the .str accessor usable even if a column was already
# parsed as numeric by read_csv.
installs_text = (
    df['Installs']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
price_text = df['Price'].astype(str).str.replace('$', '', regex=False)

# Parse to numbers; anything that is still not numeric (e.g. a malformed or
# column-shifted row) becomes NaN instead of aborting the whole script.
df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
df['Price'] = pd.to_numeric(price_text, errors='coerce')

# Discard the rows that could not be parsed, then apply the final dtypes:
# Installs as int, Price as float.
df.dropna(subset=['Installs', 'Price'], inplace=True)
df['Installs'] = df['Installs'].astype(int)
df['Price'] = df['Price'].astype(float)

# ----------------------------
# Analysis
# ----------------------------

# The charts in this script are saved under results/. Create the folder up
# front so the first savefig does not raise FileNotFoundError when the
# directory is missing (e.g. on a fresh checkout).
os.makedirs("results", exist_ok=True)

# 1. Top Categories by Number of Apps
# value_counts() orders categories by frequency (most frequent first), so
# head(10) keeps the ten categories with the most apps.
top_categories = df['Category'].value_counts().head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_categories.index, y=top_categories.values, palette="viridis")
plt.title("Top Categories by Number of Apps")
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()
# Save before show().
plt.savefig("results/top_categories.png")
plt.show()

# 2. Average Rating per Category
# Mean rating of the apps in each category, keeping the ten highest means.
mean_rating_by_category = df.groupby('Category')['Rating'].mean()
avg_rating = mean_rating_by_category.sort_values(ascending=False).head(10)

rating_categories = avg_rating.index
rating_means = avg_rating.values

plt.figure(figsize=(10, 5))
sns.barplot(x=rating_categories, y=rating_means, palette="coolwarm")
plt.xticks(rotation=45)  # tilt the category labels
plt.title("Top 10 Categories by Average Rating")
plt.tight_layout()
# Write the PNG first, then display the figure.
plt.savefig("results/top_ratings.png")
plt.show()

# 3. Free vs Paid Apps Count
# Pie chart of how many apps fall under each value of the Type column.
type_counts = df['Type'].value_counts()

plt.figure(figsize=(5, 5))
type_counts.plot.pie(colors=['skyblue', 'lightgreen'], autopct='%1.1f%%')
# Blank out the y-axis label on the pie.
plt.ylabel("")
plt.title("Free vs Paid Apps")
plt.savefig("results/free_vs_paid.png")
plt.show()

# 4. Paid Apps Revenue Potential
# Sum of the Price column per category over rows whose Type is 'Paid',
# keeping the ten largest totals.
# NOTE(review): this adds up list prices only and does not weight by
# Installs -- confirm that is the intended "revenue potential" metric.
is_paid = df['Type'] == 'Paid'
price_total_by_category = df[is_paid].groupby('Category')['Price'].sum()
revenue = price_total_by_category.sort_values(ascending=False).head(10)

revenue_categories = revenue.index
revenue_totals = revenue.values

plt.figure(figsize=(10, 5))
sns.barplot(x=revenue_categories, y=revenue_totals, palette="magma")
plt.xticks(rotation=45)  # tilt the category labels
plt.title("Top Categories by Paid App Price Revenue Potential")
plt.tight_layout()
# Write the PNG first, then display the figure.
plt.savefig("results/revenue_potential.png")
plt.show()
