In [1]:
import numpy as np 
import pandas as pd
import plotly.express as px

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [2]:
df=pd.read_csv("/kaggle/input/google-play-store-apps/googleplaystore.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/google-play-store-apps/googleplaystore.csv'

# Initial Inspection

In [None]:
print(df.shape)
df.sample(6)

(10841, 13)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
7546,CM AppLock,TOOLS,4.1,843,3.6M,"100,000+",Free,0,Everyone,Tools,"February 25, 2016",2.0.0,4.0 and up
3034,PGA TOUR LIVE,SPORTS,3.4,1845,12M,"100,000+",Free,0,Everyone,Sports,"June 7, 2017",1.3.0,4.2 and up
4238,Text Free: WiFi Calling App,SOCIAL,4.2,83474,Varies with device,"5,000,000+",Free,0,Everyone,Social,"July 24, 2018",Varies with device,Varies with device
2853,Google Photos,PHOTOGRAPHY,4.5,10858538,Varies with device,"1,000,000,000+",Free,0,Everyone,Photography,"August 6, 2018",Varies with device,Varies with device
4206,H-Kakashi - theme Xperia™,PERSONALIZATION,4.6,621,5.4M,"50,000+",Free,0,Everyone,Personalization,"December 1, 2017",a.2,4.4 and up
7692,Evolution CP,FAMILY,3.6,13,4.7M,500+,Free,0,Everyone,Entertainment,"September 4, 2016",1.0.2,4.0.3 and up


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [None]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


# Data Cleaning

**Steps Perform**

1) Removed a known shifted row where category data was missing.

2) Converted Reviews, Price, and Installs to numeric types by removing symbols like +, ,, and $.

3) Converted 'M' (Megabytes) and 'k' (Kilobytes) into bytes. Handled 'Varies with device' as NaN.

4) Dropped duplicates and rows with missing Rating to ensure accurate visualizations.

In [None]:
#Faulty row
if '1.9' in df['Category'].unique():
    df = df[df['Category'] != '1.9'].copy()
    
#Cleaning 'Installs'
df['Installs'] = df['Installs'].astype(str).str.replace('+', '', regex=False)
df['Installs'] = df['Installs'].str.replace(',', '', regex=False)
df['Installs'] = pd.to_numeric(df['Installs'])

#Cleaning 'Price'
df['Price'] = df['Price'].astype(str).str.replace('$', '', regex=False)
df['Price'] = pd.to_numeric(df['Price'])

#Cleaning 'Reviews'
def clean_size(size):
    size_str = str(size)
    if 'M' in size_str:
        return float(size_str.replace('M', '')) * 1e6
    elif 'k' in size_str:
        return float(size_str.replace('k', '')) * 1e3
    elif 'Varies with device' in size_str:
        return np.nan
    else:
        try:
            return float(size_str)
        except ValueError:
            return np.nan

df['Size'] = df['Size'].apply(clean_size)

#Droping Duplicates
df.drop_duplicates(subset='App', keep='first', inplace=True)

#Droping rows with missing Rating
df_clean = df.dropna(subset=['Rating']).copy()

# EDA using Plotly

**Q1) What are the top 10 most popular App Categories?**

Family (1,608), Game (912), and Tools (718) are the most popular categories.


**Q2) How are App Ratings distributed?**

Most apps have high ratings. The average rating is 4.17, with a peak around 4.3 – 4.5.


**Q3) What is the proportion of Free vs Paid apps?**

The store is heavily dominated by Free apps (~92.6%).


**Q4) Does the App Size affect the number of Installs?**

There is a weak positive correlation (0.13). While massive apps get installs, smaller apps are just as capable of being "hits."


**Q5) What is the relationship between Rating, Reviews, and Installs?**

High ratings act as a gateway; as ratings improve, the density of reviews and installs increases significantly.


**Q6) How does Content Rating affect App Ratings?**

Adults only 18+ apps have the highest average rating (4.30), while Mature 17+ apps have the lowest (4.12).


**Q7) Which Categories have the most Total Installs?**

GAME: ~13.87 Billion installs
COMMUNICATION: ~11.03 Billion installs
TOOLS: ~7.99 Billion installs
PRODUCTIVITY: ~5.79 Billion installs
SOCIAL: ~5.48 Billion installs


**Q8) How "Fresh" are the apps? (Updates over Time)**

2018 (Peak Year): Over 5,450 apps (approx. 66% of the dataset) were updated in 2018.
2017: ~1,426 apps.
Before 2016: Very few apps (less than 15% of the total) had not been updated for more than two years.


**Q9) Which Genres have the highest average Ratings?**

Puzzle: 4.37 / 5.0
Art & Design: 4.36 / 5.0
Books & Reference: 4.34 / 5.0
Personalization: 4.33 / 5.0
Education: 4.29 / 5.0

In [None]:
#1
top_cats = df_clean['Category'].value_counts().head(10).reset_index()
top_cats.columns = ['Category', 'Count']

fig1 = px.bar(top_cats, x='Category', y='Count', 
              title='Q1) Top 10 App Categories by Count',
              color='Count', template='plotly_dark')
fig1.show()


#2
fig2 = px.histogram(df_clean, x='Rating', nbins=30, 
                    title='Q2) Distribution of App Ratings',
                    marginal='box', # Adds a box plot at the top
                    template='plotly_white')
fig2.show()


#3
fig3 = px.pie(df_clean, names='Type', 
              title='Q3) Proportion of Free vs Paid Apps',
              hole=0.4)
fig3.show()



#4
fig4 = px.scatter(df_clean, x='Size', y='Installs', color='Type',
                  log_x=True, log_y=True,
                  title='Q4) App Size vs Installs (Log Scale)',
                  hover_data=['App'])
fig4.show()



#5
df_clean['Reviews'] = pd.to_numeric(df_clean['Reviews'], errors='coerce')
df_clean['Installs'] = pd.to_numeric(df_clean['Installs'], errors='coerce')
df_clean = df_clean.dropna(subset=['Reviews', 'Installs'])

df_clean['Log_Reviews'] = np.log1p(df_clean['Reviews'])
df_clean['Log_Installs'] = np.log1p(df_clean['Installs'])

fig5 = px.scatter_3d(df_clean, x='Rating', y='Log_Reviews', z='Log_Installs',
                     color='Type', opacity=0.7,
                     title='Q5: 3D Relationship: Rating vs Reviews vs Installs',
                     labels={'Log_Reviews': 'Log(Reviews)', 'Log_Installs': 'Log(Installs)'})
fig5.show()

In [None]:
#6
fig6 = px.box(df_clean, x='Content Rating', y='Rating', color='Content Rating',
              title='Q6) App Rating Distribution by Content Rating',
              points="outliers")
fig6.show()



#7
cat_installs = df_clean.groupby('Category')['Installs'].sum().reset_index()
top_cat_installs = cat_installs.sort_values(by='Installs', ascending=False).head(10)

fig7 = px.bar(top_cat_installs, x='Category', y='Installs',
              title='Q7) Total Installs by Category',
              color='Installs',
              template='plotly_dark')
fig7.show()



#8
df_clean['Last Updated'] = pd.to_datetime(df_clean['Last Updated'])
df_clean['Update_Year'] = df_clean['Last Updated'].dt.year
updates_by_year = df_clean['Update_Year'].value_counts().reset_index()
updates_by_year.columns = ['Year', 'Count']
updates_by_year = updates_by_year.sort_values('Year')

fig8 = px.line(updates_by_year, x='Year', y='Count', markers=True,
               title='Q8) Number of App Updates Over Years',
               template='plotly_white')
fig8.show()



#9
genre_stats = df_clean.groupby('Genres')['Rating'].agg(['mean', 'count']).reset_index()
reliable_genres = genre_stats[genre_stats['count'] > 50].sort_values(by='mean', ascending=False).head(10)

fig9 = px.bar(reliable_genres, x='Genres', y='mean', 
               title='Q9) Top 10 Highest Rated Genres (with >50 Apps)',
               color='mean', 
               labels={'mean': 'Average Rating'},
               range_y=[4, 4.5], # Zooming in to see differences
               template='plotly_white')
fig9.show()

# Feature Selection & Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

features = ['Category', 'Type', 'Price', 'Size', 'Installs', 'Content Rating']
X = df_clean[features].copy()
y = df_clean['Rating']

#Encoding Categorical Data
le = LabelEncoder()
for col in ['Category', 'Type', 'Content Rating']:
    X[col] = le.fit_transform(X[col].astype(str))

#Filling any remaining NaNs in Size (using the median)
X['Size'] = X['Size'].fillna(X['Size'].median())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training (3 Models)
1. Linear Regression
2. Random Forest Regressor
3. XgBoost Regressor

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

print(f"Linear Regression MAE: {mae:.2f}") 
# An MAE of 0.35 means our predictions are usually within 0.35 stars of the real rating.

Linear Regression MAE: 0.39


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

X['Size'] = X['Size'].fillna(X['Size'].median())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in ['Category', 'Type', 'Content Rating']:
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_preds)

print(f"Random Forest MAE: {rf_mae:.4f}")

Random Forest MAE: 0.3876


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

for col in ['Category', 'Type', 'Content Rating']:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

xgb_preds = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, xgb_preds)

print(f"XGBoost MAE: {mae:.4f}")

XGBoost MAE: 0.3686


# Summary
This Notebook provides an exploratory analysis and predictive modeling of the Google Play Store dataset

Data Preparation: The analysis starts by cleaning a dataset of 10,841 apps. This includes removing faulty rows, converting columns (like 'Installs' and 'Price') to numeric formats, and handling app size units (MB/kB).

Exploratory Data Analysis (EDA): Using Plotly, the notebook answers several key questions.

Predictive Modeling: The notebook concludes by training machine learning models, including Linear Regression, Random Forest and XGBoost, to predict app ratings.