# Google Play Store Apps - Data Analysis Project
## Notebook 1: Data Loading and Exploration

### 1. Import Libraries

In [1]:
# Standard library
import warnings
from datetime import datetime

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# NOTE(review): this silences every warning category for the whole notebook
# (including deprecation notices); narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Set display options: never truncate columns, cap printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# The 'seaborn-v0_8-*' style names are only available in matplotlib >= 3.6.
plt.style.use('seaborn-v0_8-darkgrid')



### 2. Load Datasets

In [2]:
# Read both raw CSV files; the paths are relative to the working directory.
df_apps = pd.read_csv("googleplaystore.csv")                  # main apps table
df_reviews = pd.read_csv("googleplaystore_user_reviews.csv")  # user reviews table

# Report (rows, columns) of each frame as a quick sanity check on the load.
print(f"Apps Dataset Shape: {df_apps.shape}")
print(f"Reviews Dataset Shape: {df_reviews.shape}")

Apps Dataset Shape: (10841, 13)
Reviews Dataset Shape: (64295, 5)


### 3. Initial Data Exploration

In [3]:
# Show the first ten rows of each dataset under its own heading.
# The second heading carries a leading newline to separate the two tables.
previews = (
    ("=== Apps Dataset ===", df_apps),
    ("\n=== User Reviews Dataset ===", df_reviews),
)
for heading, frame in previews:
    print(heading)
    display(frame.head(10))

=== Apps Dataset ===


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up



=== User Reviews Dataset ===


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
6,10 Best Foods for You,Amazing,Positive,0.6,0.9
7,10 Best Foods for You,,,,
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0


### 4. Dataset Info and Structure

In [4]:
# Structure of the apps dataset: info() prints dtypes and non-null counts
# itself, so it is called bare rather than wrapped in print().
print("=== Apps Dataset Info ===")
df_apps.info()
print("\n=== Apps Dataset Columns ===", list(df_apps.columns), sep="\n")

# Same two views for the reviews dataset.
print("\n=== Reviews Dataset Info ===")
df_reviews.info()
print("\n=== Reviews Dataset Columns ===", list(df_reviews.columns), sep="\n")

=== Apps Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB

=== Apps Dataset Columns ===
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']

### 5. Statistical Summary

In [5]:
# describe() with no arguments summarises only the numeric columns.
print("=== Numerical Columns Summary ===")
numeric_summary = df_apps.describe()
display(numeric_summary)

# include='object' switches to count / unique / top / freq for object columns.
print("\n=== Categorical Columns Summary ===")
categorical_summary = df_apps.describe(include='object')
display(categorical_summary)

=== Numerical Columns Summary ===


Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0



=== Categorical Columns Summary ===


Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,10841,10841,10841,10840,10841,10840,10841,10841,10833,10838
unique,9660,34,6002,462,22,3,93,6,120,1378,2832,33
top,ROBLOX,FAMILY,0,Varies with device,"1,000,000+",Free,0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,596,1695,1579,10039,10040,8714,842,326,1459,2451


### 6. Check Data Quality

In [6]:
def summarize_missing(frame):
    """Summarise missing values per column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    tuple
        ``(counts, percentages, report)`` where ``counts`` is the per-column
        number of nulls, ``percentages`` is that count as a share of
        ``len(frame)`` (times 100, rounded to 2 decimals), and ``report`` is a
        DataFrame with columns ``Missing_Count`` and ``Percentage`` sorted by
        ``Missing_Count`` in descending order.
    """
    counts = frame.isnull().sum()
    percentages = (counts / len(frame) * 100).round(2)
    report = pd.DataFrame({
        'Missing_Count': counts,
        'Percentage': percentages
    }).sort_values('Missing_Count', ascending=False)
    return counts, percentages, report


# Missing values: only columns with at least one null are printed.
print("=== Missing Values in Apps Dataset ===")
missing_apps, missing_pct, missing_df = summarize_missing(df_apps)
print(missing_df[missing_df['Missing_Count'] > 0])

print("\n=== Missing Values in Reviews Dataset ===")
missing_reviews, missing_pct_rev, missing_df_rev = summarize_missing(df_reviews)
print(missing_df_rev[missing_df_rev['Missing_Count'] > 0])

# Duplicate rows: duplicated() marks every repeat of an earlier, fully
# identical row, so the sum is the number of rows a drop_duplicates() would remove.
print("\n=== Duplicate Rows ===")
print(f"Apps duplicates: {df_apps.duplicated().sum()}")
print(f"Reviews duplicates: {df_reviews.duplicated().sum()}")

=== Missing Values in Apps Dataset ===
                Missing_Count  Percentage
Rating                   1474       13.60
Current Ver                 8        0.07
Android Ver                 3        0.03
Content Rating              1        0.01
Type                        1        0.01

=== Missing Values in Reviews Dataset ===
                        Missing_Count  Percentage
Translated_Review               26868       41.79
Sentiment_Polarity              26863       41.78
Sentiment                       26863       41.78
Sentiment_Subjectivity          26863       41.78

=== Duplicate Rows ===
Apps duplicates: 483
Reviews duplicates: 33616


### 7. Unique Values Analysis

In [7]:
# For every object-dtype column, print its distinct-value count followed by
# its ten most frequent values.
print("=== Unique Values in Categorical Columns ===")
categorical_cols = df_apps.select_dtypes(include='object').columns

for col in categorical_cols:
    column_values = df_apps[col]
    top_counts = column_values.value_counts().head(10)
    print(f"\n{col}: {column_values.nunique()} unique values", top_counts, sep="\n")

=== Unique Values in Categorical Columns ===

App: 9660 unique values
App
ROBLOX                                               9
CBS Sports App - Scores, News, Stats & Watch Live    8
8 Ball Pool                                          7
Candy Crush Saga                                     7
ESPN                                                 7
Duolingo: Learn Languages Free                       7
Temple Run 2                                         6
Bowmasters                                           6
Helix Jump                                           6
Nick                                                 6
Name: count, dtype: int64

Category: 34 unique values
Category
FAMILY             1972
GAME               1144
TOOLS               843
MEDICAL             463
BUSINESS            460
PRODUCTIVITY        424
PERSONALIZATION     392
COMMUNICATION       387
SPORTS              384
LIFESTYLE           382
Name: count, dtype: int64

Reviews: 6002 unique values
Reviews
0    596
1

### 8. Data Type Issues

In [8]:
# Current dtype of every column in the apps dataset.
print("=== Data Types ===", df_apps.dtypes, sep="\n")

# First ten raw values of the columns that might need type conversion.
print("\n=== Sample values from key columns ===")
for key_column in ('Rating', 'Reviews', 'Size', 'Installs', 'Price'):
    print(f"{key_column} samples: {df_apps[key_column].head(10).tolist()}")

=== Data Types ===
App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

=== Sample values from key columns ===
Rating samples: [4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7]
Reviews samples: ['159', '967', '87510', '215644', '967', '167', '178', '36815', '13791', '121']
Size samples: ['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '19M', '29M', '33M', '3.1M']
Installs samples: ['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+', '50,000+', '50,000+', '1,000,000+', '1,000,000+', '10,000+']
Price samples: ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


### 9. Save Checkpoint

In [9]:
# Write both frames, as currently held in memory, to checkpoint CSVs in the
# working directory; index=False keeps the row index out of the files.
checkpoint_targets = {
    'checkpoint_initial_apps.csv': df_apps,
    'checkpoint_initial_reviews.csv': df_reviews,
}
for csv_name, frame in checkpoint_targets.items():
    frame.to_csv(csv_name, index=False)

print("✓ Initial exploration complete!", "✓ Checkpoint files saved", sep="\n")

✓ Initial exploration complete!
✓ Checkpoint files saved
