In [5]:
pip install pandas numpy scikit-learn matplotlib seaborn


Collecting pandas
  Using cached pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.4-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached 

In [6]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

# Step 2: Load the user reviews dataset
# Read the raw user-review CSV from the data directory and preview the first five rows.
df_reviews = pd.read_csv(filepath_or_buffer='../data/googleplaystore_user_reviews.csv')
df_reviews.head(n=5)


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [7]:
# Step 3: Check null values
# Count the missing entries in each column of the reviews frame.
df_reviews.isna().sum()


App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [8]:
# Step 4: Handle missing values
# Rows without review text are dropped first. Any gaps that remain in the two
# numeric sentiment columns are then filled with that column's mean, computed
# on the rows that survived the drop.
df_reviews = (
    df_reviews
    .dropna(subset=['Translated_Review'])  # discard rows where review text is missing
    .assign(
        Sentiment_Polarity=lambda frame: frame['Sentiment_Polarity'].fillna(
            frame['Sentiment_Polarity'].mean()
        ),
        Sentiment_Subjectivity=lambda frame: frame['Sentiment_Subjectivity'].fillna(
            frame['Sentiment_Subjectivity'].mean()
        ),
    )
)

# Re-count the missing entries per column to confirm the result
df_reviews.isna().sum()


App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [9]:
# Step 5: Remove duplicates
# Keep the first occurrence of every fully identical row and discard the repeats,
# then show the resulting (rows, columns) size.
df_reviews = df_reviews.loc[~df_reviews.duplicated()]
df_reviews.shape


(29692, 5)

In [10]:
# Step 6: Save the cleaned file
# Write the cleaned reviews to the data directory; the row index is not written to the CSV.
df_reviews.to_csv(path_or_buf='../data/cleaned_user_reviews.csv', index=False)
print("✅ Cleaned user reviews data saved successfully!")

✅ Cleaned user reviews data saved successfully!


In [11]:
# Step 7: Load the main app dataset
# Read the raw app-listing CSV from the data directory and preview the first five rows.
df_apps = pd.read_csv(filepath_or_buffer='../data/googleplaystore.csv')
df_apps.head(n=5)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [12]:
# Step 8: Check for missing values
# Count the missing entries in each column of the apps frame.
df_apps.isna().sum()


App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [13]:
# Step 9: Drop missing and duplicate rows
# Exact duplicate rows go first; after that, any row lacking a value in one of
# the listed columns is removed. The final expression shows the remaining size.
df_apps = (
    df_apps
    .drop_duplicates()
    .dropna(subset=['Rating', 'Category', 'Reviews', 'Installs'])
)
df_apps.shape

(8893, 13)

In [14]:
# Step 10: Clean numeric columns safely
# 'Installs', 'Reviews' and 'Price' are converted to numbers. Values that
# cannot be parsed become NaN (errors='coerce') and are replaced by 0 at the end.

# Clean 'Installs' column: strip the '+' suffix and the thousands separators
# (e.g. "10,000+" -> "10000") so the text can be parsed as a number.
df_apps['Installs'] = (
    df_apps['Installs']
    .astype(str)  # ensure string type
    .str.replace('[+,]', '', regex=True)  # character class: remove '+' and ','
)
# Convert to numeric, invalid ones become NaN
df_apps['Installs'] = pd.to_numeric(df_apps['Installs'], errors='coerce')

# Clean 'Reviews' column
df_apps['Reviews'] = pd.to_numeric(df_apps['Reviews'], errors='coerce')

# Clean 'Price' column: remove the literal dollar sign.
# regex=False is required here. As a regular expression '$' is the
# end-of-string anchor, so it would match an empty position and leave the
# dollar sign in place; every "$x.yz" price would then fail to parse, be
# coerced to NaN and end up as 0 after the fillna below.
df_apps['Price'] = (
    df_apps['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
)
df_apps['Price'] = pd.to_numeric(df_apps['Price'], errors='coerce')

# Optional: fill NaN values with 0 if you want all numeric
df_apps[['Installs', 'Reviews', 'Price']] = df_apps[['Installs', 'Reviews', 'Price']].fillna(0)

# Check types and preview cleaned data
print(df_apps.dtypes)
print(df_apps[['Installs', 'Reviews', 'Price']].head())


App                object
Category           object
Rating            float64
Reviews           float64
Size               object
Installs          float64
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object
     Installs   Reviews  Price
0     10000.0     159.0    0.0
1    500000.0     967.0    0.0
2   5000000.0   87510.0    0.0
3  50000000.0  215644.0    0.0
4    100000.0     967.0    0.0


In [15]:
# Step 11: Check for invalid or extreme ratings
# Keep only rows whose rating lies in the closed range [0, 5]; a missing rating
# does not satisfy the range test, so such rows are removed as well.
df_apps = df_apps[df_apps['Rating'].between(0, 5)]


In [16]:
# Step 12: Save cleaned dataset
# Write the cleaned app data to the data directory; the row index is not written to the CSV.
df_apps.to_csv(path_or_buf='../data/cleaned_googleplaystore.csv', index=False)
print("✅ Cleaned main app dataset saved successfully!")


✅ Cleaned main app dataset saved successfully!
