In [2]:
import pandas as pd
import numpy as np

# --- Load the Raw Data ---
try:
    df = pd.read_csv(r'Data\googleplaystore.csv')
    print("Successfully loaded googleplaystore.csv.")
    print(f"Initial shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'googleplaystore.csv' not found. Please make sure it's in the same directory.")
    # Exit or raise an error if the file is not found
    exit()

# --- Subtask 1: Handle Missing Values ---
print("\nCleaning missing values...")
# Fill missing 'Rating' values with the mean of the column
mean_rating = df['Rating'].mean()
df['Rating'].fillna(mean_rating, inplace=True)

# Fill missing 'Type' with the most common value (mode)
mode_type = df['Type'].mode()[0]
df['Type'].fillna(mode_type, inplace=True)

# Drop rows with any remaining missing values in critical columns
df.dropna(inplace=True)


# --- Subtask 2: Clean and Convert Data Types ---
print("Cleaning and converting data types...")
# Clean 'Price' column
df['Price'] = df['Price'].apply(lambda x: str(x).replace('$', '') if '$' in str(x) else str(x))
df['Price'] = df['Price'].apply(lambda x: float(x))

# Clean 'Reviews' column
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Clean 'Installs' column
df['Installs'] = df['Installs'].apply(lambda x: str(x).replace('+', '').replace(',', ''))
df['Installs'] = df['Installs'].apply(lambda x: int(x))

# Clean 'Size' column (convert M to KB for consistency)
def clean_size(size):
    if 'M' in size:
        return float(size.replace('M', '')) * 1024
    elif 'k' in size:
        return float(size.replace('k', ''))
    return np.nan # Varies with device will become NaN

df['Size'] = df['Size'].apply(clean_size)
# Fill 'Varies with device' sizes with the mean size
df['Size'].fillna(df['Size'].mean(), inplace=True)


# --- Subtask 3: Remove Duplicates ---
print("Removing duplicate apps...")
df.drop_duplicates(subset=['App'], keep='first', inplace=True)


# --- Subtask 4: Save the Cleaned Dataset ---
output_filename = r'output\cleaned_google_play.csv'
df.to_csv(output_filename, index=False)
print(f"\nSuccessfully cleaned the dataset and saved it as '{output_filename}'.")
print(f"Final shape: {df.shape}")

Successfully loaded googleplaystore.csv.
Initial shape: (10841, 13)

Cleaning missing values...
Cleaning and converting data types...
Removing duplicate apps...

Successfully cleaned the dataset and saved it as 'output\cleaned_google_play.csv'.
Final shape: (9649, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(mean_rating, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Type'].fillna(mode_type, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw