In [1]:
import numpy as np         # Importing NumPy
import pandas as pd        # Importing Pandas
import matplotlib.pyplot as plt  # Importing Matplotlib for plotting
import seaborn as sns      # Importing Seaborn (assuming Feedbone was a typo)


### This notebook demonstrates how to automate data cleaning through the following tasks:
#### Task 1: For the given columns [Rating, Size, Price] of the DataFrame, determine the non-numeric characters
#### Task 2: Eliminate those non-numeric characters
#### Task 3: Change the data type of the above-mentioned columns from object to numeric

In [3]:
# Read the Google Play Store export into a DataFrame named df
df = pd.read_csv(filepath_or_buffer='googleplaystore.csv')

# Preview the top five rows
df.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [24]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output below carries execution count 24 and already
# shows 10358 rows with Price as float64, i.e. it was produced after the
# de-duplication and Price-conversion cells further down. Re-run the notebook
# top to bottom so this cell reflects the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            10358 non-null  object 
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10357 non-null  float64
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(2), object(11)
memory usage: 1.1+ MB


In [5]:
# Discard fully duplicated rows (the first occurrence is kept) and rebind df
df = df.drop_duplicates(keep='first')

df.head()


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [6]:
# We need to convert the object columns into numeric columns.
# First we need to eliminate all the non-numeric characters from each column.
## For this we have to find the distinct non-numeric characters in the column,
## remove those characters, and then change the datatype of the column to numeric.


# Task 1 : For the given columns [Rating,Size,Price] of the data frame determine the non_numeric characters

In [8]:
# Helper that determines the non-numeric characters present in a column of df

# Characters that may legitimately appear in a plain decimal number
numbers_list = ["1","2","3","4","5","6","7","8","9","0","."]
# Kept for backward compatibility only: mirrors the result of the most recent
# Invalid_chars() call. It no longer accumulates across calls.
invalid_chars = []
def Invalid_chars(df,ColumnName):
    """Report the distinct non-numeric characters found in one column.

    Every non-missing unique value of ``df[ColumnName]`` is rendered with
    ``str()`` and scanned character by character; any character that is not
    listed in ``numbers_list`` is collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    ColumnName : str
        Label of the column to inspect.

    Returns
    -------
    numpy.ndarray
        Object array of the offending characters, in order of first
        appearance. Empty when the column is already clean.

    Notes
    -----
    The result is also printed, and copied into the module-level
    ``invalid_chars`` list (replacing its previous content).
    """
    # A dict is used as an ordered set so the report keeps first-seen order.
    found = {}
    # dropna(): missing values are not text and carry no characters to report.
    for value in df[ColumnName].dropna().unique():
        # str(): lets the same scan work on numeric columns such as floats.
        for ch in str(value):
            if ch not in numbers_list:
                found[ch] = None

    invalid = pd.Series(list(found), dtype=object).to_numpy()
    # Replace (not extend) so characters from earlier columns never leak in.
    invalid_chars[:] = invalid.tolist()
    print(invalid,"for Column :",ColumnName)
    return invalid

Invalid_chars(df, ColumnName="Price")


    

['$' 'E' 'v' 'e' 'r' 'y' 'o' 'n'] for Column : Price


# Task 2 : eliminate those non_numeric characters

In [10]:
def Change_values(df, ColumnName, Invalid_chars):
    """Remove every occurrence of the given characters from a column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is cleaned; ``df[ColumnName]`` is overwritten.
    ColumnName : str
        Label of the column to clean.
    Invalid_chars : iterable of str
        Characters (or substrings) to delete from each text value. They are
        removed one after another, in the order given.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Only ``str`` cells are edited. Missing values (NaN/None) and non-text
    cells are passed through unchanged instead of raising.
    """
    # Materialise once so generators work and each token is a plain str.
    tokens = [str(token) for token in Invalid_chars]

    def _strip(value):
        # Leave NaN/None and numeric cells alone: they have no .replace().
        if not isinstance(value, str):
            return value
        for token in tokens:
            value = value.replace(token, "")
        return value

    # map() keeps the original index, so rows stay aligned after assignment.
    df[ColumnName] = df[ColumnName].map(_strip)


# Strip the characters that Task 1 reported for the Price column
Change_values(df, ColumnName="Price", Invalid_chars=['$', 'E', 'v', 'e', 'r', 'y', 'o', 'n'])

# Inspect the distinct Price values that remain after cleaning
print(df.loc[:, "Price"].unique())


['0' '4.99' '3.99' '6.99' '1.49' '2.99' '7.99' '5.99' '3.49' '1.99' '9.99'
 '7.49' '0.99' '9.00' '5.49' '10.00' '24.99' '11.99' '79.99' '16.99'
 '14.99' '1.00' '29.99' '12.99' '2.49' '10.99' '1.50' '19.99' '15.99'
 '33.99' '74.99' '39.99' '3.95' '4.49' '1.70' '8.99' '2.00' '3.88' '25.99'
 '399.99' '17.99' '400.00' '3.02' '1.76' '4.84' '4.77' '1.61' '2.50'
 '1.59' '6.49' '1.29' '5.00' '13.99' '299.99' '379.99' '37.99' '18.99'
 '389.99' '19.90' '8.49' '1.75' '14.00' '4.85' '46.99' '109.99' '154.99'
 '3.08' '2.59' '4.80' '1.96' '19.40' '3.90' '4.59' '15.46' '3.04' '4.29'
 '2.60' '3.28' '4.60' '28.99' '2.95' '2.90' '1.97' '200.00' '89.99' '2.56'
 '30.99' '3.61' '394.99' '1.26' '' '1.20' '1.04']


# Task 3 : Change the data type of the above-mentioned columns to numeric from object

In [12]:
# Turn the empty strings left behind by the character stripping into NaN so the
# column can be cast to float.
# - np.nan is used instead of None: on pandas < 1.4 an explicit None passed to
#   replace() was ignored and the cell was pad-filled from the previous row.
# - The former trailing .dropna() is gone: assigning back to df["Price"]
#   re-aligns on the index, so the dropped row was re-inserted as NaN anyway.
df["Price"] = df["Price"].replace('', np.nan)


In [13]:
def Change_dtype(df,ColumnName,dtype):
    """Cast ``df[ColumnName]`` to ``dtype`` in place and print the dtype before and after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is converted; the column is overwritten.
    ColumnName : str
        Label of the column to convert.
    dtype : str or type
        Target dtype, forwarded unchanged to ``Series.astype``.
    """
    dtype_before = df[ColumnName].dtype
    print(f"Initail dtype of {ColumnName} :",dtype_before)

    converted = df[ColumnName].astype(dtype)
    df[ColumnName] = converted

    dtype_after = df[ColumnName].dtype
    print(f"New dtype of {ColumnName}:",dtype_after)
Change_dtype(df, ColumnName="Price", dtype="float")

Initail dtype of Price : object
New dtype of Price: float64


In [26]:
# Final check of dtypes and non-null counts: Price is now reported as float64.
# NOTE(review): in the cells visible above only Price was run through
# Tasks 1-3; Size is still listed as object in the output below.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            10358 non-null  object 
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10357 non-null  float64
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(2), object(11)
memory usage: 1.1+ MB
