In [1]:
import numpy as np         # Importing NumPy
import pandas as pd        # Importing Pandas
import matplotlib.pyplot as plt  # Importing Matplotlib for plotting
import seaborn as sns      # Importing Seaborn (assuming Feedbone was a typo)


In [2]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('googleplaystore.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [4]:
# Drop the duplicate rows and store the result back in df
df = df.drop_duplicates()

df.head(5)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Determined Class (Data Cleaner) with three funcctions for the following tasks:
#### Task 1 : For the given columns [Rating,Size,Price,Installs] of the data frame determine the non_numeric characters
#### Task 2 : eliminate those non_numeric characters
#### Task 3 : Change the data type of the above-mentioned columns to numeric from object


In [6]:
class DataCleaner:
    def __init__(self, df):
        """
        Initialize the DataCleaner with a DataFrame.
        """
        self.df = df

    def find_invalid_chars(self, column_name, valid_chars):
        """
        Find invalid characters in a column based on the valid characters list.

        Parameters:
            column_name (str): The column to inspect.
            valid_chars (list): List of valid characters.

        Returns:
            pd.DataFrame: DataFrame of invalid characters.
            list: List of invalid characters found.
        """
        self.invalid_chars = []
        unique_values = self.df[column_name].unique()
        
        for value in unique_values:
            for char in str(value):  # Convert to string to handle non-string values
                if char not in valid_chars:
                    self.invalid_chars.append(char)
        
        # Remove duplicates
        self.invalid_chars = list(set(self.invalid_chars))
        
        # Create a DataFrame for invalid characters
        invalid_chars_df = pd.DataFrame(self.invalid_chars, columns=["Values"])
        
        return invalid_chars_df, self.invalid_chars

    def clean_invalid_chars(self, column_name):
        """
        Remove invalid characters from a column.

        Parameters:
            column_name (str): The column to clean.

        Returns:
            list: Cleaned column values.
        """
        cleaned_column = []
        for value in self.df[column_name]:
            for char in str(value):  # Convert to string to handle non-string values
                if char in self.invalid_chars:
                    value = str(value).replace(char, "")
            cleaned_column.append(value)
        
        # Update the column in the DataFrame
        self.df[column_name] = cleaned_column
        return self.df[column_name]

    def convert_to_float(self, column_name):
        """
        Replace empty strings with NaN and convert the column to float.

        Parameters:
            column_name (str): The column to convert.

        Returns:
            pd.Series: The converted column.
        """
        self.df[column_name] = self.df[column_name].replace('', np.nan).astype(float)
        return self.df[column_name]

    def get_dataframe(self):
        """
        Return the cleaned DataFrame.

        Returns:
            pd.DataFrame: The cleaned DataFrame.
        """
        return self.df


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            10358 non-null  object 
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  object 
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


#### Calling the functions :

In [29]:
# Initialize the DataCleaner
cleaner = DataCleaner(df)

# Step 1: Find invalid characters
valid_chars = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "."]
invalid_chars_df, invalid_chars = cleaner.find_invalid_chars("Installs", valid_chars)
print("Invalid Characters:", invalid_chars_df)

# Step 2: Clean invalid characters
cleaner.clean_invalid_chars("Installs")

# Step 3: Convert to float
cleaner.convert_to_float("Installs")

# View the cleaned DataFrame
print(cleaner.get_dataframe())

Invalid Characters:   Values
0      r
1      ,
2      F
3      +
4      e
                                                     App             Category  \
0         Photo Editor & Candy Camera & Grid & ScrapBook       ART_AND_DESIGN   
1                                    Coloring book moana       ART_AND_DESIGN   
2      U Launcher Lite – FREE Live Cool Themes, Hide ...       ART_AND_DESIGN   
3                                  Sketch - Draw & Paint       ART_AND_DESIGN   
4                  Pixel Draw - Number Art Coloring Book       ART_AND_DESIGN   
...                                                  ...                  ...   
10836                                   Sya9a Maroc - FR               FAMILY   
10837                   Fr. Mike Schmitz Audio Teachings               FAMILY   
10838                             Parkinson Exercices FR              MEDICAL   
10839                      The SCP Foundation DB fr nn5n  BOOKS_AND_REFERENCE   
10840      iHoroscope - 2018 Daily 

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10358 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            8832 non-null   float64
 5   Installs        10357 non-null  float64
 6   Type            10357 non-null  object 
 7   Price           10357 non-null  float64
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(4), object(9)
memory usage: 1.1+ MB


# All the rewuired columns Ratings Installs, Price and Installs have been succefully changed to data type Float fron Object

# Thankyou