The purpose of this notebook is to perform data cleaning on any given object that can be converted to a pandas DataFrame. Tasks:

Removing unwanted observations by deleting duplicate or irrelevant records (e.g., two records with identical ids)

Fixing structural errors such as typos and inconsistent naming (e.g., "America" and "america" both appearing in a list of countries)

Managing unwanted outliers (e.g., data points more than 3 standard deviations from the mean of a normally distributed variable)

Handling missing data (either drop the affected rows/columns or impute replacement values)

In [8]:
#Handle imports for processing and displaying data
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.preprocessing import Imputer
import scipy

In [14]:
class DataCleaner:
    """Load a CSV file into a pandas DataFrame and clean it.

    The constructor reads the CSV, optionally sets an index column and
    drops unwanted columns, and can then run the default cleaning
    pipeline (``transformColTypes`` followed by ``handleNulls``).
    Every cleaning step is also available as a separate method.

    Attributes
    ----------
    file_path : str or file-like or None
        The source handed to ``pandas.read_csv``.
    cols_to_drop : list or None
        The string list of columns to be dropped by name
    index_col : str or None
        The index column of the dataframe
    data : pandas.DataFrame or None
        Data which needs to be cleaned (``None`` until something is loaded)

    Methods
    -------
    valueToNan(data, value, columns)
        Replace a placeholder value with NaN in the given columns.
    getNulls(data)
        Count the missing values in every column.
    getInfo(data)
        Print the ``DataFrame.info()`` summary.
    handleNulls(data, max_missing=0.9)
        Drop mostly-empty columns/rows and mean-impute numeric columns.
    transformColTypes(data, max_categories=5)
        Infer dtypes and one-hot encode low-cardinality text columns.
    getDataFrame()
        Return the current state of the cleaned data.
    """

    # Class-level fallbacks; __init__ overwrites all three per instance.
    cols_to_drop = []
    index_col = ''
    data = None

    def __init__(self, filePath=None, cols_to_drop=None,
                 index_col=None, automate=False):
        """
        Parameters
        ----------
        filePath : str or file-like, optional
            The filepath to the csv file (anything ``pandas.read_csv``
            accepts). When omitted no data is loaded and ``data`` stays
            ``None``.
        cols_to_drop: list
            The string list of columns to be dropped by name
        index_col: str
            The index column of the dataframe
        automate: bool
            Whether or not to try and automatically clean the data
        """
        self.file_path = filePath
        self.cols_to_drop = cols_to_drop
        self.index_col = index_col
        self.data = None
        if filePath is None:
            # Nothing to load; the cleaning methods can still be used on
            # frames passed to them explicitly.
            return

        frame = pd.read_csv(filePath)
        if index_col is not None:
            frame = frame.set_index(index_col)
        if cols_to_drop is not None:
            frame = frame.drop(columns=cols_to_drop)
        self.data = frame

        # Automatically clean the data if desired
        if automate:
            self.transformColTypes(self.data)
            self.handleNulls(self.data)

    def _resolve_frame(self, data):
        """Return ``data``, falling back to ``self.data`` when it is None."""
        frame = self.data if data is None else data
        if frame is None:
            raise ValueError("no data available: pass a DataFrame or load a file")
        return frame

    @staticmethod
    def _is_categorical_candidate(series):
        """Tell whether a column holds text/categorical (non-numeric) values."""
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def valueToNan(self, data, value, columns):
        """
        This function converts values within columns to NaN and is used
        for invalid values which have been marked as 0, null, etc.
        The frame is modified in place.

        Parameters
        ----------
        data is a dataframe containing data
        value is the invalid value
        columns is a single column label or a list of labels where the
            invalid value is located
        """
        frame = self._resolve_frame(data)
        # Normalise to a list so both "col" and ["col_a", "col_b"] work.
        if pd.api.types.is_list_like(columns):
            selected = list(columns)
        else:
            selected = [columns]
        frame[selected] = frame[selected].replace(value, np.nan)

    def getNulls(self, data):
        """Return a Series with the number of missing values per column."""
        return self._resolve_frame(data).isnull().sum()

    def getInfo(self, data):
        """Print the ``DataFrame.info()`` summary of ``data``.

        ``DataFrame.info`` writes to stdout and returns ``None``, so this
        method returns ``None`` as well.
        """
        return self._resolve_frame(data).info()

    def handleNulls(self, data, max_missing=0.9):
        """
        This function drops columns, then rows, in which more than
        ``max_missing`` of the values are missing, and fills the
        remaining gaps in numeric columns with the column mean.
        Missing values in non-numeric columns are left untouched.
        The result is stored in ``self.data``.

        Parameters
        ----------
        data is a dataframe containing data
        max_missing is the largest tolerated fraction (0 to 1) of missing
            values in a column or row before it is dropped

        Raises
        ------
        ValueError
            If ``max_missing`` is outside the range 0 to 1.
        """
        if not 0 <= max_missing <= 1:
            raise ValueError("max_missing must be between 0 and 1")
        frame = self._resolve_frame(data)

        # Written as ~(x > limit) so that an undefined fraction (NaN, from
        # a frame with no rows or no columns) keeps the column/row.
        col_missing = frame.isnull().mean(axis=0)
        frame = frame.loc[:, ~(col_missing > max_missing)]

        # Row fractions are measured after the column drop, so an
        # already-removed empty column cannot condemn a row.
        row_missing = frame.isnull().mean(axis=1)
        frame = frame.loc[~(row_missing > max_missing)]

        # Impute with the per-column mean; only numeric columns have one.
        means = frame.select_dtypes(include="number").mean()
        if not means.empty:
            frame = frame.fillna(means)
        self.data = frame

    def transformColTypes(self, data, max_categories=5):
        """
        This function tries to infer better column types than the default
        dtypes and one-hot encodes text/categorical columns that have
        at least 2 and fewer than ``max_categories`` distinct values.
        The first category of every encoded column is dropped.
        The result is stored in ``self.data``.

        Parameters
        ----------
        data is a dataframe containing data
        max_categories is the exclusive upper bound on the number of
            distinct values a column may have to be one-hot encoded
        """
        # First, try to infer data types to convert object type columns
        frame = self._resolve_frame(data).infer_objects()

        to_encode = []
        for column in frame.columns:
            series = frame[column]
            if not self._is_categorical_candidate(series):
                continue
            try:
                distinct = series.nunique()
            except TypeError:
                # Unhashable cell values (e.g. lists) cannot be counted,
                # so the column cannot be treated as categorical.
                continue
            # A constant column would vanish entirely under drop_first,
            # so it is left as it is.
            if 1 < distinct < max_categories:
                to_encode.append(column)

        if to_encode:
            frame = pd.get_dummies(frame, columns=to_encode, drop_first=True)
        self.data = frame

    def getDataFrame(self):
        """Return the current (possibly cleaned) DataFrame, or ``None``."""
        return self.data
    
    
    
    

    
