# Dropping Columns in a Dataframe

In [None]:
def dropNanValues(df, threshold, inplace=False):
    """
    Drop every column of "df" whose number of null values is greater
    than or equal to "threshold".

    df:        the dataframe to filter.
    threshold: null count at (or above) which a column is removed.
    inplace:   forwarded to DataFrame.drop; when True "df" itself is
               modified and None is returned, otherwise a new dataframe
               without the offending columns is returned.
    """
    nullCounts = df.isnull().sum()
    # Collect the labels of all columns that reach the null-count limit.
    colsToDrop = [
        colName
        for colName, nullCount in nullCounts.items()
        if nullCount >= threshold
    ]
    return df.drop(columns=colsToDrop, inplace=inplace)

In [None]:
def dropNanPercentages(df, threshold, inplace=False):
    """
    Drop every column of "df" whose null count is greater than or equal
    to "threshold" times the number of rows in "df".

    df:        the dataframe to filter.
    threshold: fraction of the row count (e.g. 0.5 for half of the rows),
               not a 0-100 percentage; a column is removed when its null
               count reaches threshold * number_of_rows.
    inplace:   when True the columns are removed from "df" itself and
               None is returned (same convention as DataFrame.drop);
               otherwise "df" is left untouched and a new dataframe
               holding only the kept columns is returned.
    """
    nullCounts = df.isnull().sum()
    # A column is kept only while its null count stays strictly below the limit.
    keepMask = nullCounts < threshold * df.shape[0]
    if inplace:
        # Rebinding the local name would not affect the caller's object,
        # so the columns have to be dropped on the passed-in frame itself.
        df.drop(columns=nullCounts[~keepMask].index.tolist(), inplace=True)
        return None
    return df.loc[:, keepMask]

# Seeing the columns in multiple datasets

In [None]:
def datasetsCols(dfList, datasetNames, pad=""):
    """
    Build a dataframe that lists the column names of several dataframes
    side by side: one output column per input dataframe, titled with the
    matching entry of "datasetNames".

    Dataframes with fewer columns than the widest one get their list
    filled up with "pad" so that all output columns have equal length.

    Example
    pad = "XX", datasetNames = ["cars", "planes"]
    dfList = [carsDF, planesDF]
    returned data frame:
            cars            planes
    0       numOfWheels     numOfWings
    1       manufacturer    manufacturer
    2       make            XX
    """
    colNameLists = [frame.columns.tolist() for frame in dfList]
    # Length of the longest column list (0 when no dataframes were given).
    longest = max((len(names) for names in colNameLists), default=0)

    table = {}
    for position, names in enumerate(colNameLists):
        filler = [pad] * (longest - len(names))
        table[datasetNames[position]] = names + filler

    return pd.DataFrame(table)

# Seeing the nulls of each Dataset's Columns

In [None]:
def datasetsNulls(dfList, datasetNames, nullCol="nulls_", pad=""):
    """
    Build a dataframe that shows, for each dataset in turn, a column with
    its column names followed by a column with the null count of each of
    those columns.

    The name column is titled with the matching entry of "datasetNames";
    the null-count column is titled "nullCol" plus the 1-based position
    of the dataset. Shorter datasets are filled up to the length of the
    widest one: names with "pad", null counts with None.
    Returns a dataframe
    """
    colNameLists = [frame.columns.tolist() for frame in dfList]
    nullCountLists = [frame.isnull().sum().tolist() for frame in dfList]
    # Length of the longest column list (0 when no dataframes were given).
    longest = max((len(names) for names in colNameLists), default=0)

    table = {}
    for position, names in enumerate(colNameLists):
        counts = nullCountLists[position]
        # Insertion order matters: names first, then their null counts.
        table[datasetNames[position]] = names + [pad] * (longest - len(names))
        table[nullCol + str(position + 1)] = counts + [None] * (longest - len(counts))

    return pd.DataFrame(table)

In [None]:
def datasetsNullsSameIndex(datasetsNames, *args):
    """
    Compare null counts across datasets that share many column names.

    datasetsNames: titles for the output columns, one per dataframe.
    *args:         the dataframes, in the same order as "datasetsNames".

    Returns a dataframe indexed by the column names of all datasets,
    with one column of null counts per dataset.
    """
    nullsPerDataset = {
        datasetsNames[position]: frame.isnull().sum()
        for position, frame in enumerate(args)
    }
    return pd.DataFrame(nullsPerDataset)