In [18]:
import pandas as pd
import numpy as np
import math
import matplotlib
import csv
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
from matplotlib import pyplot
import pandas_profiling
%matplotlib inline

In [14]:
def save_to_csv(d, file_name):
    """
        Writes the contents of the given dict into a csv file, one
        "key,value" row per dictionary item.
        Ex: d= {'Name':138,'Population':5,...}

        Args:
            d (dict)       : dictionary containing key value pairs where key is the
                             feature name and value is the count recorded for it
                             (e.g. number of zeros or nans)
            file_name(str) : output file path; an existing file is overwritten
    """
    # newline='' hands line-ending control to the csv module. Without it,
    # text-mode newline translation turns the writer's '\r\n' into '\r\r\n'
    # on Windows, which shows up as a blank row after every record.
    with open(file_name, 'w', newline='') as fp:
        csv.writer(fp).writerows(d.items())

def count_negatives(df,features=None):
    """
        Calculate the number of negative values for given features of the dataframe

        Only numeric columns are compared with zero. Any other requested
        feature (strings, datetimes, booleans, ...) is reported with a count
        of 0, so every requested feature has an entry in the result.

        Args:
            df (Pandas Dataframe)  : A pandas dataframe
            features(list) : list of features for which negative count needs to be
                             calculated; None or an empty list means every column
        Returns:
            dict : dictionary containing key value pair where key is the
                   feature name and value is the number of negative values
    """
    if isinstance(features, str):
        # A single column name is treated as a one-element list.
        features = [features]
    elif features is None or len(features) == 0:
        # Explicit None/length test: `not features` raises ValueError for a
        # numpy array or pandas Index holding more than one label.
        features = df.columns
    features = list(features)

    # Restrict the comparison to numeric columns; `< 0` is meaningless for
    # strings and raises TypeError for datetimes.
    numeric = df[features].select_dtypes(include='number')
    negative_counts = (numeric < 0).sum(axis=0).to_dict()

    return {feature: int(negative_counts.get(feature, 0)) for feature in features}


#     return (df[features] < 0).sum(axis=0).to_dict() 
            
def count_zeros(df,features=None):
    """
        Calculate the number of zeros for given features of the dataframe

        Args:
            df (Pandas Dataframe)  : A pandas dataframe
            features(list) : list of features for which zero count needs to be
                             calculated; None or an empty list means every column
        Returns:
            dict : dictionary containing key value pair where key is the
                   feature name and value is the number of zeros
    """
    if isinstance(features, str):
        # A single column name is treated as a one-element list.
        features = [features]
    elif features is None or len(features) == 0:
        # Explicit None/length test: `not features` raises ValueError for a
        # numpy array or pandas Index holding more than one label.
        features = df.columns
    return (df[features] == 0).sum(axis=0).to_dict()

def count_nans(df,features=None):
    """
        Calculate the number of nans for given features of the dataframe

        Args:
            df (Pandas Dataframe)  : A pandas dataframe
            features(list) : list of features for which nan count needs to be
                             calculated; None or an empty list means every column
        Returns:
            dict : dictionary containing key value pair where key is the
                   feature name and value is the number of nans
    """
    if isinstance(features, str):
        # A single column name is treated as a one-element list.
        features = [features]
    elif features is None or len(features) == 0:
        # Explicit None/length test: `not features` raises ValueError for a
        # numpy array or pandas Index holding more than one label.
        features = df.columns

    # isna().sum() counts the missing cells directly; it equals
    # len(df) - count() for every column.
    return df[features].isna().sum(axis=0).to_dict()

def drop_features(df, threshold=0.5):
    """
        Split the columns of the dataframe into those to drop and those to
        retain, based on the fraction of invalid (zero or nan) values in each
        column. The dataframe itself is not modified.

        Args:
            df (Pandas Dataframe)  : A pandas dataframe
            threshold(float [0,1]) : a column is marked for dropping when
                                     (zeros + nans) / number of rows >= threshold
        Return:
            tuple(set, set) : (to_drop, to_retain) sets of column names
    """
    total_instances = len(df)

    # Use the counting helpers defined alongside this function; the
    # previously referenced get_zero_count/get_nan_count are not defined in
    # the visible notebook code.
    zero_count = count_zeros(df)
    nan_count = count_nans(df)

    to_drop = set()
    to_retain = set()
    for key, zeros in zero_count.items():
        total_invalid = zeros + nan_count.get(key, 0)
        # A dataframe without rows has no invalid values; treat the fraction
        # as 0 instead of dividing by zero.
        invalid_fraction = total_invalid / total_instances if total_instances else 0.0
        if invalid_fraction >= threshold:
            to_drop.add(key)
        else:
            to_retain.add(key)
    return to_drop, to_retain
        
def get_stats(df,features=None):
    """
        Summarise nan, zero and negative counts for given features of the dataframe

        Args:
            df (Pandas Dataframe)  : A pandas dataframe
            features(list) : list of features to summarise; None or an empty
                             list means every column
        Returns:
            dict : maps each feature name to a dict with the keys
                   'total_missing', 'nans', 'zeros' and 'negatives'. Each value
                   is a (count, fraction of rows) tuple; the fractions for
                   'nans', 'zeros' and 'negatives' are rounded to 4 decimals,
                   the 'total_missing' fraction is left unrounded.
                   'total_missing' is nans + zeros.
    """
    if features is None or len(features) == 0:
        features = df.columns.tolist()
    nans = count_nans(df,features)
    zeros = count_zeros(df,features)
    negatives = count_negatives(df,features)
    total_instances = len(df)

    def fraction(count):
        # A dataframe without rows yields 0.0 rather than a ZeroDivisionError.
        return count / total_instances if total_instances else 0.0

    d = {}
    for feature in features:
        nan_count = nans[feature]
        zero_count = zeros[feature]
        # count_negatives may not report a feature (e.g. a non-numeric
        # column); treat that as zero negatives instead of raising KeyError.
        negative_count = negatives.get(feature, 0)

        # NOTE(review): the previous code only skipped zeros when
        # `type(df[feature]) == int`, which is never true for a pandas Series,
        # so zeros were always counted as missing. That effective behaviour is
        # kept here; confirm whether integer columns were meant to be exempt.
        total_missing = nan_count + zero_count

        d[feature] = {
            'total_missing': (total_missing, fraction(total_missing)),
            'nans': (nan_count, round(fraction(nan_count), 4)),
            'zeros': (zero_count, round(fraction(zero_count), 4)),
            'negatives': (negative_count, round(fraction(negative_count), 4)),
        }
    return d

# def load_data(path,drop_list=[]):
#     df = pd.read_csv(path)
#     if len(drop_list):
#         df = df.drop(columns=drop_list)
#     stats = get_stats(df)
    
#     return df, stats

def get_features(features, search_string):
    """
        Select the features whose name contains the search string, ignoring case

        Args:
            features(list)     : feature names (column labels) to search
            search_string(str) : text to look for inside each feature name
        Returns:
            list : the matching entries of features, unchanged and in their
                   original order
    """
    # Lower-case both sides: previously only the feature name was lowered, so
    # any search string containing an uppercase letter could never match.
    needle = search_string.lower()
    # str() lets non-string column labels (e.g. integer years) be searched too.
    return [x for x in features if needle in str(x).lower()]

## Observations ##

      

**1. If a feature contains only zeros or NaNs, drop it.**

**2. County level data has valid values except for year 2016.**

**3. 6059 records have Name set to "NOT AVAILABLE".**
     

  


## TODO ## 

Fix get_stats()