In [1]:
import importlib
import os
import subprocess
import sys
import warnings
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")
%matplotlib inline

        
def time_series_plot(df):
    """Plot every numeric column summed per day, month and year.

    For each ``datetime64`` column in ``df`` the frame is indexed by that
    column, its numeric columns are resampled with ``sum`` at daily ('D'),
    monthly ('M') and yearly ('Y') frequency, and one line chart per numeric
    column is shown for each frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    None
        Output is printed text and matplotlib figures only. When ``df`` has
        no ``datetime64`` column only the header line is printed.
    """
    print("\nTo check time series of numeric data  by daily, monthly and yearly frequency")
    datetime_cols = df.select_dtypes(include='datetime64').columns
    if len(datetime_cols) == 0:
        return

    numeric_cols = df.select_dtypes(include=np.number).columns
    # NOTE(review): pandas >= 2.2 deprecates the 'M' and 'Y' aliases in favour
    # of 'ME' and 'YE'; they are kept so older pandas versions still work.
    frequency_labels = {'D': 'daily', 'M': 'monthly', 'Y': 'yearly'}

    for col in datetime_cols:
        # Index once per datetime column and keep only the numeric columns:
        # summing object/category columns is meaningless and category columns
        # make ``sum`` raise on recent pandas versions.
        indexed = df.set_index(col)[numeric_cols]
        for rule, label in frequency_labels.items():
            print(f"Plotting {label} data")
            # Resample once per frequency instead of once per plotted column.
            resampled = indexed.resample(rule).sum()
            for col_num in numeric_cols:
                ax = resampled[[col_num]].plot()
                ax.set_ylim(bottom=0)
                # Thousands separators on the y axis; a fresh formatter is
                # created per axis because formatters are bound to their axis.
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(
                        lambda value, _pos: format(int(value), ',')))
                plt.show()

                    
def numeric_eda(df, hue=None):
    """Show summary statistics and distribution plots for numeric columns.

    Steps, in order:

    1. Display ``df.describe().T``.
    2. Draw one box plot per numeric column, side by side in a single figure.
    3. For every (numeric column, ``category`` column) pair, draw a violin
       plot of the numeric values split by category.
    4. Draw a seaborn pair plot of the numeric columns, optionally coloured
       by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    hue : column label, optional
        Column used to colour the pair plot. It may be numeric or not.

    Returns
    -------
    None
        When ``df`` has no numeric column, a message is printed after the
        summary table and all plots are skipped.
    """
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)

    numeric_df = df.select_dtypes(include=np.number)
    numeric_cols = numeric_df.columns
    if len(numeric_cols) == 0:
        # A subplot grid with zero columns is invalid, so stop before plotting.
        print("No numeric columns found; skipping numeric plots")
        return

    # squeeze=False keeps ``axes`` two-dimensional even for a single column.
    figure, axes = plt.subplots(1, len(numeric_cols), figsize=(20, 10), squeeze=False)
    for ax, col in zip(axes[0], numeric_cols):
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'}, ax=ax)
    figure.tight_layout()
    plt.show()

    category_cols = df.select_dtypes(include='category').columns
    for col_num in numeric_cols:
        for col in category_cols:
            grid = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            grid.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(numeric_df)
    else:
        # Joining a hue column that is already numeric would fail on the
        # overlapping column name, so only join when it is missing.
        pair_df = numeric_df if hue in numeric_cols else numeric_df.join(df[[hue]])
        sns.pairplot(pair_df, hue=hue)
    plt.show()


def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each column of dtype ``object`` or ``category`` a small table is
    printed with the value in a column named after the source column and its
    frequency in a column named ``Count``, most frequent first. Fewer than
    five rows are printed when the column has fewer distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    None
    """
    for col in df.select_dtypes(include=['object', 'category']).columns:
        # f-string so non-string column labels do not break concatenation.
        print(f"Top 5 unique values of {col}")
        # Name the index and the counts explicitly: the default names produced
        # by value_counts().reset_index() differ between pandas versions.
        top_counts = (
            df[col]
            .value_counts()
            .head(5)
            .rename_axis(col)
            .reset_index(name="Count")
        )
        print(top_counts)
        print(" ")
    
    
def categorical_eda(df, hue=None):
    """Summarise the non-numeric columns of ``df``.

    Prints the number of distinct values of every ``object`` / ``category``
    column, prints their most frequent values via ``top5``, then draws one
    count plot per ``category``-dtype column (``object`` columns are not
    plotted). ``hue`` is forwarded to the count plots.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # One count plot per categorical column, with vertical tick labels.
    category_columns = df.select_dtypes(include='category').columns
    for category_col in category_columns:
        grid = sns.catplot(x=category_col, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()

def _load_profile_report_class():
    """Return the ``ProfileReport`` class from an installed profiling package, or ``None``."""
    # ``pandas-profiling`` was renamed to ``ydata-profiling``; accept either.
    for module_name in ("ydata_profiling", "pandas_profiling"):
        try:
            return importlib.import_module(module_name).ProfileReport
        except ImportError:
            continue
    return None


def profiling_report(df):
    """Build a full profiling report for ``df`` and display it.

    The profiling package is installed on demand only when neither
    ``ydata_profiling`` nor ``pandas_profiling`` can be imported. The install
    uses the running interpreter's own pip so it lands in the kernel's
    environment.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile.

    Returns
    -------
    None
        The report is rendered with ``display`` so that it is visible even
        when this function is called from inside another function.

    Raises
    ------
    ImportError
        If the profiling package is still unavailable after the install.
    subprocess.CalledProcessError
        If the on-demand ``pip install`` fails.
    """
    report_cls = _load_profile_report_class()
    if report_cls is None:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "ydata-profiling"])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        report_cls = _load_profile_report_class()
    if report_cls is None:
        raise ImportError(
            "Could not import ydata_profiling or pandas_profiling; "
            "install it with `%pip install ydata-profiling`")

    # A bare expression inside a function is never rendered, so show the
    # report explicitly.
    display(report_cls(df))
    

def eda(df):
    """Run the complete exploratory data analysis on a DataFrame.

    The steps are, in order: a three-row preview, ``df.info()``, a preview
    and ``missingno`` matrix of rows containing nulls (if any), a
    duplicate-row report, then ``categorical_eda``, ``numeric_eda``,
    ``time_series_plot`` and ``profiling_report``.

    Cells that are empty or contain only whitespace are treated as missing.
    The caller's frame is not modified; the replacement is applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. DataFrame subclasses are accepted.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    # check that input is pandas dataframe (subclasses are fine)
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() prints its own report and returns None, so it must not be
    # wrapped in print() or a stray "None" line is emitted.
    df.info()

    # generate preview of entries with null values
    null_mask = df.isnull()
    if null_mask.values.any():
        print("\nPreview of data with null values:")
        display(df[null_mask.any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False shows every member of each duplicate group, sorted so
        # identical rows sit next to each other.
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)

    # helps to make a full report
    profiling_report(df)

In [2]:
# The CSV location can be overridden with the HOUSING_DATA_PATH environment
# variable so the notebook can run on machines other than the one it was
# written on; the default is the path that was originally hard-coded here.
DATA_PATH = Path(os.environ.get(
    "HOUSING_DATA_PATH",
    r'C:\Users\mohit\OneDrive\Desktop\omdena-texas-homelessness\src\data\housing\cc-est2019-alldata-48.csv',
))
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
0,50,48,1,Texas,Anderson County,1,0,58458,35521,22937,...,5784,2707,321,129,182,88,188,34,82,28
1,50,48,1,Texas,Anderson County,1,1,3135,1566,1569,...,385,313,22,27,13,9,8,7,3,2
2,50,48,1,Texas,Anderson County,1,2,3258,1729,1529,...,375,324,17,16,12,11,1,7,9,5
3,50,48,1,Texas,Anderson County,1,3,3156,1573,1583,...,283,315,21,17,11,11,3,3,2,4
4,50,48,1,Texas,Anderson County,1,4,3107,1676,1431,...,324,255,7,12,11,13,4,1,2,2


In [4]:
# Count missing values per column.
df.isnull().sum()

SUMLEV         0
STATE          0
COUNTY         0
STNAME         0
CTYNAME        0
              ..
HIAC_FEMALE    0
HAAC_MALE      0
HAAC_FEMALE    0
HNAC_MALE      0
HNAC_FEMALE    0
Length: 80, dtype: int64

In [5]:
# Summarise row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57912 entries, 0 to 57911
Data columns (total 80 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   SUMLEV        57912 non-null  int64 
 1   STATE         57912 non-null  int64 
 2   COUNTY        57912 non-null  int64 
 3   STNAME        57912 non-null  object
 4   CTYNAME       57912 non-null  object
 5   YEAR          57912 non-null  int64 
 6   AGEGRP        57912 non-null  int64 
 7   TOT_POP       57912 non-null  int64 
 8   TOT_MALE      57912 non-null  int64 
 9   TOT_FEMALE    57912 non-null  int64 
 10  WA_MALE       57912 non-null  int64 
 11  WA_FEMALE     57912 non-null  int64 
 12  BA_MALE       57912 non-null  int64 
 13  BA_FEMALE     57912 non-null  int64 
 14  IA_MALE       57912 non-null  int64 
 15  IA_FEMALE     57912 non-null  int64 
 16  AA_MALE       57912 non-null  int64 
 17  AA_FEMALE     57912 non-null  int64 
 18  NA_MALE       57912 non-null  int64 
 19  NA_F

In [6]:
# Number of distinct values per column; the next cell drops columns with exactly one.
df.nunique()

SUMLEV            1
STATE             1
COUNTY          254
STNAME            1
CTYNAME         254
               ... 
HIAC_FEMALE    1152
HAAC_MALE       688
HAAC_FEMALE     694
HNAC_MALE       340
HNAC_FEMALE     328
Length: 80, dtype: int64

In [7]:
# Drop columns that hold a single distinct value: they cannot help tell rows
# apart. nunique() ignores NaN, so an all-NaN column (0 distinct values) is
# kept by this step.
constant_cols = df.columns[df.nunique() == 1]
df = df.drop(columns=constant_cols)

In [8]:
# Check the frame's dimensions after dropping the constant columns.
df.shape

(57912, 77)

In [9]:
# Summarise the non-numeric columns of the reduced frame.
categorical_eda(df)


To check: 
Unique count of non-numeric data

CTYNAME    254
dtype: int64
Top 5 unique values of CTYNAME
           CTYNAME  Count
0  Anderson County    228
1   Navarro County    228
2  Maverick County    228
3    Medina County    228
4    Menard County    228
 


In [10]:
# Re-check the dimensions before filtering columns by null share and cardinality.
df.shape

(57912, 77)

In [11]:
# For NULL values we will keep those columns which has around 50% of NAN data

# Drop columns in which more than half of the rows are null.
MAX_NULL_FRACTION = 0.5
# Drop text (object) columns whose number of distinct values exceeds 10% of
# the row count.
MAX_UNIQUE_FRACTION = 0.1

max_null_count = int(MAX_NULL_FRACTION * df.shape[0])
max_unique_count = int(MAX_UNIQUE_FRACTION * df.shape[0])

mostly_null_cols = df.columns[df.isnull().sum() > max_null_count]
df = df.drop(columns=mostly_null_cols)

# The cardinality filter is evaluated on the object columns that survived the
# null filter above.
object_cols = df.select_dtypes('object').columns
high_cardinality_cols = object_cols[df[object_cols].nunique() > max_unique_count]
df = df.drop(columns=high_cardinality_cols)

In [12]:
# Check the dimensions after the null-share and cardinality filters.
df.shape

(57912, 77)

In [13]:
# Inspect the TOT_MALE column.
df['TOT_MALE']

0        35521
1         1566
2         1729
3         1573
4         1676
         ...  
57907      272
57908      203
57909      135
57910       82
57911       76
Name: TOT_MALE, Length: 57912, dtype: int64