In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

        
def time_series_plot(df):
    """Given dataframe, generate times series plot of numeric data by daily, monthly and yearly frequency"""
    print("\nTo check time series of numeric data  by daily, monthly and yearly frequency")
    if len(df.select_dtypes(include='datetime64').columns)>0:
        for col in df.select_dtypes(include='datetime64').columns:
            for p in ['D', 'M', 'Y']:
                if p=='D':
                    print("Plotting daily data")
                elif p=='M':
                    print("Plotting monthly data")
                else:
                    print("Plotting yearly data")
                for col_num in df.select_dtypes(include=np.number).columns:
                    __ = df.copy()
                    __ = __.set_index(col)
                    __T = __.resample(p).sum()
                    ax = __T[[col_num]].plot()
                    ax.set_ylim(bottom=0)
                    ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
                    plt.show()

                    
def numeric_eda(df, hue=None):
    """Given dataframe, generate EDA of numeric data"""
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)
    columns = df.select_dtypes(include=np.number).columns
    figure = plt.figure(figsize=(20, 10))
    figure.add_subplot(1, len(columns), 1)
    for index, col in enumerate(columns):
        if index > 0:
            figure.add_subplot(1, len(columns), index + 1)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'})
    figure.tight_layout()
    plt.show()
    
    if len(df.select_dtypes(include='category').columns) > 0:
        for col_num in df.select_dtypes(include=np.number).columns:
            for col in df.select_dtypes(include='category').columns:
                fig = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
                fig.set_xticklabels(rotation=90)
                plt.show()
    
    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue==None:
        sns.pairplot(df.select_dtypes(include=np.number))
    else:
        sns.pairplot(df.select_dtypes(include=np.number).join(df[[hue]]), hue=hue)
    plt.show()


def top5(df):
    """Given dataframe, generate top 5 unique values for non-numeric data"""
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        print("Top 5 unique values of " + col)
        print(df[col].value_counts().reset_index().rename(columns={"index": col, col: "Count"})[
              :min(5, len(df[col].value_counts()))])
        print(" ")
    
    
def categorical_eda(df, hue=None):
    """Given dataframe, generate EDA of categorical data"""
    print("\nTo check: \nUnique count of non-numeric data\n")
    print(df.select_dtypes(include=['object', 'category']).nunique())
    top5(df)
    # Plot count distribution of categorical data
    for col in df.select_dtypes(include='category').columns:
        fig = sns.catplot(x=col, kind="count", data=df, hue=hue)
        fig.set_xticklabels(rotation=90)
        plt.show()
    

def eda(df):
    """Given dataframe, generate exploratory data analysis"""
    # check that input is pandas dataframe
    if type(df) != pd.core.frame.DataFrame:
        raise TypeError("Only pandas dataframe is allowed as input")
        
    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    print(df.info())

    # generate preview of entries with null values
    if df.isnull().any(axis=None):
        print("\nPreview of data with null values:")
        display(df[df.isnull().any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    if len(df[df.duplicated()]) > 0:
        print("\n***Number of duplicated entries: ", len(df[df.duplicated()]))
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)
    
    # EDA of numeric data
    numeric_eda(df)
        
    # Plot time series plot of numeric data
    time_series_plot(df)

In [2]:
df = pd.read_csv(r'C:\Users\mohit\OneDrive\Desktop\omdena-texas-homelessness\src\data\housing\2007-2020-Hosuing Inventory Count-Counts-by-State.xlsx - 2020.csv')

In [3]:
df.head()

Unnamed: 0,State,"Total Year-Round Beds (ES, TH, SH)","Total Non-DV Year-Round Beds (ES, TH, SH)","Total HMIS Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",Total Year-Round Beds (ES),Total Year-Round Beds (TH),Total Year-Round Beds (SH),"Total Units for Households with Children (ES, TH, SH)","Total Beds for Households with Children (ES, TH, SH)",...,Total Year-Round Beds (OPH),Total Non-DV Year-Round Beds (OPH),Total HMIS Year-Round Beds (OPH),HMIS Participation Rate for Year-Round Beds (OPH),Total Units for Households with Children (OPH),Total Beds for Households with Children (OPH),Total Beds for Households without Children (OPH),Total Beds for Households with only Children (OPH),Dedicated Veteran Beds (OPH),Dedicated Youth Beds (OPH)
0,AK,1885,1347,1122,59.52%,1410,475,0,194,722,...,92,92,92,100.00%,4,8,84,0,0,0
1,AL,2913,2291,1659,56.95%,1992,887,34,411,1134,...,40,40,32,80.00%,0,0,40,0,0,0
2,AR,1686,1193,680,40.33%,1330,356,0,181,617,...,0,0,0,.,0,0,0,0,0,0
3,AZ,6079,4943,4352,71.59%,3632,2370,77,821,2876,...,346,346,346,100.00%,75,294,52,0,33,10
4,CA,53265,48649,35666,66.96%,38241,14760,264,7344,23569,...,8707,8618,6185,71.03%,849,2684,6011,12,464,171


In [4]:
df.isnull().sum()

State                                                       0
Total Year-Round Beds (ES, TH, SH)                          0
Total Non-DV Year-Round Beds (ES, TH, SH)                   0
Total HMIS Year-Round Beds (ES, TH, SH)                     0
HMIS Participation Rate for Year-Round Beds (ES, TH, SH)    0
                                                           ..
Total Beds for Households with Children (OPH)               0
Total Beds for Households without Children (OPH)            0
Total Beds for Households with only Children (OPH)          0
Dedicated Veteran Beds (OPH)                                0
Dedicated Youth Beds (OPH)                                  0
Length: 77, dtype: int64

In [5]:
df.nunique()

State                                                       56
Total Year-Round Beds (ES, TH, SH)                          56
Total Non-DV Year-Round Beds (ES, TH, SH)                   55
Total HMIS Year-Round Beds (ES, TH, SH)                     56
HMIS Participation Rate for Year-Round Beds (ES, TH, SH)    55
                                                            ..
Total Beds for Households with Children (OPH)               37
Total Beds for Households without Children (OPH)            43
Total Beds for Households with only Children (OPH)           6
Dedicated Veteran Beds (OPH)                                25
Dedicated Youth Beds (OPH)                                  14
Length: 77, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 77 columns):
 #   Column                                                     Non-Null Count  Dtype 
---  ------                                                     --------------  ----- 
 0   State                                                      56 non-null     object
 1   Total Year-Round Beds (ES, TH, SH)                         56 non-null     int64 
 2   Total Non-DV Year-Round Beds (ES, TH, SH)                  56 non-null     int64 
 3   Total HMIS Year-Round Beds (ES, TH, SH)                    56 non-null     int64 
 4   HMIS Participation Rate for Year-Round Beds (ES, TH, SH)   56 non-null     object
 5   Total Year-Round Beds (ES)                                 56 non-null     int64 
 6   Total Year-Round Beds (TH)                                 56 non-null     int64 
 7   Total Year-Round Beds (SH)                                 56 non-null     int64 
 8   Total Units for Househ

In [7]:
df.shape

(56, 77)

# Let's Remove Special Characters

In [8]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","$"]

col = df.select_dtypes('object').columns

for i in range(len(spec_chars)):
    for j in range(1,5):
        df[col[j]] = df[col[j]].str.replace(spec_chars[i] , "")

In [9]:
df.head()

Unnamed: 0,State,"Total Year-Round Beds (ES, TH, SH)","Total Non-DV Year-Round Beds (ES, TH, SH)","Total HMIS Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",Total Year-Round Beds (ES),Total Year-Round Beds (TH),Total Year-Round Beds (SH),"Total Units for Households with Children (ES, TH, SH)","Total Beds for Households with Children (ES, TH, SH)",...,Total Year-Round Beds (OPH),Total Non-DV Year-Round Beds (OPH),Total HMIS Year-Round Beds (OPH),HMIS Participation Rate for Year-Round Beds (OPH),Total Units for Households with Children (OPH),Total Beds for Households with Children (OPH),Total Beds for Households without Children (OPH),Total Beds for Households with only Children (OPH),Dedicated Veteran Beds (OPH),Dedicated Youth Beds (OPH)
0,AK,1885,1347,1122,59.52,1410,475,0,194,722,...,92,92,92,100.00%,4,8,84,0,0,0
1,AL,2913,2291,1659,56.95,1992,887,34,411,1134,...,40,40,32,80.00%,0,0,40,0,0,0
2,AR,1686,1193,680,40.33,1330,356,0,181,617,...,0,0,0,.,0,0,0,0,0,0
3,AZ,6079,4943,4352,71.59,3632,2370,77,821,2876,...,346,346,346,100.00%,75,294,52,0,33,10
4,CA,53265,48649,35666,66.96,38241,14760,264,7344,23569,...,8707,8618,6185,71.03%,849,2684,6011,12,464,171


In [10]:
# Now we can see all those unwanted special character got removed

# Dealing with Wrong Datatypes

In [11]:
# AS we can see we have some columns which are given as object dataype but we can see all of them are of numeric 


In [12]:
col = df.select_dtypes('object').columns

for i in range(len(col)):
    df[col[i]] = pd.to_numeric(df[col[i]], errors='coerce')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 77 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   State                                                      0 non-null      float64
 1   Total Year-Round Beds (ES, TH, SH)                         56 non-null     int64  
 2   Total Non-DV Year-Round Beds (ES, TH, SH)                  56 non-null     int64  
 3   Total HMIS Year-Round Beds (ES, TH, SH)                    56 non-null     int64  
 4   HMIS Participation Rate for Year-Round Beds (ES, TH, SH)   56 non-null     float64
 5   Total Year-Round Beds (ES)                                 56 non-null     int64  
 6   Total Year-Round Beds (TH)                                 56 non-null     int64  
 7   Total Year-Round Beds (SH)                                 56 non-null     int64  
 8   Total Units 

In [14]:
# Now we can easily see that all our columns are in crrect datatype

# Let's Deal with Null Values and High Cardinality

In [15]:
# For NULL values we will keep those columns which has around 50% of NAN data

n =int(0.5 * df.shape[0])

# For Unique values we will keep only those columns which has around 25% of unique values
p = int(0.1 * df.shape[0])

col = df.columns
for i in range(len(col)):
    if df[col[i]].isnull().sum() > n:
        df.drop([col[i]] , axis= 1 , inplace= True)

# After dropping some columns we will make around list of columns
col = df.select_dtypes('object').columns

for i in range(len(col)):
    if df[col[i]].nunique() > p:
        df.drop([col[i]] , axis=1 , inplace=True)

In [16]:
df.shape

(56, 73)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 73 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Total Year-Round Beds (ES, TH, SH)                         56 non-null     int64  
 1   Total Non-DV Year-Round Beds (ES, TH, SH)                  56 non-null     int64  
 2   Total HMIS Year-Round Beds (ES, TH, SH)                    56 non-null     int64  
 3   HMIS Participation Rate for Year-Round Beds (ES, TH, SH)   56 non-null     float64
 4   Total Year-Round Beds (ES)                                 56 non-null     int64  
 5   Total Year-Round Beds (TH)                                 56 non-null     int64  
 6   Total Year-Round Beds (SH)                                 56 non-null     int64  
 7   Total Units for Households with Children (ES, TH, SH)      56 non-null     int64  
 8   Total Beds f

In [18]:
df.nunique()

Total Year-Round Beds (ES, TH, SH)                          56
Total Non-DV Year-Round Beds (ES, TH, SH)                   55
Total HMIS Year-Round Beds (ES, TH, SH)                     56
HMIS Participation Rate for Year-Round Beds (ES, TH, SH)    55
Total Year-Round Beds (ES)                                  56
                                                            ..
Total Beds for Households with Children (OPH)               37
Total Beds for Households without Children (OPH)            43
Total Beds for Households with only Children (OPH)           6
Dedicated Veteran Beds (OPH)                                25
Dedicated Youth Beds (OPH)                                  14
Length: 73, dtype: int64

In [19]:
df.head()

Unnamed: 0,"Total Year-Round Beds (ES, TH, SH)","Total Non-DV Year-Round Beds (ES, TH, SH)","Total HMIS Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",Total Year-Round Beds (ES),Total Year-Round Beds (TH),Total Year-Round Beds (SH),"Total Units for Households with Children (ES, TH, SH)","Total Beds for Households with Children (ES, TH, SH)","Total Beds for Households without Children (ES, TH, SH)",...,Dedicated Chronically Homeless Beds (PSH),Total Year-Round Beds (OPH),Total Non-DV Year-Round Beds (OPH),Total HMIS Year-Round Beds (OPH),Total Units for Households with Children (OPH),Total Beds for Households with Children (OPH),Total Beds for Households without Children (OPH),Total Beds for Households with only Children (OPH),Dedicated Veteran Beds (OPH),Dedicated Youth Beds (OPH)
0,1885,1347,1122,59.52,1410,475,0,194,722,1141,...,459,92,92,92,4,8,84,0,0,0
1,2913,2291,1659,56.95,1992,887,34,411,1134,1736,...,943,40,40,32,0,0,40,0,0,0
2,1686,1193,680,40.33,1330,356,0,181,617,1069,...,22,0,0,0,0,0,0,0,0,0
3,6079,4943,4352,71.59,3632,2370,77,821,2876,3170,...,1903,346,346,346,75,294,52,0,33,10
4,53265,48649,35666,66.96,38241,14760,264,7344,23569,29515,...,23391,8707,8618,6185,849,2684,6011,12,464,171


In [20]:
sol = (df == 0).sum(axis=0)

In [21]:
df['HMIS Participation Rate for Year-Round Beds (ES, TH, SH)'][df['HMIS Participation Rate for Year-Round Beds (ES, TH, SH)']==0].count()

1

In [22]:
sol

Total Year-Round Beds (ES, TH, SH)                           0
Total Non-DV Year-Round Beds (ES, TH, SH)                    1
Total HMIS Year-Round Beds (ES, TH, SH)                      1
HMIS Participation Rate for Year-Round Beds (ES, TH, SH)     1
Total Year-Round Beds (ES)                                   0
                                                            ..
Total Beds for Households with Children (OPH)               17
Total Beds for Households without Children (OPH)            13
Total Beds for Households with only Children (OPH)          51
Dedicated Veteran Beds (OPH)                                28
Dedicated Youth Beds (OPH)                                  41
Length: 73, dtype: int64

In [23]:
len(sol)

73

In [24]:
col = df.columns
n = 0.23 * df.shape[0]
for i in range(len(sol)):
    if (df[col[i]]==0).sum() > n:
        df.drop([col[i]] , axis = 1 , inplace=True)
    

In [25]:
df.shape

(56, 53)

In [26]:
df.head()

Unnamed: 0,"Total Year-Round Beds (ES, TH, SH)","Total Non-DV Year-Round Beds (ES, TH, SH)","Total HMIS Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",Total Year-Round Beds (ES),Total Year-Round Beds (TH),"Total Units for Households with Children (ES, TH, SH)","Total Beds for Households with Children (ES, TH, SH)","Total Beds for Households without Children (ES, TH, SH)","Total Beds for Households with only Children (ES, TH, SH)",...,Total Non-DV Year-Round Beds (PSH),Total HMIS Year-Round Beds (PSH),Total Units for Households with Children (PSH),Total Beds for Households with Children (PSH),Total Beds for Households without Children (PSH),Dedicated Veteran Beds (PSH),Dedicated Chronically Homeless Beds (PSH),Total Year-Round Beds (OPH),Total Non-DV Year-Round Beds (OPH),Total HMIS Year-Round Beds (OPH)
0,1885,1347,1122,59.52,1410,475,194,722,1141,22,...,963,478,106,318,645,453,459,92,92,92
1,2913,2291,1659,56.95,1992,887,411,1134,1736,43,...,3274,1823,311,822,2452,1501,943,40,40,32
2,1686,1193,680,40.33,1330,356,181,617,1069,0,...,1040,632,124,337,703,408,22,0,0,0
3,6079,4943,4352,71.59,3632,2370,821,2876,3170,33,...,8703,7720,702,2417,6277,2467,1903,346,346,346
4,53265,48649,35666,66.96,38241,14760,7344,23569,29515,181,...,65585,46205,6517,20342,45515,21385,23391,8707,8618,6185


In [27]:
df.to_csv(r'C:\Users\mohit\OneDrive\Desktop\omdena-texas-homelessness\src\tasks\task-2-EDA\cleaned_datasets\Final_HIC_By_LHC_2018.csv')

In [28]:
col = df.columns

In [29]:
f, ax = plt.subplots(figsize=(10, 6))
corr = df[col].corr()
hm = sns.heatmap(round(corr,2), annot=True, ax=ax, cmap="coolwarm",fmt='.2f',
                 linewidths=.05)
f.subplots_adjust(top=0.93)
t= f.suptitle('Heatmap of Poverty Stats', fontsize=14)

Error in callback <function flush_figures at 0x000001ECF26809D0> (for post_execute):


KeyboardInterrupt: 

In [48]:
ans = df.corr()

In [49]:
ans

Unnamed: 0,"Total Year-Round Beds (ES, TH, SH)","Total Non-DV Year-Round Beds (ES, TH, SH)","Total HMIS Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",Total Year-Round Beds (ES),Total Year-Round Beds (TH),"Total Units for Households with Children (ES, TH, SH)","Total Beds for Households with Children (ES, TH, SH)","Total Beds for Households without Children (ES, TH, SH)","Total Beds for Households with only Children (ES, TH, SH)",...,Total Non-DV Year-Round Beds (PSH),Total HMIS Year-Round Beds (PSH),Total Units for Households with Children (PSH),Total Beds for Households with Children (PSH),Total Beds for Households without Children (PSH),Dedicated Veteran Beds (PSH),Dedicated Chronically Homeless Beds (PSH),Total Year-Round Beds (OPH),Total Non-DV Year-Round Beds (OPH),Total HMIS Year-Round Beds (OPH)
"Total Year-Round Beds (ES, TH, SH)",1.0,0.999772,0.999096,0.121148,0.998678,0.985795,0.998223,0.998765,0.99899,0.978494,...,0.993249,0.98909,0.990017,0.989358,0.994617,0.985728,0.994755,0.974105,0.974232,0.97498
"Total Non-DV Year-Round Beds (ES, TH, SH)",0.999772,1.0,0.999609,0.126361,0.999342,0.982672,0.998983,0.999329,0.998071,0.974495,...,0.991385,0.986445,0.987461,0.98681,0.993129,0.98305,0.992984,0.971691,0.971828,0.97274
"Total HMIS Year-Round Beds (ES, TH, SH)",0.999096,0.999609,1.0,0.1364,0.99973,0.978506,0.999598,0.999766,0.996374,0.971916,...,0.988411,0.983088,0.984645,0.983706,0.990263,0.978728,0.991186,0.967983,0.968134,0.969443
"HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",0.121148,0.126361,0.1364,1.0,0.126516,0.101006,0.13333,0.130623,0.112446,0.084538,...,0.118473,0.117562,0.118823,0.118286,0.11861,0.105671,0.126526,0.131801,0.131767,0.131693
Total Year-Round Beds (ES),0.998678,0.999342,0.99973,0.126516,1.0,0.97586,0.999537,0.999673,0.995682,0.969424,...,0.986617,0.980795,0.982227,0.981375,0.988778,0.976383,0.989451,0.964911,0.96508,0.966624
Total Year-Round Beds (TH),0.985795,0.982672,0.978506,0.101006,0.97586,1.0,0.975443,0.97729,0.991304,0.989886,...,0.996538,0.997857,0.99711,0.997093,0.99531,0.998019,0.993614,0.986244,0.986227,0.984376
"Total Units for Households with Children (ES, TH, SH)",0.998223,0.998983,0.999598,0.13333,0.999537,0.975443,1.0,0.999864,0.994629,0.968243,...,0.986182,0.9804,0.982317,0.981114,0.988238,0.975675,0.989066,0.966475,0.966652,0.967952
"Total Beds for Households with Children (ES, TH, SH)",0.998765,0.999329,0.999766,0.130623,0.999673,0.97729,0.999864,1.0,0.995527,0.970511,...,0.987445,0.982011,0.983619,0.982547,0.989402,0.977567,0.990205,0.968183,0.968345,0.96961
"Total Beds for Households without Children (ES, TH, SH)",0.99899,0.998071,0.996374,0.112446,0.995682,0.991304,0.994629,0.995527,1.0,0.98327,...,0.996385,0.993337,0.993628,0.993371,0.997239,0.990989,0.996677,0.977282,0.977377,0.977634
"Total Beds for Households with only Children (ES, TH, SH)",0.978494,0.974495,0.971916,0.084538,0.969424,0.989886,0.968243,0.970511,0.98327,1.0,...,0.984148,0.986985,0.987222,0.985843,0.982438,0.98456,0.987437,0.972985,0.972895,0.973828


In [33]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

df.drop(to_drop, axis=1, inplace=True)

In [34]:
df.shape

(56, 4)

In [35]:
to_drop

['Total Non-DV Year-Round Beds (ES, TH, SH)',
 'Total HMIS Year-Round Beds (ES, TH, SH)',
 'Total Year-Round Beds (ES)',
 'Total Year-Round Beds (TH)',
 'Total Units for Households with Children (ES, TH, SH)',
 'Total Beds for Households with Children (ES, TH, SH)',
 'Total Beds for Households without Children (ES, TH, SH)',
 'Total Beds for Households with only Children (ES, TH, SH)',
 'Dedicated Veteran Beds (ES, TH, SH)',
 'Dedicated Youth Beds (ES, TH, SH)',
 'Total Non-DV Year-Round Beds (ES)',
 'Total HMIS Year-Round Beds (ES)',
 'HMIS Participation Rate for Year-Round Beds (ES)',
 'Total Seasonal Beds (ES)',
 'Total Overflow Beds (ES)',
 'Total Units for Households with Children (ES)',
 'Total Beds for Households with Children (ES)',
 'Total Beds for Households without Children (ES)',
 'Total Beds for Households with only Children (ES)',
 'Dedicated Veteran Beds (ES)',
 'Dedicated Youth Beds (ES)',
 'Total Non-DV Year-Round Beds (TH)',
 'Total HMIS Year-Round Beds (TH)',
 'Total

In [36]:
df.head()

Unnamed: 0,"Total Year-Round Beds (ES, TH, SH)","HMIS Participation Rate for Year-Round Beds (ES, TH, SH)",HMIS Participation Rate for Year-Round Beds (TH),HMIS Participation Rate for Year-Round Beds (SH)
0,1885,59.52,71.37,
1,2913,56.95,55.02,100.0
2,1686,40.33,67.7,
3,6079,71.59,70.21,100.0
4,53265,66.96,62.52,91.67


# Let's Perform EDA

In [21]:
# Nothing will come as we don't have any categorical columns# Nothing will come as we don't have any categorical related columns

categorical_eda(df)


To check: 
Unique count of non-numeric data

Series([], dtype: float64)


In [22]:
# Nothing will come as we don't have any categorical columns# Nothing will come as we don't have date time related columns

time_series_plot(df)


To check time series of numeric data  by daily, monthly and yearly frequency


In [23]:
top5(df)

In [24]:
!pip install pandas-profiling
from pandas_profiling import ProfileReport
ProfileReport(df)

You should consider upgrading via the 'C:\Users\mohit\anaconda3\python.exe -m pip install --upgrade pip' command.




Summarize dataset:  92%|█████████▏| 79/86 [09:23<00:49,  7.14s/it, Get scatter matrix] 


KeyboardInterrupt: 

<Figure size 576x396 with 0 Axes>

In [None]:
removing_col = []