# Imports

In [1]:
import os
import sys
import time
import sklearn
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

# --- Notebook-wide settings -------------------------------------------------
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Both pandas display options are set to 0.
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_colwidth', 0)

# Echo the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# --- Environment banner -----------------------------------------------------
# Printed once so the captured output records where and when the notebook ran.
print(
    f"Time: {time.ctime()}",
    f"Py Version: {sys.version}",
    f"Py Executable: {sys.executable}",
    f"Operation Sys: {sys.platform}",
    f"Working Directory: {os.getcwd()}",
    f"Numpy: {np.__version__}, Pandas: {pd.__version__}, Sklearn: {sklearn.__version__}",
    sep="\n",
)

Time: Thu Dec 16 23:40:28 2021
Py Version: 3.6.10 |Anaconda, Inc.| (default, May  7 2020, 19:46:08) [MSC v.1916 64 bit (AMD64)]
Py Executable: C:\Users\Dakshayani\.conda\envs\atlantis_anaconda_venv\python.exe
Operation Sys: win32
Working Directory: E:\Git Rajiv2806\Python-Code-Helpers
Numpy: 1.19.2, Pandas: 1.1.5, Sklearn: 0.23.2


In [1]:
def _format_stat(value, decimals=0):
    """Render one summary statistic as text without raising on NaN or infinity.

    With ``decimals == 0`` the value is truncated with ``int()``; a positive
    ``decimals`` formats it with that many fixed decimal places instead.
    """
    if pd.isna(value):
        # E.g. the standard deviation of a single observation: int(nan) raises.
        return 'nan'
    if decimals > 0:
        return f"{float(value):.{decimals}f}"
    try:
        return str(int(value))
    except OverflowError:
        # +/-infinity has no integer representation.
        return str(value)


def _describe_column(column, column_as_text, n_unique, threshold, stat_decimals):
    """Build the ``uniq_vals`` text for a single column.

    * datetime columns -> ``"From: <min> Till: <max>"``
    * integer / float columns -> ``"Min: .. Max: .. Mean: .. Med: .. Std: .."``
    * any other column -> the list of distinct (stringified) values when there
      are at most ``threshold`` of them, otherwise ``'--'``

    Datetime and numeric columns with no non-missing value yield ``""``.
    """
    dtype = column.dtype

    if pd.api.types.is_datetime64_any_dtype(dtype):
        observed = column.dropna()
        if observed.empty:
            # min()/max() would be NaT, which has no strftime.
            return ""
        return ("From: " + observed.min().strftime('%d-%b-%Y')
                + " Till: " + observed.max().strftime('%d-%b-%Y'))

    # Booleans and complex numbers are deliberately not treated as numeric here.
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        observed = column.dropna()
        if observed.empty:
            return ""
        stats = (
            ("Min", observed.min()),
            ("Max", observed.max()),
            ("Mean", observed.mean()),
            ("Med", observed.median()),
            ("Std", observed.std()),
        )
        return " ".join(
            f"{label}: {_format_stat(value, stat_decimals)}" for label, value in stats
        )

    if n_unique <= threshold:
        # Best effort: fall back to the placeholder rather than abort the report.
        try:
            return str(column_as_text.unique().tolist())
        except Exception:
            return '--'
    return '--'


def _build_df_info(df, threshold, stat_decimals):
    """Assemble the per-column summary table. ``df`` must have at least one row."""
    n_rows, n_cols = df.shape

    # Distinct values are counted on a string view of the data. Missing values
    # are cast to strings as well, so they count as one distinct value.
    text_df = df.astype(str)

    # Columns are addressed by position, so a repeated column label still
    # yields one Series per column.
    n_unique = [text_df.iloc[:, pos].nunique() for pos in range(n_cols)]
    uniq_vals = [
        _describe_column(df.iloc[:, pos], text_df.iloc[:, pos],
                         n_unique[pos], threshold, stat_decimals)
        for pos in range(n_cols)
    ]
    n_missing = df.isna().sum().values

    return pd.DataFrame({
        'cols': list(df.columns),
        'data_types': df.dtypes.values,
        'n_missing': n_missing,
        'missing_pct': (n_missing / n_rows * 100).round(2),
        'n_unique': n_unique,
        'uniq_vals': uniq_vals,
    })


def _render_df_info(df_info, sample_df, side_by_side):
    """Show the summary table and, when ``sample_df`` is not None, the sample."""
    try:
        from IPython.display import HTML, display, display_html
    except ImportError:
        # Outside IPython/Jupyter: plain-text output instead of failing.
        print(df_info.to_string())
        if sample_df is not None:
            print(sample_df.to_string())
        return

    tables = [(df_info, 'DF Info')]
    if sample_df is not None:
        tables.append((sample_df, 'DF'))

    if side_by_side:
        try:
            fragments = [
                table.style
                     .set_table_attributes("style='display:inline'")
                     .set_caption(caption)
                     ._repr_html_()
                for table, caption in tables
            ]
        except ImportError:
            # DataFrame.style needs the optional jinja2 package; without it
            # the tables are shown stacked instead.
            fragments = None
        if fragments is not None:
            # Inline tables separated by ten non-breaking spaces.
            display_html(("\xa0" * 10).join(fragments), raw=True)
            return

    for table, _caption in tables:
        display(HTML(table.to_html()))


def df_info_func(df, vizualize=False, threshold=3, display_sample=True, sample_count=5,
                 side_by_side=True, stat_decimals=0):
    """Print the shape of ``df`` and display a per-column summary table.

    The summary has one row per column with: ``cols``, ``data_types``,
    ``n_missing``, ``missing_pct``, ``n_unique`` and ``uniq_vals`` (date range
    for datetime columns, Min/Max/Mean/Med/Std for integer and float columns,
    the distinct values for other columns with at most ``threshold`` of them,
    ``'--'`` otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. A frame with zero rows only prints
        ``"Empty DataFrame"``.
    vizualize : bool, default False
        When True, also draw ``missingno.matrix(df)``; ``missingno`` is imported
        lazily so it is only required in that case.
    threshold : int, default 3
        Maximum number of distinct values for which they are listed.
    display_sample : bool, default True
        Whether to show sample rows next to / below the summary. Honoured in
        both layouts.
    sample_count : int, default 5
        Number of rows taken from the head and from the tail for the sample.
        When the frame has at most ``2 * sample_count`` rows the whole frame is
        shown once, so no row is repeated.
    side_by_side : bool, default True
        Render the tables inline next to each other (pandas Styler) instead of
        stacked.
    stat_decimals : int, default 0
        Decimal places for the numeric statistics. ``0`` keeps integer
        truncation (a mean of 0.38 is shown as ``0``); use e.g. ``2`` for
        ``0.38``.

    Returns
    -------
    None
        Output is produced only through ``print`` / IPython display. Nothing is
        returned on purpose: this notebook sets ``ast_node_interactivity`` to
        ``"all"``, so a returned frame would be echoed a second time.
    """
    print(f'Rows: {df.shape[0]} N Cols: {df.shape[1]}')

    if df.shape[0] == 0:
        print("Empty DataFrame")
        return

    df_info = _build_df_info(df, threshold, stat_decimals)

    if vizualize:
        # Missing-value matrix; seaborn's heatmap of df.isnull() is an alternative.
        import missingno as msno
        msno.matrix(df)

    if not display_sample:
        sample_df = None
    elif sample_count >= 0 and len(df) <= 2 * sample_count:
        # head() and tail() would overlap and repeat rows.
        sample_df = df
    else:
        sample_df = pd.concat([df.head(sample_count), df.tail(sample_count)])

    _render_df_info(df_info, sample_df, side_by_side)

# Example

In [3]:
# Load the public Titanic passenger table straight from GitHub, then summarise
# it with the helper's default settings (summary and head/tail sample shown
# side by side).
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jorisvandenbossche/pandas-tutorial/master/data/titanic.csv"
)
df_info_func(df=df)

Rows: 891 N Cols: 12


Unnamed: 0,cols,data_types,n_missing,missing_pct,n_unique,uniq_vals
0,PassengerId,int64,0,0.0,891,Min: 1 Max: 891 Mean: 446 Med: 446 Std: 257
1,Survived,int64,0,0.0,2,Min: 0 Max: 1 Mean: 0 Med: 0 Std: 0
2,Pclass,int64,0,0.0,3,Min: 1 Max: 3 Mean: 2 Med: 3 Std: 0
3,Name,object,0,0.0,891,--
4,Sex,object,0,0.0,2,"['male', 'female']"
5,Age,float64,177,19.87,89,Min: 0 Max: 80 Mean: 29 Med: 28 Std: 14
6,SibSp,int64,0,0.0,7,Min: 0 Max: 8 Mean: 0 Med: 0 Std: 1
7,Parch,int64,0,0.0,7,Min: 0 Max: 6 Mean: 0 Med: 0 Std: 0
8,Ticket,object,0,0.0,681,--
9,Fare,float64,0,0.0,248,Min: 0 Max: 512 Mean: 32 Med: 14 Std: 49

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q
