In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pandas_profiling

In [2]:
pprint = lambda st : print(st.expandtabs(32))

# Series

In [3]:
numbers = [4, 1, 2]
types = ['Gold', 'Silver', 'Bronze']
tqdm.pandas()
series = pd.Series(numbers, index=types, name='Medals')
pprint(f"index \t {series.index}")
pprint(f"attrs \t {series.dtype}")
pprint(f"to_list() \t {series.to_list()}")

index                            Index(['Gold', 'Silver', 'Bronze'], dtype='object')
attrs                            int64
to_list()                        [4, 1, 2]


# Dataframe

In [17]:
def display_side(*args):
    from IPython.display import display_html
    html_str = ''
    for df in args:
        if isinstance(df, pd.Series):
            df = pd.DataFrame(df)
        if not isinstance(df, pd.DataFrame):
            html_str += df.render()
        else:
            html_str += df.to_html()
    display_html(html_str.replace('table', 'table style="display:inline"'), raw=True)

def style():
    df = pd.DataFrame(np.random.randint(50, 150, (3, 4)))
    display(df.style.hide_index(), df.style.set_caption("Hello"))
    display_side(df, 
                 df.style.set_properties(**{'text-align':'left'}),\
                 df.style.set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))
style()

0,1,2,3
142,76,63,91
127,131,103,116
119,83,131,121


Unnamed: 0,0,1,2,3
0,142,76,63,91
1,127,131,103,116
2,119,83,131,121


Unnamed: 0,0,1,2,3
0,142,76,63,91
1,127,131,103,116
2,119,83,131,121

Unnamed: 0,0,1,2,3
0,142,76,63,91
1,127,131,103,116
2,119,83,131,121

Unnamed: 0,0,1,2,3
0,142,76,63,91
1,127,131,103,116
2,119,83,131,121


In [97]:
def modify_df():
    df = pd.DataFrame(np.random.randint(0, 10, (3, 4)), index=[f'r{i}' for i in range(3)], columns=[f'c{i}' for i in range(4)])

    df = df.progress_apply(lambda x: x**2)
    display_side(df, pd.DataFrame(df.iloc[0].describe()), df.iloc[:, 0])


    df['list'] = df.apply(lambda row: row.to_list(), axis = 1)
    df['average'] = df.mean(numeric_only=True, axis=1)

    print("value_count, select row, avarge < 27")
    display_side(df['c0'].value_counts(), df[df.index == 'r0'], df[df.average < 27])

    print("nunique \t drop duplicate in c0")
    display_side(df.drop('list', axis=1).nunique(), df.drop_duplicates(subset=['c0']))

    print("na")
    df = pd.DataFrame({'key': ['K0', None, 'K2'], 'A': ['A0', 'A1', None]}).set_index('key')
    display_side(df, df.loc[df.index.dropna()], df.loc[df.A.notnull()])
modify_df()

100%|██████████| 4/4 [00:00<00:00, 2160.90it/s]


Unnamed: 0,c0,c1,c2,c3
r0,81,16,81,9
r1,36,16,4,49
r2,64,1,1,1

Unnamed: 0,r0
count,4.0
mean,46.75
std,39.651608
min,9.0
25%,14.25
50%,48.5
75%,81.0
max,81.0

Unnamed: 0,c0
r0,81
r1,36
r2,64


value_count, select row, avarge < 27


Unnamed: 0,c0
64,1
81,1
36,1

Unnamed: 0,c0,c1,c2,c3,list,average
r0,81,16,81,9,"[81, 16, 81, 9]",46.75

Unnamed: 0,c0,c1,c2,c3,list,average
r1,36,16,4,49,"[36, 16, 4, 49]",26.25
r2,64,1,1,1,"[64, 1, 1, 1]",16.75


nunique 	 drop duplicate in c0


Unnamed: 0,0
c0,3
c1,2
c2,3
c3,3
average,3

Unnamed: 0,c0,c1,c2,c3,list,average
r0,81,16,81,9,"[81, 16, 81, 9]",46.75
r1,36,16,4,49,"[36, 16, 4, 49]",26.25
r2,64,1,1,1,"[64, 1, 1, 1]",16.75


na


Unnamed: 0_level_0,A
key,Unnamed: 1_level_1
K0,A0
,A1
K2,

Unnamed: 0_level_0,A
key,Unnamed: 1_level_1
K0,A0
K2,

Unnamed: 0_level_0,A
key,Unnamed: 1_level_1
K0,A0
,A1


## Iterate

In [61]:
def save_load():
    rank_df = pd.DataFrame([['tom', 8], ['nick', 7], ['juli', 6]] , columns=['name', 'sc'], index =['1s', '2n', '3d']) 
    display(rank_df)
    
    rank_df.to_pickle('data/rank.pkl')
    rank_df.to_csv('data/rank.csv', header=False, index=False)
    display_side(pd.read_pickle(open('data/rank.pkl', 'rb')), pd.read_csv(open('data/rank.csv', 'rb'), header=None, names=["name", "score"]))
save_load()

Unnamed: 0,name,sc
1s,tom,8
2n,nick,7
3d,juli,6


Unnamed: 0,name,sc
1s,tom,8
2n,nick,7
3d,juli,6

Unnamed: 0,name,score
0,tom,8
1,nick,7
2,juli,6


## duplicate, NaN

In [62]:
def join():
    df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
    other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'B': ['B0', 'B1', 'B2']})
    display_side(df, other, df.set_index('key').join(other.set_index('key')))
join()

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


## Memory

In [70]:
display(df.info(verbose=False, memory_usage="deep"))
# display(df.profile_report().to_file(output_file="abc_pandas_profiling.html"))

for dtype in ['float64','int64', 'object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    display(selected_dtype)
    dtype_usage = selected_dtype.memory_usage(deep=True).sum()
    pprint(f"\nAverage memory usage for {dtype} columns: \t {dtype_usage/1000**2:.2f} MB")

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, r 0 to r 2
Columns: 6 entries, c 0 to average
dtypes: float64(1), int64(4), object(1)
memory usage: 540.0 bytes


None

Unnamed: 0,average
r 0,8.5
r 1,40.75
r 2,36.75



Average memory usage for float64 columns:                        0.00 MB


Unnamed: 0,c 0,c 1,c 2,c 3
r 0,1,1,16,16
r 1,1,81,0,81
r 2,1,64,81,1



Average memory usage for int64 columns:                          0.00 MB


Unnamed: 0,list
r 0,"[1, 1, 16, 16]"
r 1,"[1, 81, 0, 81]"
r 2,"[1, 64, 81, 1]"



Average memory usage for object columns:                         0.00 MB
