# Pandas Methods / Functions

[Cookbook](https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html)

DataFrame()

DataFrame.drop()

DataFrame.drop_duplicates()

DataFrame.groupby()

DataFrame.head()

DataFrame.idxmax()

DataFrame.idxmin()

DataFrame.info()

DataFrame.iloc[]

DataFrame.isnull()

DataFrame.loc[]

DataFrame.mean()

DataFrame.pivot_table()

DataFrame.replace()

DataFrame.shift()

DataFrame.sort_values()

DataFrame.sub()

DataFrame.sum()

DataFrame.value_counts()

### Pandas Series Methods

Series.between()

Series.cumsum()

Series.isnull()

Series.map()

Series.nlargest()

Series.shift()

### Pandas Namespace Functions

pd.DataFrame()

pd.cut()

pd.show_versions()

### Pandas Index & MultiIndex Features

.index

.index.tolist()

.idxmax()

.idxmin()

### Boolean / Selection Operations

(These aren’t functions, but they are core selection operations used throughout.)

Boolean filtering:

df[df['col'] > x]

df[(condition) & (condition)]

df[(condition) | (condition)]

Comparison operators: ==, !=, <, >, <=, >=

### NumPy Functions

np.nan

np.arange()

np.r_[]

np.random.random()

np.random.RandomState()

np.random.randint()

### Indexing & Utility

.tolist()

.nonzero()

np.searchsorted()

### Everything Used Inside GroupBy

.groupby(...).sum()

.groupby(...).mean()

.groupby(...).cumcount()

.groupby(...).cumsum()

.groupby(...).nlargest()

### Special Pandas Operations

unstack()

sort_values()

drop_duplicates(keep=False)

duplicated()

cumsum(axis=1)

pd.DataFrame()

In [2]:
import pandas as pd
import numpy as np

Constructing DataFrame from a dictionary including Series:

In [12]:
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
pd.DataFrame(data=d, index=[0, 1, 2, 3])

Unnamed: 0,col1,col2
0,0,
1,1,
2,2,2.0
3,3,3.0


In [3]:
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


Constructing DataFrame from dataclass:


In [4]:
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


Constructing DataFrame from Series/DataFrame:

In [48]:
# ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
# df = pd.DataFrame(data=ser, index=["a", "c"])
# df

df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
df2 = pd.DataFrame(df1, index=["a", "c"])
df2

Unnamed: 0,x
a,1
c,3


DataFrame.index

In [35]:
df = pd.DataFrame({'Name': ['Hassam', 'Ahmad', 'Venomous'],
                   'Age': [24, 24, 24],
                   'Location': ['Lahore', 'New York', 'Kona']},
                  index=([10, 20, 30]))
df.index = [123, 456, 789]
df

Unnamed: 0,Name,Age,Location
123,Hassam,24,Lahore
456,Ahmad,24,New York
789,Venomous,24,Kona


In [49]:
df2.reindex(['a', 'c', 'g', 'h'])

Unnamed: 0,x
a,1.0
c,3.0
g,
h,


DataFrame.columns

In [52]:
print(df.columns)
print(df2.columns)

Index(['Name', 'Age', 'Location'], dtype='object')
Index(['x'], dtype='object')


DataFrame.dtypes

In [6]:
df = pd.DataFrame({'float': [1.0],
                   'int': [1],
                   'datetime': [pd.Timestamp('20180310')],
                   'string': ['Helllloooooooooooo']})

df

Unnamed: 0,float,int,datetime,string
0,1.0,1,2018-03-10,Helllloooooooooooo


DataFrame.select_dtypes()

In [8]:
df.select_dtypes(exclude='int')

Unnamed: 0,float,datetime,string
0,1.0,2018-03-10,Helllloooooooooooo


In [11]:
df.to_numpy #df.columns #df.values

<bound method DataFrame.to_numpy of    float  int   datetime              string
0    1.0    1 2018-03-10  Helllloooooooooooo>

DataFrame.axes

In [12]:
df.axes

[RangeIndex(start=0, stop=1, step=1),
 Index(['float', 'int', 'datetime', 'string'], dtype='object')]

DataFrame.ndim

In [13]:
df.ndim

2

DataFrame.size

Return an int representing the number of elements in this object.

In [14]:
df.size

4

DataFrame.shape

In [15]:
df.shape

(1, 4)

DataFrame.memory_usage(index=True, deep=False)

Return the memory usage of each column in bytes.

In [19]:
df.memory_usage(index=True, deep=True)

Index       132
float         8
int           8
datetime      8
string       67
dtype: int64

#### DataFrame.drop()

drop columns by name:	        df[df.columns.difference([...])]

drop columns by position:	    df.drop(columns=df.columns[[1, 3]])

drop rows by condition:	        df[df['A'] > 0]   (keeps only the rows where A > 0, i.e. drops rows where A <= 0)

drop rows by index:	            df.loc[~df.index.isin([1,2])]

drop NA values:	                df.dropna()

In [15]:
df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
df.drop(columns=['D'], axis=1)

Unnamed: 0,A,B,C
0,0,1,2
1,4,5,6
2,8,9,10


DataFrame.drop_duplicates()

Alternatives:

df.groupby(["A","B"]).first()   or   df.groupby(["A","B"]).last()

np.unique(df.values, axis=0)

In [62]:
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'glass', 'cup', 'pack', 'pack'],
    'rating': [4, 4.5, 3.5, 15, 5]
})

# df.drop_duplicates(subset=['A'], keep='first' | 'last' | False, inplace=True, ignore_index=False)
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,glass,4.5
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [27]:
df.drop_duplicates(subset=["style"], keep = 'first')


Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,glass,4.5
3,Indomie,pack,15.0


#### DataFrame.groupby()

groupby() is an operation that:

(1) Splits: Break the data into groups based on one or more keys.

(2) Applies: Perform an operation on each group (sum, mean, custom function, etc.).

(3) Combines: Merge the results back into a new object.

DataFrame.groupby(by=None, axis=<no_default>, level=None, as_index=True, sort=True, group_keys=True, observed=<no_default>, dropna=True)

df.groupby("col").sum()

df.groupby("col").mean()

df.groupby("col").max()

df.groupby("col").min()

df.groupby("col").count()

df.groupby("col").agg(['mean','max','min'])

df.groupby("col").apply(custom_func)

df.groupby("col").transform(custom_func)

df.groupby("col").filter(custom_func)

In [61]:
df = pd.DataFrame({
    "team": ["A","A","B","B","B","C"],
    "points": [5,7,3,4,10,8]
})

df.groupby("team")["points"].sum()

team
A    12
B    17
C     8
Name: points, dtype: int64

In [48]:
arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
          ['Captive', 'Wild', 'Captive', 'Wild']]
index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, index=index)
df.groupby(by='Type').mean()

Unnamed: 0_level_0,Max Speed
Type,Unnamed: 1_level_1
Captive,210.0
Wild,185.0


DataFrame.head()

In [55]:
df.head(-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,Type,Unnamed: 2_level_1
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0


DataFrame.tail()

In [59]:
df.tail(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,Type,Unnamed: 2_level_1
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0
Parrot,Wild,20.0


DataFrame.idxmax()

In [34]:
arr = np.random.RandomState(30).randint(1, 101, size=(8, 8))
df = pd.DataFrame(arr, columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,a,b,c,d,e,f,g,h
0,38,38,46,46,13,24,3,54
1,18,47,4,42,8,66,50,46
2,62,36,19,19,77,17,7,63
3,28,47,46,65,63,12,16,24
4,14,51,34,56,29,59,92,79
5,58,76,96,45,38,76,58,40
6,10,34,48,40,37,23,41,26
7,55,70,91,27,79,92,20,31


In [78]:
df.idxmax(axis=1, skipna=True)

0    h
1    f
2    e
3    d
4    g
5    c
6    c
7    f
dtype: object

DataFrame.idxmin()

In [77]:
df.idxmin(axis=1)

0    g
1    c
2    g
3    f
4    a
5    e
6    a
7    g
dtype: object

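Just a practice one: removing items from a list while iterating over it makes the loop skip elements (3, 5 and 7 are visited and removed; 4 and 6 are never visited).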

In [84]:
mylist = [3,4,5,6,7]
for i in mylist:
    mylist.remove(i)
    print(i)
print(mylist)

3
5
7
[4, 6]


DataFrame.loc[]

In [None]:
df.loc[[0,1,3,4], ['a','b']]
# df.loc[0]

Unnamed: 0,a,b
0,38,38
1,18,47
3,28,47
4,14,51


DataFrame.iloc[]

In [117]:
df.iloc[[0, 7]]

Unnamed: 0,a,b,c,d,e,f,g,h
0,38,38,46,46,13,24,3,54
7,55,70,91,27,79,92,20,31


In [None]:
df.iloc[[True, False, True, True, False, True, True, False]]

Unnamed: 0,a,b,c,d,e,f,g,h
0,38,38,46,46,13,24,3,54
2,62,36,19,19,77,17,7,63
3,28,47,46,65,63,12,16,24
5,58,76,96,45,38,76,58,40
6,10,34,48,40,37,23,41,26


DataFrame.isnull()

In [46]:
df = pd.DataFrame({
    "A": [1, None, None],
    "B": [np.nan, 2, 3],
})

df

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,,3.0


In [47]:
df.isnull()# df.notnull()

Unnamed: 0,A,B
0,False,True
1,True,False
2,True,False


DataFrame.isna()

In [48]:
df.isna() #df.notna

Unnamed: 0,A,B
0,False,True
1,True,False
2,True,False


In [73]:
df.isna().sum()

A    2
B    1
dtype: int64

np.nan, None, pd.NA, pd.NaT

In [32]:
# # a = np.nan
# # type(a)
# a = None
# type(a)

a = pd.DataFrame([2]) + pd.NA
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       0 non-null      object
dtypes: object(1)
memory usage: 140.0+ bytes


DataFrame.pivot()

DataFrame.pivot(index=None, columns=None, values=None)

In [76]:
df = pd.DataFrame({
       "lev1": [1, 1, 1, 2, 2, 2],
       "lev2": [1, 1, 2, 1, 1, 2],
       "lev3": [1, 2, 1, 2, 1, 2],
       "lev4": [1, 2, 3, 4, 5, 6],
       "values": [0, 1, 2, 3, 4, 5]})
df

Unnamed: 0,lev1,lev2,lev3,lev4,values
0,1,1,1,1,0
1,1,1,2,2,1
2,1,2,1,3,2
3,2,1,2,4,3
4,2,1,1,5,4
5,2,2,2,6,5


In [84]:
df.pivot(index='lev3', columns=['lev1', 'lev4'], values='values')


lev1,1,1,1,2,2,2
lev4,1,2,3,4,5,6
lev3,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.0,,2.0,,4.0,
2,,1.0,,3.0,,5.0
