### pandas.DataFrame
class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)

![image.png](attachment:image.png)


In [2]:
import pandas as pd

In [2]:
# Column-oriented input: every key is a column label, every list holds that column's values.
a = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(a)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [3]:
# No dtype was requested when df was built, so each column reports the type pandas inferred.
df.dtypes

col1    int64
col2    int64
dtype: object

### To enforce a single dtype:

In [6]:
import numpy as np
# Passing dtype forces every column to that single type instead of the inferred one.
df = pd.DataFrame(a, dtype=np.int8)
df.dtypes

col1    int8
col2    int8
dtype: object

### Constructing DataFrame from a dictionary including Series:

In [9]:
# col2 is a Series labelled only at 2 and 3; aligned against index 0-3 it leaves rows 0 and 1 as NaN.
d = dict(col1=[0, 1, 2, 3], col2=pd.Series([2, 3], index=[2, 3]))
pd.DataFrame(d, index=[0, 1, 2, 3])

Unnamed: 0,col1,col2
0,0,
1,1,
2,2,2.0
3,3,3.0


### Constructing DataFrame from numpy ndarray:

In [12]:
# A plain 2-D ndarray carries no labels, so the column names are supplied separately.
df2 = pd.DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('abc'))
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### Constructing DataFrame from a numpy ndarray that has labeled columns:

In [13]:
# Structured array: the field names declared in the dtype act as column labels.
data = np.array(
    [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    dtype=[(field, "i4") for field in "abc"],
)
# Passing columns picks out (and reorders) fields of the structured array.
df3 = pd.DataFrame(data, columns=list('ca'))

df3

Unnamed: 0,c,a
0,3,1
1,6,4
2,9,7


### Constructing DataFrame from dataclass:

In [14]:
from dataclasses import make_dataclass
# Point is generated at runtime with two int fields; the field names become the column labels.
Point = make_dataclass("Point", [("x", int), ("y", int)])
pd.DataFrame([Point(x, y) for x, y in [(0, 0), (0, 3), (2, 3)]])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [15]:
# One single-row column per kind of value, to show the dtype pandas assigns to each.
df = pd.DataFrame(
    {
        'float': [1.0],
        'int': [1],
        'datetime': [pd.Timestamp('20180310')],
        'string': ['foo'],
    }
)
df.dtypes

float              float64
int                  int64
datetime    datetime64[ns]
string              object
dtype: object

In [16]:
# Three equally long value lists, one per column of the frame inspected with info() below.
int_values = list(range(1, 6))
text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
float_values = [step / 4 for step in range(5)]
df = pd.DataFrame(
    dict(int_col=int_values, text_col=text_values, float_col=float_values)
)
df

Unnamed: 0,int_col,text_col,float_col
0,1,alpha,0.0
1,2,beta,0.25
2,3,gamma,0.5
3,4,delta,0.75
4,5,epsilon,1.0


### Prints information of all columns:

In [18]:
# verbose=True prints the full summary: one line per column with its non-null count and dtype.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   int_col    5 non-null      int64  
 1   text_col   5 non-null      object 
 2   float_col  5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


## Prints a summary of the column count and dtypes, without per-column information:

In [19]:
# verbose=False collapses the per-column table into a single "Columns: ..." line.
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_col
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


## pandas.DataFrame.select_dtypes

In [20]:
# Six rows with one integer, one boolean and one float column, used by the select_dtypes examples.
df = pd.DataFrame({
    'a': [1, 2, 1, 2, 1, 2],
    'b': [True, False, True, False, True, False],
    'c': [1.0, 2.0, 1.0, 2.0, 1.0, 2.0],
})


In [21]:
# Keep only the boolean column(s).
df.select_dtypes(include='bool')

Unnamed: 0,b
0,True
1,False
2,True
3,False
4,True
5,False


In [22]:
# include also accepts a list of dtypes; here only float64 columns are kept.
df.select_dtypes(include=['float64'])

Unnamed: 0,c
0,1.0
1,2.0
2,1.0
3,2.0
4,1.0
5,2.0


In [23]:
# exclude is the inverse filter: every column except the int64 ones is returned.
df.select_dtypes(exclude=['int64'])

Unnamed: 0,b,c
0,True,1.0
1,False,2.0
2,True,1.0
3,False,2.0
4,True,1.0
5,False,2.0


## pandas.DataFrame.values

In [24]:
# Two rows of all-integer measurements.
df = pd.DataFrame(dict(age=[3, 29], height=[94, 170], weight=[31, 115]))
df

Unnamed: 0,age,height,weight
0,3,94,31
1,29,170,115


In [25]:
# Row tuples with mixed content: 'rank' holds a string, an int and None, so it cannot be a numeric column.
df2 = pd.DataFrame(
    [
        ('parrot', 24.0, 'second'),
        ('lion', 80.5, 1),
        ('monkey', np.nan, None),
    ],
    columns=['name', 'max_speed', 'rank'],
)
df2.dtypes

name          object
max_speed    float64
rank          object
dtype: object

## pandas.DataFrame.axes

In [26]:
# axes returns a two-element list: the row index first, then the column index.
df = pd.DataFrame(dict(col1=[1, 2], col2=[3, 4]))
df.axes

[RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')]

### pandas.DataFrame.ndim (number of array dimensions)

In [27]:
# A Series is one-dimensional, so ndim is 1.
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s.ndim

1

In [28]:
# A DataFrame has rows and columns, so ndim is 2.
df = pd.DataFrame(dict(zip(['col1', 'col2'], [[1, 2], [3, 4]])))
df.ndim

2

## pandas.DataFrame.size (number of elements in the array)

In [29]:
# size counts every cell: 2 rows x 2 columns = 4.
df = pd.DataFrame([[1, 3], [2, 4]], columns=['col1', 'col2'])
df.size

4

## pandas.DataFrame.shape

In [31]:
# shape is a (number of rows, number of columns) tuple.
df.shape

(2, 2)

## pandas.DataFrame.memory_usage

In [33]:
# The frame left in `df` by the previous cells only has col1/col2, so it cannot show
# how memory differs by dtype. Build a dedicated frame with one 5000-row column per
# dtype (named after the dtype) under its own name, so no earlier variable is clobbered.
mem_dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
df_mem = pd.DataFrame(
    {dtype_name: np.ones(shape=5000, dtype=int).astype(dtype_name) for dtype_name in mem_dtypes}
)
# Bytes used by the index and by each column (object columns count only the pointers
# unless deep=True is passed).
df_mem.memory_usage()

Index           128
int64         40000
float64       40000
complex128    80000
object        40000
bool           5000
dtype: int64

In [34]:
 mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
df = pd.DataFrame(mydict)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [35]:
# Selecting a single row by integer position yields a Series, not a DataFrame.
type(df.iloc[0])

pandas.core.series.Series

In [41]:
# With a scalar integer: the row at position 0, returned as a Series.
df.iloc[0]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [37]:
# With a list of integers: the listed rows, returned as a DataFrame.
df.iloc[[0, 1]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400


In [40]:
# With a slice object: rows at positions 0, 1 and 2 (the stop position 3 is excluded).
df.iloc[:3]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [42]:
# The x passed to the lambda is the DataFrame being sliced. This selects the rows whose index label is even.
df.iloc[lambda x: x.index % 2 == 0]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


In [43]:
# String row labels so the .loc examples below can select by name rather than by position.
df = pd.DataFrame(
    {'max_speed': [1, 4, 7], 'shield': [2, 5, 8]},
    index=['cobra', 'viper', 'sidewinder'],
)
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [44]:
# Single label. Note this returns the row as a Series.
df.loc['viper']

max_speed    4
shield       5
Name: viper, dtype: int64

In [45]:
# List of labels. Note that indexing with a list ([[...]]) returns a DataFrame.
df.loc[['viper', 'sidewinder']]

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [47]:
# Conditional selection: rows where shield > 6, restricted to the max_speed column.
df.loc[df['shield'] > 6, ['max_speed']]

Unnamed: 0,max_speed
sidewinder,7


In [48]:
# Setting values through .loc (this modifies df in place).

# Assign 50 to the 'shield' column for every row named in the label list.
df.loc[['viper', 'sidewinder'], ['shield']] = 50
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,50
sidewinder,7,50


In [49]:
# get() with a list of column labels returns those columns; both columns are requested here.
df.get(["max_speed","shield"])

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,50
sidewinder,7,50


* The aggregation operations are always performed over an axis, either the index (default) or the column axis.
* This behavior is different from numpy aggregation functions (mean, median, prod, sum, std, var), where the default is to compute the aggregation of the flattened array.

In [50]:
# Three numeric rows followed by an all-NaN row; the NaN row forces every column to float.
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan] * 3],
    columns=list('ABC'),
)

In [51]:
# Apply several aggregations down each column; the result has one row per function.
df.agg(['sum', 'min'])

Unnamed: 0,A,B,C
sum,12.0,15.0,18.0
min,1.0,2.0,3.0


In [52]:
# Different aggregations per column; (function, column) pairs that were not requested show as NaN.
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})

Unnamed: 0,A,B
sum,12.0,
min,1.0,2.0
max,,8.0


In [53]:
# Aggregate different functions over the columns and rename the index of the resulting DataFrame.
# Each keyword names an output row and maps to a (column, function) pair.
# The functions are given by name: passing the builtin `max` or `np.mean` here raises a
# FutureWarning in pandas >= 2.1, which advises using the string instead.
df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))

Unnamed: 0,A,B,C
x,7.0,,
y,,2.0,
z,,,6.0


### pandas.DataFrame.transform

In [54]:
# Two small integer columns used as input for the transform examples.
df = pd.DataFrame({'A': [0, 1, 2], 'B': [1, 2, 3]})
df

Unnamed: 0,A,B
0,0,1
1,1,2
2,2,3


In [55]:
# transform applies the function element-wise here and returns a frame of the same shape as df.
df.transform(lambda x: x + 1)

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4


In [56]:
# A list of functions yields two-level columns: (original column, function name).
df.transform([np.sqrt, np.exp])

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,0.0,1.0,1.0,2.718282
1,1.0,2.718282,1.414214,7.389056
2,1.414214,7.389056,1.732051,20.085537


### pandas.DataFrame.groupby
* A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups.

In [57]:
# Two observations per animal, each with its own top speed.
df = pd.DataFrame({
    'Animal': ['Falcon'] * 2 + ['Parrot'] * 2,
    'Max Speed': [380., 370., 24., 26.],
})
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [61]:
# Group rows by their Max Speed value and sum the remaining column within each group.
# Every speed is unique in this frame, so each group holds exactly one row.
df.groupby(by=["Max Speed"]).sum()

Unnamed: 0_level_0,Animal
Max Speed,Unnamed: 1_level_1
24.0,Parrot
26.0,Parrot
370.0,Falcon
380.0,Falcon


In [62]:
# dropna=False asks groupby to keep rows whose group key is NaN.
# NOTE(review): Max Speed contains no NaN in this frame, so the result is the same as the previous cell.
df.groupby(by=["Max Speed"], dropna=False).sum()

Unnamed: 0_level_0,Animal
Max Speed,Unnamed: 1_level_1
24.0,Parrot
26.0,Parrot
370.0,Falcon
380.0,Falcon


## pandas.DataFrame.abs
* This function only applies to elements that are all numeric.

In [63]:
# abs() returns the element-wise magnitude: negatives flip sign, positives are unchanged.
s = pd.Series([-1.10, 2.0, -3.33, 4.0])
s.abs()

0    1.10
1    2.00
2    3.33
3    4.00
dtype: float64

### pandas.DataFrame.describe
* Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values

### Describing a numeric Series

In [64]:
# For numeric data describe() reports count, mean, std, min, quartiles and max.
s = pd.Series(range(1, 4))
s.describe()

count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
dtype: float64

## Describing a categorical Series.

In [65]:
# For string data describe() reports count, number of unique values, the most common value and its frequency.
s = pd.Series(list('aabc'))
s.describe()

count     4
unique    3
top       a
freq      2
dtype: object

### Describing a DataFrame. By default only numeric fields are returned.

In [66]:
# One categorical, one numeric and one plain-string column; describe() without arguments keeps only the numeric one.
df = pd.DataFrame(
    {
        'categorical': pd.Categorical(list('def')),
        'numeric': [1, 2, 3],
        'object': list('abc'),
    }
)
df.describe()

Unnamed: 0,numeric
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


### Describing all columns of a DataFrame regardless of data type.

In [67]:
# include='all' summarises every column; statistics that do not apply to a column's type are NaN.
df.describe(include='all') 

Unnamed: 0,categorical,numeric,object
count,3,3.0,3
unique,3,,3
top,d,,b
freq,1,,1
mean,,2.0,
std,,1.0,
min,,1.0,
25%,,1.5,
50%,,2.0,
75%,,2.5,


# pandas.DataFrame.drop
####  Drop specified labels from rows or columns.


In [68]:
# The integers 0-11 laid out as 3 rows x 4 columns labelled A-D.
df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [69]:
# Drop columns by label: axis=1 tells drop() the labels refer to columns. Returns a new frame.
df.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [70]:
# Equivalent spelling of the previous cell using the columns= keyword instead of axis=1.
df.drop(columns=['B', 'C'])

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


## pandas.DataFrame.drop_duplicates

#### Returns a DataFrame with duplicate rows removed, or None if inplace=True.

In [71]:
# Rows 0 and 1 are exact duplicates; rows 3 and 4 share brand and style but differ in rating.
df = pd.DataFrame({
    'brand': ['Yum Yum'] * 2 + ['Indomie'] * 3,
    'style': ['cup'] * 3 + ['pack'] * 2,
    'rating': [4, 4, 3.5, 15, 5],
})
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [72]:
# With no arguments a row is a duplicate only if every column matches; the first occurrence is kept.
df.drop_duplicates()

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [73]:
# To remove duplicates on specific column(s), use subset: only 'brand' is compared here,
# so one row (the first) survives per brand.
df.drop_duplicates(subset=['brand'])


Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5


In [74]:
# To remove duplicates and keep the last occurrence of each (brand, style) pair, use keep='last'.
df.drop_duplicates(subset=['brand', 'style'], keep='last')

Unnamed: 0,brand,style,rating
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
4,Indomie,pack,5.0


# pandas.DataFrame.filter
* This routine does not filter a DataFrame on its contents. The filter is applied to the labels of the index.

In [75]:
# filter(items=...) keeps the listed labels; with no axis given it is applied to the columns here.
df = pd.DataFrame(
    np.array([[1, 2, 3], [4, 5, 6]]),
    index=['mouse', 'rabbit'],
    columns=['one', 'two', 'three'],
)
df.filter(items=['one', 'three'])

Unnamed: 0,one,three
mouse,1,3
rabbit,4,6


In [76]:
# Select columns by regular expression: 'e$' matches the labels that end in "e" (one, three).
df.filter(regex='e$', axis=1)

Unnamed: 0,one,three
mouse,1,3
rabbit,4,6


# pandas.DataFrame.update and pandas.DataFrame.rename
![image.png](attachment:image.png)

In [3]:
# Inputs for the DataFrame.update call in the next cell. Note df2's single column "c" does not exist in df1.
df1 = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
df2 = pd.DataFrame(dict(c=[17, 18, 22]))

In [7]:
 df1.update(df2)


In [8]:
# df1 is the frame to patch; df2 supplies replacement values for the cells it overlaps (column 0, rows 0 and 1).
df1 = pd.DataFrame(
    [["Emil", "Tobias", "Linus"],
     [16, 14, 10]]
)
df2 = pd.DataFrame(
    [["Emil"],
     [17]]
)

# update() works in place, so the cell displays nothing.
df1.update(df2)

In [9]:
df1 = pd.DataFrame([["Emil", "Tobias", "Linus"], [16, 14, 10]])
df2 = pd.DataFrame([["Emil"], [17]])

# update() patches df1 in place with df2's values where the labels overlap:
# row 1 of column 0 changes from 16 to 17.
df1.update(df2)

# End the cell with the frame itself so Jupyter renders it as a table; print() only gives plain text.
df1

      0       1      2
0  Emil  Tobias  Linus
1    17      14     10


In [77]:
# rename() returns a relabelled copy; the mapping sends each old column label to its new one.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
df.rename(columns=dict(A="a", B="c"))

Unnamed: 0,a,c
0,1,4
1,2,5
2,3,6


In [78]:
# Using axis-style parameters: the function (str.lower) is applied to every column label.
df.rename(str.lower, axis='columns')

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [79]:
# Relabel rows: index label 1 becomes 2 and label 2 becomes 4; label 0 is not in the mapping and is kept.
df.rename({1: 2, 2: 4}, axis='index')

Unnamed: 0,A,B
0,1,4
2,2,5
4,3,6


# pandas.DataFrame.from_dict
#### Creates DataFrame object from dictionary by columns or by index allowing dtype specification.
#### By default the keys of the dict become the DataFrame columns:

In [80]:
# Default orientation: each dict key becomes a column.
data = dict(col_1=[3, 2, 1, 0], col_2=list('abcd'))
pd.DataFrame.from_dict(data)

Unnamed: 0,col_1,col_2
0,3,a
1,2,b
2,1,c
3,0,d


#### Specify orient='index' to create the DataFrame using dictionary keys as rows:

In [81]:
# orient='index' turns each dict key into a row label; the columns get default integer labels.
pd.DataFrame.from_dict(data, orient='index')

Unnamed: 0,0,1,2,3
col_1,3,2,1,0
col_2,a,b,c,d


## pandas.DataFrame.append
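* Append rows of other to the end of caller, returning a new object. Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; use `pandas.concat` instead.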

In [82]:
# Base frame for the row-appending example: two rows labelled x and y.
df = pd.DataFrame(
    [[1, 2], [3, 4]],
    index=['x', 'y'],
    columns=['A', 'B'],
)
df

Unnamed: 0,A,B
x,1,2
y,3,4


In [83]:
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'), index=['x', 'y'])
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows of both frames
# the same way and keeps the original index labels (so x and y each appear twice).
pd.concat([df, df2])


Unnamed: 0,A,B
x,1,2
y,3,4
x,5,6
y,7,8


## pandas.DataFrame.join
Join columns with other DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list.

In [84]:
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})

In [85]:
# Right-hand frame: only the first three keys have a B value.
other = pd.DataFrame({
    'key': [f'K{i}' for i in range(3)],
    'B': [f'B{i}' for i in range(3)],
})

In [86]:
# Join DataFrames using their indexes. Both frames have a 'key' column, so the suffixes
# are needed to tell the two apart in the result.
df.join(other, lsuffix='_caller', rsuffix='_other')

Unnamed: 0,key_caller,A,key_other,B
0,K0,A0,K0,B0
1,K1,A1,K1,B1
2,K2,A2,K2,B2
3,K3,A3,,
4,K4,A4,,
5,K5,A5,,


In [88]:
# If we want to join using the key columns, we need to set key to be the index in both df and other.
# The joined DataFrame will have key as its index.
df.set_index('key').join(other.set_index('key'))

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


# pandas.DataFrame.merge
The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes will be ignored. 

In [89]:
# Two frames whose key columns have different names (lkey / rkey) but share a 'value' column name.
df1 = pd.DataFrame(dict(lkey=['foo', 'bar', 'baz', 'foo'], value=[1, 2, 3, 5]))
df2 = pd.DataFrame(dict(rkey=['foo', 'bar', 'baz', 'foo'], value=[5, 6, 7, 8]))

##### Merge df1 and df2 on the lkey and rkey columns. The value columns have the default suffixes, _x and _y, appended.

In [90]:
# 'foo' occurs twice on each side, so it contributes 2 x 2 = 4 rows to the result.
df1.merge(df2, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,value_x,rkey,value_y
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,5,foo,5
3,foo,5,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


#### Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns.

In [91]:
# Same merge as above, but the overlapping 'value' columns get _left/_right instead of _x/_y.
df1.merge(df2, left_on='lkey', right_on='rkey',
          suffixes=('_left', '_right'))

Unnamed: 0,lkey,value_left,rkey,value_right
0,foo,1,foo,5
1,foo,1,foo,8
2,foo,5,foo,5
3,foo,5,foo,8
4,bar,2,bar,6
5,baz,3,baz,7


In [92]:
# The frames share the key column 'a' but only the value 'foo' appears in both.
df1 = pd.DataFrame(dict(a=['foo', 'bar'], b=[1, 2]))
df2 = pd.DataFrame(dict(a=['foo', 'baz'], c=[3, 4]))
df1

Unnamed: 0,a,b
0,foo,1
1,bar,2


In [93]:
# Inner join: only keys present in both frames survive ('foo').
df1.merge(df2, how='inner', on='a')

Unnamed: 0,a,b,c
0,foo,1,3


In [94]:
# Left join: every row of df1 is kept; 'bar' has no match in df2, so its c value is NaN.
df1.merge(df2, how='left', on='a')

Unnamed: 0,a,b,c
0,foo,1,3.0
1,bar,2,


In [95]:
# Cross join: every row of df1 is paired with every row of df2 (2 x 2 = 4 rows); no key is used.
df1.merge(df2, how='cross')

Unnamed: 0,a_x,b,a_y,c
0,foo,1,foo,3
1,foo,1,baz,4
2,bar,2,foo,3
3,bar,2,baz,4


### Sorting & Ranking in Pandas