In [1]:
## We will continue using the same DataFrames as in the previous tutorial. 
## Therefore you can continue in the same Notebook. 
## If you decide to create a new one, don't forget to import the packages and create the same df and df2.

In [6]:
import pandas as pd
import numpy as np

In [8]:
 dates = pd.date_range('20130101', periods=6)

In [9]:
 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [10]:
# Four-row frame built from a mix of scalars (broadcast to every row) and
# array-likes; the row index comes from the Series in column 'C'.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [11]:
## Select a single column.

df['A']


2013-01-01    1.158399
2013-01-02    1.799858
2013-01-03    0.100585
2013-01-04    1.775564
2013-01-05   -0.563927
2013-01-06    1.610593
Freq: D, Name: A, dtype: float64

In [12]:
## And this is equivalent to:

df.A
## The first of the above two options is recommended because it avoids possible conflicts with any of the DataFrame methods.

2013-01-01    1.158399
2013-01-02    1.799858
2013-01-03    0.100585
2013-01-04    1.775564
2013-01-05   -0.563927
2013-01-06    1.610593
Freq: D, Name: A, dtype: float64

In [13]:
## Selecting via [], which slices the rows.

In [14]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.158399,-1.671541,1.211521,-0.641387
2013-01-02,1.799858,-0.833345,-0.430721,0.37968
2013-01-03,0.100585,-0.549307,-1.066296,0.826547


In [15]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.799858,-0.833345,-0.430721,0.37968
2013-01-03,0.100585,-0.549307,-1.066296,0.826547
2013-01-04,1.775564,0.750531,-0.511792,0.96296


In [16]:
### Selection by Label

## Select the first row based on its index value.

In [17]:
df.loc["2013-01-01"]

A    1.158399
B   -1.671541
C    1.211521
D   -0.641387
Name: 2013-01-01 00:00:00, dtype: float64

In [18]:
## Select more than one column by their column names.

In [19]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.158399,-1.671541
2013-01-02,1.799858,-0.833345
2013-01-03,0.100585,-0.549307
2013-01-04,1.775564,0.750531
2013-01-05,-0.563927,-0.747463
2013-01-06,1.610593,-1.757082


In [20]:
## The : means that we take all the rows, and the list ['A', 'B'] names the columns we want to select.

In [21]:
## We can also use label slicing and include both endpoints:

df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,1.799858,-0.833345
2013-01-03,0.100585,-0.549307
2013-01-04,1.775564,0.750531


In [22]:
## The command above returns a DataFrame, while the one below returns just a scalar value, which is simply a single number.

In [23]:
df.loc[dates[0], 'A']

1.1583992878028146

In [24]:
### Selection by Position

## We can also select based on the actual position in DataFrame

In [25]:
df.iloc[3]

A    1.775564
B    0.750531
C   -0.511792
D    0.962960
Name: 2013-01-04 00:00:00, dtype: float64

In [26]:
## We can use slicing as well. This approach is similar to NumPy/Python style.

In [27]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.775564,0.750531
2013-01-05,-0.563927,-0.747463


In [28]:
## If we want to take all columns, we use : in the column position.

In [29]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,1.799858,-0.833345,-0.430721,0.37968
2013-01-03,0.100585,-0.549307,-1.066296,0.826547


In [30]:
## Similarly, we can put : in the row position to take all the rows, e.g. df.iloc[:, 1:3].

In [31]:
### Selection by dtype


## The select_dtypes() method implements subsetting of columns based on their dtype. 
## By subsetting, we mean taking only the selection of columns based on their dtype.

In [32]:
 df = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True],
                       'bool2': [False, True, False],
                       'dates': pd.date_range('now', periods=3),
                       'category': pd.Series(list("ABC")).astype('category')})

In [33]:
## Select only bool columns from df above.

df.select_dtypes(include=[bool])


Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [34]:
### Boolean Indexing

## In this section, we will use columns' values to filter data.

In [35]:
## Take the rows where column float64 is greater than or equal to 5.

## Before you look at the command below, use google to find the command. You will see how easy it is to search for stuff to do in Pandas.

In [36]:
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2022-11-29 23:46:18.063260,B
2,c,3,5,6.0,True,False,2022-11-30 23:46:18.063260,C


In [37]:
## We can also use function isin() for filtering.

In [38]:
# 1) Create a copy of df and store it in variable df2.
# 2) Create a new column E in the DataFrame df2 with values ['one', 'two', 'three'].

In [39]:
df2 = df.copy()

In [40]:
df2['E'] = ['one', 'two', 'three']

In [41]:
## Now we can use the function isin() to take only the rows where E is 'one' or 'two'.

df2[df2['E'].isin(['one','two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2022-11-28 23:46:18.063260,A,one
1,b,2,4,5.0,False,True,2022-11-29 23:46:18.063260,B,two


In [42]:
## We can also set values in the DataFrame.

## Setting values by position:

In [43]:
df.iat[0, 1] = -1

In [44]:
## or 
df.iloc[0,1] = 2

In [45]:
## Setting values by label:

df.at[0, 'float64'] = -10


In [46]:
## or

df.loc[0, 'float64'] = -20

In [48]:
## Setting by assigning with a NumPy array:

df.loc[:, 'uint8'] = np.array([50] * len(df))

  df.loc[:, 'uint8'] = np.array([50] * len(df))


In [49]:
## The length of the array on the right side of = needs to be the same as the length of the object on the left.