# 10 minutes to pandas
- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [1]:
import pandas as pd
import numpy as np

## 1 Object creation

- Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

- Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [3]:
dates = pd.date_range('20250801', periods=6)
dates

DatetimeIndex(['2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04',
               '2025-08-05', '2025-08-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2025-08-01,-1.384407,-0.843299,-0.485552,0.388752
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-03,-1.371091,0.649518,0.1525,0.423183
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-06,0.696006,-0.677877,0.491656,1.038821


- Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [5]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20250801'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2025-08-01,1.0,3,test,foo
1,1.0,2025-08-01,1.0,3,train,foo
2,1.0,2025-08-01,1.0,3,test,foo
3,1.0,2025-08-01,1.0,3,train,foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
""" 
df2.  #<TAB>

df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.columns
df2.align              df2.copy
df2.all                df2.count
df2.any                df2.combine
df2.append             df2.D
df2.apply              df2.describe
df2.applymap           df2.diff
df2.B                  df2.duplicated
"""

' \ndf2.  #<TAB>\n\ndf2.A                  df2.bool\ndf2.abs                df2.boxplot\ndf2.add                df2.C\ndf2.add_prefix         df2.clip\ndf2.add_suffix         df2.columns\ndf2.align              df2.copy\ndf2.all                df2.count\ndf2.any                df2.combine\ndf2.append             df2.D\ndf2.apply              df2.describe\ndf2.applymap           df2.diff\ndf2.B                  df2.duplicated\n'

## 2  Viewing data

In [8]:
#view the top and bottom rows of the frame:

In [9]:
df.head()

Unnamed: 0,A,B,C,D
2025-08-01,-1.384407,-0.843299,-0.485552,0.388752
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-03,-1.371091,0.649518,0.1525,0.423183
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-06,0.696006,-0.677877,0.491656,1.038821


In [11]:
df.index

DatetimeIndex(['2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04',
               '2025-08-05', '2025-08-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

 - DataFrame.to_numpy() gives a NumPy representation of the underlying data.
 - NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column


In [13]:
df.to_numpy()

array([[-1.38440708, -0.84329884, -0.48555246,  0.38875151],
       [-0.09251419, -1.38972811,  0.25316021, -0.28574737],
       [-1.37109096,  0.64951843,  0.15249984,  0.4231828 ],
       [ 0.02325861, -0.5946625 , -0.18031704,  1.25383846],
       [-1.13930122,  0.29476263,  0.97444294, -1.15444186],
       [ 0.69600564, -0.67787741,  0.49165635,  1.03882132]])

In [14]:
df2.to_numpy()

array([[1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [15]:
#describe() shows a quick statistic summary of your data:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.544675,-0.426881,0.200982,0.277401
std,0.872695,0.757764,0.510789,0.887035
min,-1.384407,-1.389728,-0.485552,-1.154442
25%,-1.313144,-0.801943,-0.097113,-0.117123
50%,-0.615908,-0.63627,0.20283,0.405967
75%,-0.005685,0.072406,0.432032,0.884912
max,0.696006,0.649518,0.974443,1.253838


In [16]:
#Transposing your data:
df.T

Unnamed: 0,2025-08-01 00:00:00,2025-08-02 00:00:00,2025-08-03 00:00:00,2025-08-04 00:00:00,2025-08-05 00:00:00,2025-08-06 00:00:00
A,-1.384407,-0.092514,-1.371091,0.023259,-1.139301,0.696006
B,-0.843299,-1.389728,0.649518,-0.594663,0.294763,-0.677877
C,-0.485552,0.25316,0.1525,-0.180317,0.974443,0.491656
D,0.388752,-0.285747,0.423183,1.253838,-1.154442,1.038821


In [17]:
#Sorting by an axis:
df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D
2025-08-06,0.696006,-0.677877,0.491656,1.038821
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-03,-1.371091,0.649518,0.1525,0.423183
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-01,-1.384407,-0.843299,-0.485552,0.388752


In [18]:
#Sorting by an axis:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2025-08-01,0.388752,-0.485552,-0.843299,-1.384407
2025-08-02,-0.285747,0.25316,-1.389728,-0.092514
2025-08-03,0.423183,0.1525,0.649518,-1.371091
2025-08-04,1.253838,-0.180317,-0.594663,0.023259
2025-08-05,-1.154442,0.974443,0.294763,-1.139301
2025-08-06,1.038821,0.491656,-0.677877,0.696006


In [19]:
# Sorting by values:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-01,-1.384407,-0.843299,-0.485552,0.388752
2025-08-06,0.696006,-0.677877,0.491656,1.038821
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-03,-1.371091,0.649518,0.1525,0.423183


## Selection
- .at, .iat, .loc and .iloc.

### Getting

- Selecting a single column, which yields a Series, equivalent to df.A:

In [20]:
df.A

2025-08-01   -1.384407
2025-08-02   -0.092514
2025-08-03   -1.371091
2025-08-04    0.023259
2025-08-05   -1.139301
2025-08-06    0.696006
Freq: D, Name: A, dtype: float64

In [21]:
df['A']

2025-08-01   -1.384407
2025-08-02   -0.092514
2025-08-03   -1.371091
2025-08-04    0.023259
2025-08-05   -1.139301
2025-08-06    0.696006
Freq: D, Name: A, dtype: float64

- Selecting via [], which slices the rows.

In [22]:
df[0:3]

Unnamed: 0,A,B,C,D
2025-08-01,-1.384407,-0.843299,-0.485552,0.388752
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-03,-1.371091,0.649518,0.1525,0.423183


In [23]:
df['20250802':'20250805']

Unnamed: 0,A,B,C,D
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-03,-1.371091,0.649518,0.1525,0.423183
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442


### Selection by label

In [24]:
#For getting a cross section using a label:
df.loc[dates[0]]

A   -1.384407
B   -0.843299
C   -0.485552
D    0.388752
Name: 2025-08-01 00:00:00, dtype: float64

In [25]:
#Selecting on a multi-axis by label:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2025-08-01,-1.384407,-0.843299
2025-08-02,-0.092514,-1.389728
2025-08-03,-1.371091,0.649518
2025-08-04,0.023259,-0.594663
2025-08-05,-1.139301,0.294763
2025-08-06,0.696006,-0.677877


In [26]:
#Showing label slicing, both endpoints are included:
df.loc['20250802':'20250805', ['A', 'B']]

Unnamed: 0,A,B
2025-08-02,-0.092514,-1.389728
2025-08-03,-1.371091,0.649518
2025-08-04,0.023259,-0.594663
2025-08-05,-1.139301,0.294763


In [27]:
#Reduction in the dimensions of the returned object:
df.loc['20250802', ['A', 'B']]

A   -0.092514
B   -1.389728
Name: 2025-08-02 00:00:00, dtype: float64

In [28]:
#For getting a scalar value:
df.loc[dates[0], 'A']

-1.3844070779461424

In [29]:
#For getting fast access to a scalar (equivalent to the prior method):
df.at[dates[0], 'A']

-1.3844070779461424

### Selection by position
- Select via the position of the passed integers:

In [30]:
df.iloc[3]

A    0.023259
B   -0.594663
C   -0.180317
D    1.253838
Name: 2025-08-04 00:00:00, dtype: float64

In [31]:
#By integer slices, acting similar to numpy/python:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2025-08-02,-0.092514,0.25316
2025-08-03,-1.371091,0.1525
2025-08-05,-1.139301,0.974443


In [32]:
# slicing rows explicitly:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2025-08-02,-0.092514,-1.389728,0.25316,-0.285747
2025-08-03,-1.371091,0.649518,0.1525,0.423183


In [33]:
#For slicing columns explicitly:
df.iloc[1,1]

-1.389728109503204

In [34]:
#For getting fast access to a scalar (equivalent to the prior method):

df.iat[1,1]

-1.389728109503204

## Boolean indexing
- Using a single column’s values to select data.


In [35]:
df['A'] > 0

2025-08-01    False
2025-08-02    False
2025-08-03    False
2025-08-04     True
2025-08-05    False
2025-08-06     True
Freq: D, Name: A, dtype: bool

In [36]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-06,0.696006,-0.677877,0.491656,1.038821


In [37]:
 # Selecting values from a DataFrame where a boolean condition is met.

In [38]:
df[df > 0]

Unnamed: 0,A,B,C,D
2025-08-01,,,,0.388752
2025-08-02,,,0.25316,
2025-08-03,,0.649518,0.1525,0.423183
2025-08-04,0.023259,,,1.253838
2025-08-05,,0.294763,0.974443,
2025-08-06,0.696006,,0.491656,1.038821


In [39]:
#Using the isin() method for filtering:

In [48]:
df2 = df.copy
df2

<bound method NDFrame.copy of                    A         B         C         D
2025-08-01 -1.384407 -0.843299 -0.485552  0.388752
2025-08-02 -0.092514 -1.389728  0.253160 -0.285747
2025-08-03 -1.371091  0.649518  0.152500  0.423183
2025-08-04  0.023259 -0.594663 -0.180317  1.253838
2025-08-05 -1.139301  0.294763  0.974443 -1.154442
2025-08-06  0.696006 -0.677877  0.491656  1.038821>

In [49]:
#df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
#df2[df2['E'].isin(['two', 'four'])]

## Setting

- Setting a new column automatically aligns the data by the indexes.

In [50]:
 s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))

In [52]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [59]:
# Setting values by label:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D
2025-08-01,0.0,0.0,-0.485552,0.388752
2025-08-02,0.0,-1.389728,0.25316,-0.285747
2025-08-03,0.0,0.649518,0.1525,0.423183
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-06,0.696006,-0.677877,0.491656,1.038821


In [63]:
#Setting values by position:
df.iat[5, 0] = 0
df

Unnamed: 0,A,B,C,D
2025-08-01,0.0,0.0,-0.485552,0.388752
2025-08-02,0.0,-1.389728,0.25316,-0.285747
2025-08-03,0.0,0.649518,0.1525,0.423183
2025-08-04,0.023259,-0.594663,-0.180317,1.253838
2025-08-05,-1.139301,0.294763,0.974443,-1.154442
2025-08-06,0.0,0.0,0.491656,1.038821


In [65]:
# Setting by assigning with a NumPy array:
df.loc[:, 'D'] = np.array([5] * len(df))

In [68]:
# The result of the prior setting operations.
df

Unnamed: 0,A,B,C,D
2025-08-01,0.0,0.0,-0.485552,5
2025-08-02,0.0,-1.389728,0.25316,5
2025-08-03,0.0,0.649518,0.1525,5
2025-08-04,0.023259,-0.594663,-0.180317,5
2025-08-05,-1.139301,0.294763,0.974443,5
2025-08-06,0.0,0.0,0.491656,5


In [72]:
# A where operation with setting.
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D
2025-08-01,0.0,0.0,-0.485552,5
2025-08-02,0.0,-1.389728,0.25316,5
2025-08-03,0.0,0.649518,0.1525,5
2025-08-04,0.023259,-0.594663,-0.180317,5
2025-08-05,-1.139301,0.294763,0.974443,5
2025-08-06,0.0,0.0,0.491656,5


In [74]:
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2025-08-01,0.0,0.0,-0.485552,-5
2025-08-02,0.0,-1.389728,-0.25316,-5
2025-08-03,0.0,-0.649518,-0.1525,-5
2025-08-04,-0.023259,-0.594663,-0.180317,-5
2025-08-05,-1.139301,-0.294763,-0.974443,-5
2025-08-06,0.0,0.0,-0.491656,-5
