# 10 minutes to pandas
- Source: the official pandas user guide, https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [1]:
import pandas as pd
import numpy as np

## 1 Object creation

- Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
# A plain list of values gets the default 0..n-1 integer index; the single
# np.nan entry makes the resulting Series float64.
series_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(series_values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

- Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [3]:
# Six consecutive calendar days starting on 2025-08-01 (daily frequency).
dates = pd.date_range(start='20250801', periods=6, freq='D')
dates

DatetimeIndex(['2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04',
               '2025-08-05', '2025-08-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# random frame every time (this also makes any later np.random.* call
# deterministic).
# NOTE: the outputs currently saved in this notebook came from an unseeded
# run, so the displayed numbers will change once the notebook is re-executed.
np.random.seed(42)

# 6 rows (one per entry of `dates`) x 4 columns of standard-normal samples,
# labelled 'A'..'D'.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2025-08-01,0.816304,-0.171653,-0.402198,-0.026276
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-03,-0.089521,-0.388019,0.709028,0.041834
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-05,1.302831,0.497011,0.095922,0.241019
2025-08-06,-1.097444,-0.515955,-1.191667,0.014842


- Creating a DataFrame by passing a dict whose keys become the column labels and whose values (scalars, arrays, Series, ...) can each be converted to a series-like column:

In [5]:
# The scalar entries ('A', 'B', 'F') are repeated on every row, while the
# array-like entries ('C', 'D', 'E') supply four values each, so the frame
# has four rows and each column keeps its own dtype.
df2_data = {
    'A': 1.,
    'B': pd.Timestamp('20250801'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_data)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2025-08-01,1.0,3,test,foo
1,1.0,2025-08-01,1.0,3,train,foo
2,1.0,2025-08-01,1.0,3,test,foo
3,1.0,2025-08-01,1.0,3,train,foo


In [6]:
# Each column of df2 keeps its own dtype (float, datetime, int, category, ...).
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
# Illustrative only: a sample of what IPython tab completion offers after
# typing `df2.` (column names plus DataFrame attributes). The cell is just a
# string literal, so running it merely echoes the text. The exact list depends
# on the installed pandas version (e.g. DataFrame.append no longer exists in
# pandas >= 2.0).
""" 
df2.  #<TAB>

df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.columns
df2.align              df2.copy
df2.all                df2.count
df2.any                df2.combine
df2.append             df2.D
df2.apply              df2.describe
df2.applymap           df2.diff
df2.B                  df2.duplicated
"""

' \ndf2.  #<TAB>\n\ndf2.A                  df2.bool\ndf2.abs                df2.boxplot\ndf2.add                df2.C\ndf2.add_prefix         df2.clip\ndf2.add_suffix         df2.columns\ndf2.align              df2.copy\ndf2.all                df2.count\ndf2.any                df2.combine\ndf2.append             df2.D\ndf2.apply              df2.describe\ndf2.applymap           df2.diff\ndf2.B                  df2.duplicated\n'

## 2  Viewing data

In [8]:
# View the top and bottom rows of the frame (see the next two cells):

In [9]:
# First rows of the frame; with no argument head() returns 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2025-08-01,0.816304,-0.171653,-0.402198,-0.026276
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-03,-0.089521,-0.388019,0.709028,0.041834
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-05,1.302831,0.497011,0.095922,0.241019


In [10]:
# Last 3 rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-05,1.302831,0.497011,0.095922,0.241019
2025-08-06,-1.097444,-0.515955,-1.191667,0.014842


In [11]:
# Row labels of the frame (the DatetimeIndex passed as `index=`).
df.index

DatetimeIndex(['2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04',
               '2025-08-05', '2025-08-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

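 - `DataFrame.to_numpy()` gives a NumPy representation of the underlying data.
 - NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When the columns have different dtypes, `to_numpy()` must pick a single dtype that can hold all the values, which may end up being `object`.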


In [13]:
# Every column of df is floating point, so the result is a plain 2-D float
# array; the index and column labels are not part of it.
df.to_numpy()

array([[ 0.81630384, -0.17165263, -0.40219836, -0.02627632],
       [ 0.62368645,  0.6773545 ,  0.28163541, -1.09807372],
       [-0.08952142, -0.3880185 ,  0.70902768,  0.04183396],
       [ 1.31255353, -0.12813836,  1.32911792,  0.9640535 ],
       [ 1.30283135,  0.49701149,  0.09592217,  0.24101909],
       [-1.09744411, -0.51595546, -1.19166693,  0.01484178]])

In [14]:
# df2 mixes several column dtypes, so the only dtype that can hold every
# value is `object`.
df2.to_numpy()

array([[1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-08-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [17]:
# describe() shows a quick statistical summary of each column
# (count, mean, std, min, quartiles, max):
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.478068,-0.0049,0.136973,0.0229
std,0.929151,0.483392,0.874878,0.662234
min,-1.097444,-0.515955,-1.191667,-1.098074
25%,0.088781,-0.333927,-0.277668,-0.015997
50%,0.719995,-0.149895,0.188779,0.028338
75%,1.181199,0.340724,0.60218,0.191223
max,1.312554,0.677355,1.329118,0.964053


In [20]:
# Transposing the data: the column labels become the index and the dates
# become the columns.
df.T

Unnamed: 0,2025-08-01 00:00:00,2025-08-02 00:00:00,2025-08-03 00:00:00,2025-08-04 00:00:00,2025-08-05 00:00:00,2025-08-06 00:00:00
A,0.816304,0.623686,-0.089521,1.312554,1.302831,-1.097444
B,-0.171653,0.677355,-0.388019,-0.128138,0.497011,-0.515955
C,-0.402198,0.281635,0.709028,1.329118,0.095922,-1.191667
D,-0.026276,-1.098074,0.041834,0.964053,0.241019,0.014842


In [24]:
# Sorting by the row index (axis=0) in descending order, latest date first:
df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D
2025-08-06,-1.097444,-0.515955,-1.191667,0.014842
2025-08-05,1.302831,0.497011,0.095922,0.241019
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-03,-0.089521,-0.388019,0.709028,0.041834
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-01,0.816304,-0.171653,-0.402198,-0.026276


In [25]:
# Sorting by the column labels (axis=1) in descending order: D, C, B, A.
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2025-08-01,-0.026276,-0.402198,-0.171653,0.816304
2025-08-02,-1.098074,0.281635,0.677355,0.623686
2025-08-03,0.041834,0.709028,-0.388019,-0.089521
2025-08-04,0.964053,1.329118,-0.128138,1.312554
2025-08-05,0.241019,0.095922,0.497011,1.302831
2025-08-06,0.014842,-1.191667,-0.515955,-1.097444


In [26]:
# Sorting rows by the values in column 'B' (ascending by default):
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2025-08-06,-1.097444,-0.515955,-1.191667,0.014842
2025-08-03,-0.089521,-0.388019,0.709028,0.041834
2025-08-01,0.816304,-0.171653,-0.402198,-0.026276
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-05,1.302831,0.497011,0.095922,0.241019
2025-08-02,0.623686,0.677355,0.281635,-1.098074


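## 3 Selection
- Plain Python/NumPy-style indexing works for interactive use, but the optimized pandas data access methods `.at`, `.iat`, `.loc` and `.iloc` are the recommended way to select data.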

### Getting

- Selecting a single column, which yields a Series, equivalent to df.A:

In [28]:
# Attribute-style access: returns column 'A' as a Series.
df.A

2025-08-01    0.816304
2025-08-02    0.623686
2025-08-03   -0.089521
2025-08-04    1.312554
2025-08-05    1.302831
2025-08-06   -1.097444
Freq: D, Name: A, dtype: float64

In [29]:
# Bracket access with a column label: the same Series as df.A.
df['A']

2025-08-01    0.816304
2025-08-02    0.623686
2025-08-03   -0.089521
2025-08-04    1.312554
2025-08-05    1.302831
2025-08-06   -1.097444
Freq: D, Name: A, dtype: float64

- Selecting via [], which slices the rows.

In [31]:
# An integer slice inside [] selects rows by position: rows 0, 1 and 2
# (the end position is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2025-08-01,0.816304,-0.171653,-0.402198,-0.026276
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-03,-0.089521,-0.388019,0.709028,0.041834


In [32]:
# A date-string slice inside [] selects rows by label; unlike a positional
# slice, both endpoints are included.
df['20250802':'20250805']

Unnamed: 0,A,B,C,D
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-03,-0.089521,-0.388019,0.709028,0.041834
2025-08-04,1.312554,-0.128138,1.329118,0.964053
2025-08-05,1.302831,0.497011,0.095922,0.241019


### Selection by label

In [35]:
# Getting a cross section (a single row, returned as a Series) using a label:
df.loc[dates[0]]

A    0.816304
B   -0.171653
C   -0.402198
D   -0.026276
Name: 2025-08-01 00:00:00, dtype: float64

In [37]:
# Selecting on both axes by label: every row, columns 'A' and 'B' only.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2025-08-01,0.816304,-0.171653
2025-08-02,0.623686,0.677355
2025-08-03,-0.089521,-0.388019
2025-08-04,1.312554,-0.128138
2025-08-05,1.302831,0.497011
2025-08-06,-1.097444,-0.515955


In [41]:
# Label slicing with .loc: both endpoints of the slice are included.
df.loc['20250802':'20250805', ['A', 'B']]

Unnamed: 0,A,B
2025-08-02,0.623686,0.677355
2025-08-03,-0.089521,-0.388019
2025-08-04,1.312554,-0.128138
2025-08-05,1.302831,0.497011


In [43]:
# A single row label (instead of a slice) reduces the dimensionality of the
# result: a Series rather than a DataFrame.
df.loc['20250802', ['A', 'B']]

A    0.623686
B    0.677355
Name: 2025-08-02 00:00:00, dtype: float64

In [45]:
# Getting a scalar value by row label and column label:
df.loc[dates[0], 'A']

0.816303840967799

In [47]:
# Fast label-based access to a single scalar with .at
# (returns the same value as df.loc[dates[0], 'A']):
df.at[dates[0], 'A']

0.816303840967799

### Selection by position
- Select via the position of the passed integers:

In [49]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    1.312554
B   -0.128138
C    1.329118
D    0.964053
Name: 2025-08-04 00:00:00, dtype: float64

In [50]:
# By lists of integer position locations, similar to the NumPy/Python style
# (rows 1, 2 and 4; columns 0 and 2):
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2025-08-02,0.623686,0.281635
2025-08-03,-0.089521,0.709028
2025-08-05,1.302831,0.095922


In [51]:
# slicing rows explicitly:
# Rows at positions 1 and 2 (end excluded), all columns.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2025-08-02,0.623686,0.677355,0.281635,-1.098074
2025-08-03,-0.089521,-0.388019,0.709028,0.041834


In [52]:
# For getting a single value explicitly by integer position (row 1, column 1):
df.iloc[1,1]

0.6773545024201504

In [54]:
# Fast position-based access to a single scalar with .iat
# (returns the same value as df.iloc[1, 1]):

df.iat[1,1]

0.6773545024201504