# Basic data structures in pandas

>Series: a one-dimensional labeled array holding data of any type<br>
>DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns

In [7]:
import pandas as pd
import numpy as np

## Object creation

In [9]:
# Build a Series from a plain list; pandas supplies the default 0..5 index.
# The np.nan entry is a missing value, and the result is stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#### Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns

In [13]:
# Six consecutive calendar days starting on 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# 6x4 table of standard-normal draws, with `dates` as the row index and
# A-D as the column labels. No seed is set in this cell, so the values
# differ from run to run.
df = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.161765,-1.32688,2.539908,2.209767
2013-01-02,-1.316262,0.562353,-0.711617,0.088388
2013-01-03,0.381568,0.487226,1.768664,-0.31165
2013-01-04,0.585803,-0.615299,-1.619969,0.244345
2013-01-05,-0.829749,-0.643868,-0.45828,-0.358616
2013-01-06,0.339891,0.243163,-0.480762,-2.685433


#### Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [15]:
# Build a frame column by column from a dict. The scalar entries ("A", "B",
# "F") are repeated on every row; the array-like entries ("C", "D", "E") each
# supply four values.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20240425"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "Foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2024-04-25,1.0,3,test,Foo
1,1.0,2024-04-25,1.0,3,train,Foo
2,1.0,2024-04-25,1.0,3,test,Foo
3,1.0,2024-04-25,1.0,3,train,Foo


In [16]:
# Each column of df2 carries its own dtype.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

## Viewing data

In [19]:
# With no argument, head() shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.161765,-1.32688,2.539908,2.209767
2013-01-02,-1.316262,0.562353,-0.711617,0.088388
2013-01-03,0.381568,0.487226,1.768664,-0.31165
2013-01-04,0.585803,-0.615299,-1.619969,0.244345
2013-01-05,-0.829749,-0.643868,-0.45828,-0.358616


In [20]:
# Show the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.585803,-0.615299,-1.619969,0.244345
2013-01-05,-0.829749,-0.643868,-0.45828,-0.358616
2013-01-06,0.339891,0.243163,-0.480762,-2.685433


In [21]:
# The row labels of df (here a DatetimeIndex).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# The column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

Returning a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column lables

In [23]:
# Returns only the values as a NumPy array; the date index and the column
# labels are not part of the result.
df.to_numpy()

array([[ 0.16176478, -1.32687987,  2.53990811,  2.20976682],
       [-1.3162622 ,  0.56235284, -0.71161684,  0.08838819],
       [ 0.38156823,  0.48722581,  1.76866377, -0.31164976],
       [ 0.58580343, -0.61529854, -1.61996907,  0.24434528],
       [-0.82974934, -0.64386835, -0.45828037, -0.35861579],
       [ 0.33989147,  0.24316275, -0.48076188, -2.6854326 ]])

> NumPy arrays have one dtype for the entire array while pandas DataFrames have one dtype per column

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.112831,-0.215551,0.172991,-0.135533
std,0.771375,0.759968,1.610689,1.567013
min,-1.316262,-1.32688,-1.619969,-2.685433
25%,-0.581871,-0.636726,-0.653903,-0.346874
50%,0.250828,-0.186068,-0.469521,-0.111631
75%,0.371149,0.42621,1.211928,0.205356
max,0.585803,0.562353,2.539908,2.209767


In [25]:
# Transpose: the column labels become the rows and the dates become the columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.161765,-1.316262,0.381568,0.585803,-0.829749,0.339891
B,-1.32688,0.562353,0.487226,-0.615299,-0.643868,0.243163
C,2.539908,-0.711617,1.768664,-1.619969,-0.45828,-0.480762
D,2.209767,0.088388,-0.31165,0.244345,-0.358616,-2.685433


In [27]:
# axis=1 sorts by the column labels; ascending=False orders them D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,2.209767,2.539908,-1.32688,0.161765
2013-01-02,0.088388,-0.711617,0.562353,-1.316262
2013-01-03,-0.31165,1.768664,0.487226,0.381568
2013-01-04,0.244345,-1.619969,-0.615299,0.585803
2013-01-05,-0.358616,-0.45828,-0.643868,-0.829749
2013-01-06,-2.685433,-0.480762,0.243163,0.339891


In [28]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-01,0.161765,-1.32688,2.539908,2.209767
2013-01-05,-0.829749,-0.643868,-0.45828,-0.358616
2013-01-04,0.585803,-0.615299,-1.619969,0.244345
2013-01-06,0.339891,0.243163,-0.480762,-2.685433
2013-01-03,0.381568,0.487226,1.768664,-0.31165
2013-01-02,-1.316262,0.562353,-0.711617,0.088388


## Selection

In [30]:
# Fresh ten-day frame for the selection examples. Note that this rebinds both
# `dates` and `df`, replacing the six-row objects used in the earlier sections.
dates = pd.date_range(start="2024/4/25", periods=10)
df = pd.DataFrame(
    data=np.random.standard_normal((10, 4)),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2024-04-25,-0.662813,-1.342248,0.750647,0.005331
2024-04-26,1.147856,-0.715609,-1.168268,1.10345
2024-04-27,0.166122,0.814895,0.913475,0.366533
2024-04-28,0.268073,-0.237913,2.013493,1.473262
2024-04-29,0.068212,0.182322,-1.181531,0.106862
2024-04-30,-1.313268,-0.286796,-0.196895,0.668111
2024-05-01,0.428023,-1.021844,0.693954,1.795149
2024-05-02,-0.613339,-0.381671,-1.349159,-0.861489
2024-05-03,-0.403755,1.944388,-0.008802,-0.630834
2024-05-04,-0.656848,-0.39235,0.239856,1.237207


In [31]:
# Take column A as a Series, then look up one value by its date label
# (dates[5] is 2024-04-30).
s=df["A"]
s[dates[5]]

-1.313267673857093

### We can pass a list of columns to square brackets to select columns in that order.

In [32]:
# Swap the contents of columns A and B in place: the right-hand side selects
# A then B, and those values are written into B then A. Re-running this cell
# swaps them back, so the state of df depends on how often the cell has run.
df[["B","A"]]=df[["A","B"]]

In [33]:
# Display df to check that the values of columns A and B have been exchanged.
df

Unnamed: 0,A,B,C,D
2024-04-25,-1.342248,-0.662813,0.750647,0.005331
2024-04-26,-0.715609,1.147856,-1.168268,1.10345
2024-04-27,0.814895,0.166122,0.913475,0.366533
2024-04-28,-0.237913,0.268073,2.013493,1.473262
2024-04-29,0.182322,0.068212,-1.181531,0.106862
2024-04-30,-0.286796,-1.313268,-0.196895,0.668111
2024-05-01,-1.021844,0.428023,0.693954,1.795149
2024-05-02,-0.381671,-0.613339,-1.349159,-0.861489
2024-05-03,1.944388,-0.403755,-0.008802,-0.630834
2024-05-04,-0.39235,-0.656848,0.239856,1.237207


### Attribute access

In [34]:
# A small labelled Series and a copy of df for the attribute-access examples;
# working on the copy keeps the assignments that follow away from df itself.
sa = pd.Series(data=[1, 2, 3], index=["a", "b", "c"])
dfa = df.copy()

In [35]:
# An index label of a Series can be read as an attribute.
sa.b

2

In [36]:
# A column of a DataFrame can be read as an attribute; the result is a Series.
dfa.A

2024-04-25   -1.342248
2024-04-26   -0.715609
2024-04-27    0.814895
2024-04-28   -0.237913
2024-04-29    0.182322
2024-04-30   -0.286796
2024-05-01   -1.021844
2024-05-02   -0.381671
2024-05-03    1.944388
2024-05-04   -0.392350
Freq: D, Name: A, dtype: float64

In [37]:
# Assigning through the attribute updates the value at the existing label "a".
sa.a=5

In [38]:
# Display sa: the value at label "a" is now 5.
sa

a    5
b    2
c    3
dtype: int64

In [41]:
# Attribute assignment works for a column that already exists: A is replaced
# by the integers 0 through len(dfa.index)-1.
dfa.A=list(range(len(dfa.index)))
dfa

Unnamed: 0,A,B,C,D
2024-04-25,0,-0.662813,0.750647,0.005331
2024-04-26,1,1.147856,-1.168268,1.10345
2024-04-27,2,0.166122,0.913475,0.366533
2024-04-28,3,0.268073,2.013493,1.473262
2024-04-29,4,0.068212,-1.181531,0.106862
2024-04-30,5,-1.313268,-0.196895,0.668111
2024-05-01,6,0.428023,0.693954,1.795149
2024-05-02,7,-0.613339,-1.349159,-0.861489
2024-05-03,8,-0.403755,-0.008802,-0.630834
2024-05-04,9,-0.656848,0.239856,1.237207


In [42]:
dfa["A"]=list(range(len(dfa.index))) # "A" already exists, so this overwrites it; item assignment is also the form to use when creating a new column
dfa

Unnamed: 0,A,B,C,D
2024-04-25,0,-0.662813,0.750647,0.005331
2024-04-26,1,1.147856,-1.168268,1.10345
2024-04-27,2,0.166122,0.913475,0.366533
2024-04-28,3,0.268073,2.013493,1.473262
2024-04-29,4,0.068212,-1.181531,0.106862
2024-04-30,5,-1.313268,-0.196895,0.668111
2024-05-01,6,0.428023,0.693954,1.795149
2024-05-02,7,-0.613339,-1.349159,-0.861489
2024-05-03,8,-0.403755,-0.008802,-0.630834
2024-05-04,9,-0.656848,0.239856,1.237207


If you are using an IPython environment, you may also use tab-completion to see the accessible attributes.

In [43]:
# Replace the whole row at position 1 by assigning a dict to it.
x = pd.DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
x.iloc[1] = {"x": 9, "y": 99}
x
     

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


If you try to use attribute access to create a new column, it creates a new attribute rather than a new column and will raise a "UserWarning".

In [44]:
# Assigning to an attribute name that is not an existing column does not add a
# column: the list is stored as a plain attribute on the object and pandas
# emits a UserWarning. The displayed frame still has only the "one" column.
df_new = pd.DataFrame({"one": [1.0, 2.0, 3.0]})
df_new.two = [4, 5, 6]
df_new

  df_new.two=[4,5,6]


Unnamed: 0,one
0,1.0
1,2.0
2,3.0


### Slicing ranges

In [45]:
# An integer slice is positional: the first five elements.
# s was taken from df["A"] before columns A and B were swapped, and the output
# shows it still holds the pre-swap values.
s[:5]

2024-04-25   -0.662813
2024-04-26    1.147856
2024-04-27    0.166122
2024-04-28    0.268073
2024-04-29    0.068212
Freq: D, Name: A, dtype: float64

In [46]:
# A step of 2 keeps every other element, starting with the first.
s[::2]

2024-04-25   -0.662813
2024-04-27    0.166122
2024-04-29    0.068212
2024-05-01    0.428023
2024-05-03   -0.403755
Freq: 2D, Name: A, dtype: float64

In [47]:
# A negative step walks the Series backwards, reversing its order.
s[::-1]

2024-05-04   -0.656848
2024-05-03   -0.403755
2024-05-02   -0.613339
2024-05-01    0.428023
2024-04-30   -1.313268
2024-04-29    0.068212
2024-04-28    0.268073
2024-04-27    0.166122
2024-04-26    1.147856
2024-04-25   -0.662813
Freq: -1D, Name: A, dtype: float64

In [48]:
# Assign through a slice on a copy, so that s itself is left unchanged:
# the first five elements of s2 become 0.
s2=s.copy()
s2[:5]=0
s2

2024-04-25    0.000000
2024-04-26    0.000000
2024-04-27    0.000000
2024-04-28    0.000000
2024-04-29    0.000000
2024-04-30   -1.313268
2024-05-01    0.428023
2024-05-02   -0.613339
2024-05-03   -0.403755
2024-05-04   -0.656848
Freq: D, Name: A, dtype: float64

With DataFrame, slicing inside of [] *slices the rows*.

In [49]:
# A plain slice inside [] selects rows by position: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2024-04-25,-1.342248,-0.662813,0.750647,0.005331
2024-04-26,-0.715609,1.147856,-1.168268,1.10345
2024-04-27,0.814895,0.166122,0.913475,0.366533


In [50]:
# A negative step reverses the row order.
df[::-1]

Unnamed: 0,A,B,C,D
2024-05-04,-0.39235,-0.656848,0.239856,1.237207
2024-05-03,1.944388,-0.403755,-0.008802,-0.630834
2024-05-02,-0.381671,-0.613339,-1.349159,-0.861489
2024-05-01,-1.021844,0.428023,0.693954,1.795149
2024-04-30,-0.286796,-1.313268,-0.196895,0.668111
2024-04-29,0.182322,0.068212,-1.181531,0.106862
2024-04-28,-0.237913,0.268073,2.013493,1.473262
2024-04-27,0.814895,0.166122,0.913475,0.366533
2024-04-26,-0.715609,1.147856,-1.168268,1.10345
2024-04-25,-1.342248,-0.662813,0.750647,0.005331


### Selection by label

The __.loc__ attribute is the primary access method.<br>
Valid inputs
> A single label, e.g. 5 or "A" (Note: 5 is interpreted as a _label_ of the index, not as an integer position.)<br>
> A list or array of labels: ["A","B","C"]<br>
> A slice object with labels: "A":"F" (Note: Contrary to usual Python slices, __both__ the start and the stop are included.)<br>
> A boolean array.<br>
> A callable.

In [51]:
# Six standard-normal draws labelled "a" through "f" for the .loc examples.
s1 = pd.Series(
    data=np.random.standard_normal(6),
    index=["a", "b", "c", "d", "e", "f"],
)
s1

a   -1.083797
b    0.293441
c   -0.174533
d    0.735109
e   -0.939600
f   -0.525197
dtype: float64

In [52]:
# Label-based slice: everything from label "c" through the end.
s1.loc["c":]

c   -0.174533
d    0.735109
e   -0.939600
f   -0.525197
dtype: float64

In [53]:
# A single label returns the scalar stored at that label.
s1.loc["b"]

0.2934411110814009

In [54]:
# Assignment through a label slice modifies s1 in place:
# the entries from "c" through "f" become 0.
s1.loc["c":]=0
s1

a   -1.083797
b    0.293441
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [55]:
# 6x4 table of standard-normal draws with string row labels "a"-"f" and
# column labels "A"-"D", used for the DataFrame .loc examples.
row_labels = ["a", "b", "c", "d", "e", "f"]
df1 = pd.DataFrame(
    data=np.random.standard_normal((6, 4)),
    index=row_labels,
    columns=["A", "B", "C", "D"],
)
del row_labels  # keep the notebook namespace the same as before
df1

Unnamed: 0,A,B,C,D
a,0.676463,-0.321489,-2.118852,1.638045
b,0.144912,0.658117,-1.20879,0.816253
c,-1.731704,-1.392207,0.817814,0.437226
d,0.203991,0.272214,0.196223,1.334129
e,0.32972,0.680143,-0.15975,0.063708
f,-0.553307,0.657397,-0.391259,-0.178367


In [56]:
# A list of row labels picks exactly those rows; ":" keeps every column.
df1.loc[["a","b","d"],:]

Unnamed: 0,A,B,C,D
a,0.676463,-0.321489,-2.118852,1.638045
b,0.144912,0.658117,-1.20879,0.816253
d,0.203991,0.272214,0.196223,1.334129


In [57]:
# Label slices on both axes: rows from "d" to the end, columns "A" through "C"
# (the stop label "C" is included).
df1.loc["d":,"A":"C"]

Unnamed: 0,A,B,C
d,0.203991,0.272214,0.196223
e,0.32972,0.680143,-0.15975
f,-0.553307,0.657397,-0.391259


In [58]:
# A single row label returns that row as a Series indexed by the column labels.
df1.loc["a"]

A    0.676463
B   -0.321489
C   -2.118852
D    1.638045
Name: a, dtype: float64

#### Getting values with a boolean array

In [59]:
# Comparing a row with 0 gives a boolean Series with one entry per column.
df1.loc["a"]>0

A     True
B    False
C    False
D     True
Name: a, dtype: bool

In [60]:
# Use the boolean Series as the column selector: keep all rows, but only the
# columns whose value in row "a" is positive.
df1.loc[:,df1.loc["a"]>0]

Unnamed: 0,A,D
a,0.676463,1.638045
b,0.144912,0.816253
c,-1.731704,0.437226
d,0.203991,1.334129
e,0.32972,0.063708
f,-0.553307,-0.178367


In [61]:
# Nullable boolean mask with one entry per row of a six-row frame;
# the fifth entry is missing (pd.NA) rather than True or False.
mask = pd.array(
    [True, False, True, False, pd.NA, False],
    dtype="boolean",
)

In [62]:
# Display the mask; the missing entry is shown as <NA>.
mask

<BooleanArray>
[True, False, True, False, <NA>, False]
Length: 6, dtype: boolean

In [63]:
# Boolean-mask row selection: only rows whose mask entry is True are kept,
# so the row at the <NA> position ("e") is not selected.
df1[mask]

Unnamed: 0,A,B,C,D
a,0.676463,-0.321489,-2.118852,1.638045
c,-1.731704,-1.392207,0.817814,0.437226


In [64]:
# A row label together with a column label returns a single scalar.
df1.loc["a","A"]

0.676463261422698