# Pandas
Written by: Sumit Sharma\
Date: 2024-08-29\
Email: sumit8444061@gmail.com

In [2]:
import pandas as pd
import numpy as np

In [4]:
# A Series is pandas' one-dimensional labelled array.
# The np.nan entry marks a missing value and makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# A DataFrame is pandas' two-dimensional table; it needs row labels.
# Here: 8 consecutive calendar days starting on 2024-08-29.
dates = pd.date_range(start='20240829', periods=8)
dates

DatetimeIndex(['2024-08-29', '2024-08-30', '2024-08-31', '2024-09-01',
               '2024-09-02', '2024-09-03', '2024-09-04', '2024-09-05'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Build an 8x2 frame of standard-normal draws, one row per entry in `dates`
# and one column each for 'A' and 'B'.
# A locally seeded Generator is used so the values are identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((8, 2)), index=dates, columns=list('AB'))
df

Unnamed: 0,A,B
2024-08-29,-0.896589,-0.074198
2024-08-30,0.223677,0.332824
2024-08-31,-1.145864,0.831348
2024-09-01,0.968566,1.227808
2024-09-02,0.980618,-1.251122
2024-09-03,-0.93748,-0.37898
2024-09-04,0.754964,-1.047863
2024-09-05,2.008444,-0.99299


In [51]:
# Build a DataFrame from a dict: every key becomes a column name.
# Scalar values are repeated on every row; the array-like values
# (all of length 5) determine the number of rows.
df2 = pd.DataFrame(
    data={
        # scalar float, repeated on every row
        "A": 1.0,
        # scalar timestamp, repeated on every row
        "B": pd.Timestamp("20130102"),
        # float32 Series of ones labelled 0..4
        "C": pd.Series(1, index=list(range(5)), dtype="float32"),
        # int32 array holding five 3s
        "D": np.full(5, 3, dtype="int32"),
        # categorical column with two categories
        "E": pd.Categorical(["test", "train", "test", "train", "test"]),
        # scalar string, repeated on every row
        "F": "foo",
        # five consecutive days starting on 2024-08-29
        "G": pd.date_range('20240829', periods=5),
    }
)
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,2024-08-29
1,1.0,2013-01-02,1.0,3,train,foo,2024-08-30
2,1.0,2013-01-02,1.0,3,test,foo,2024-08-31
3,1.0,2013-01-02,1.0,3,train,foo,2024-09-01
4,1.0,2013-01-02,1.0,3,test,foo,2024-09-02


In [34]:
# Inspect the row labels of df: the DatetimeIndex it was constructed with.
df.index

DatetimeIndex(['2024-08-29', '2024-08-30', '2024-08-31', '2024-09-01',
               '2024-09-02', '2024-09-03', '2024-09-04', '2024-09-05'],
              dtype='datetime64[ns]', freq='D')

In [36]:
# df2 mixes floats, timestamps, integers, categoricals and strings, so a
# single NumPy array holding all of it has to use the generic object dtype.
df2.to_numpy().dtype

dtype('O')

In [39]:
# Sort by column labels (axis=1) in descending order, so 'B' comes before 'A'.
# The result is a new frame; df itself keeps its original column order.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,B,A
2024-08-29,-0.074198,-0.896589
2024-08-30,0.332824,0.223677
2024-08-31,0.831348,-1.145864
2024-09-01,1.227808,0.968566
2024-09-02,-1.251122,0.980618
2024-09-03,-0.37898,-0.93748
2024-09-04,-1.047863,0.754964
2024-09-05,-0.99299,2.008444


In [43]:
# Take the first five rows by position (explicit positional slice).
df.iloc[:5]

Unnamed: 0,A,B
2024-08-29,-0.896589,-0.074198
2024-08-30,0.223677,0.332824
2024-08-31,-1.145864,0.831348
2024-09-01,0.968566,1.227808
2024-09-02,0.980618,-1.251122


In [47]:
# loc selects by label: fetch the row whose index label is the first date.
# A single row comes back as a Series keyed by column name.
df.loc[dates[0]]

A   -0.896589
B   -0.074198
Name: 2024-08-29 00:00:00, dtype: float64

In [48]:
# iloc selects by integer position (0-based) rather than by label:
# row position 1, column position 1 -> the 'B' value of the second row.
df.iloc[1,1]

np.float64(0.3328238100611093)

In [49]:
# Split df row-wise into three consecutive, non-overlapping chunks
# (rows 0-2, rows 3-6, and row 7 onward) using positional slices.
pieces = [
    df.iloc[:3],
    df.iloc[3:7],
    df.iloc[7:],
]
pieces

[                   A         B
 2024-08-29 -0.896589 -0.074198
 2024-08-30  0.223677  0.332824
 2024-08-31 -1.145864  0.831348,
                    A         B
 2024-09-01  0.968566  1.227808
 2024-09-02  0.980618 -1.251122
 2024-09-03 -0.937480 -0.378980
 2024-09-04  0.754964 -1.047863,
                    A        B
 2024-09-05  2.008444 -0.99299]

In [50]:
# Stack the chunks back together row-wise (concat's default axis),
# which reassembles a frame with the same rows as df.
pd.concat(pieces)

Unnamed: 0,A,B
2024-08-29,-0.896589,-0.074198
2024-08-30,0.223677,0.332824
2024-08-31,-1.145864,0.831348
2024-09-01,0.968566,1.227808
2024-09-02,0.980618,-1.251122
2024-09-03,-0.93748,-0.37898
2024-09-04,0.754964,-1.047863
2024-09-05,2.008444,-0.99299


In [53]:
# Display the full df2 again as a reference point before sampling from it.
df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,foo,2024-08-29
1,1.0,2013-01-02,1.0,3,train,foo,2024-08-30
2,1.0,2013-01-02,1.0,3,test,foo,2024-08-31
3,1.0,2013-01-02,1.0,3,train,foo,2024-09-01
4,1.0,2013-01-02,1.0,3,test,foo,2024-09-02


In [55]:
# Draw 3 rows of df2 at random (without replacement, the default).
# random_state pins the draw so the same rows are shown on every re-run.
df2.sample(3, random_state=42)

Unnamed: 0,A,B,C,D,E,F,G
2,1.0,2013-01-02,1.0,3,test,foo,2024-08-31
4,1.0,2013-01-02,1.0,3,test,foo,2024-09-02
1,1.0,2013-01-02,1.0,3,train,foo,2024-08-30
