In [1]:
import numpy as np
import pandas as pd

## Object Creation

In [3]:
s = pd.Series([1,3,5,np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
dates = pd.date_range("20250101", periods=6)
dates

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2025-01-01,-0.485034,0.565759,1.02079,0.048231
2025-01-02,1.23425,-0.017665,-2.565648,0.380577
2025-01-03,-3.096719,0.117656,1.409707,0.996535
2025-01-04,-0.354393,1.490735,1.790957,-1.017048
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309
2025-01-06,1.685621,0.513039,0.514706,-0.201411


In [40]:
df2 = pd.DataFrame({
    "A": 2.0,
    "B": pd.Timestamp("20250101"),
    "C": pd.Series(1, index=list(range(8)), dtype = "float32"),
    "D": np.array([3,4,5,6, 11, 15, 20, 25], dtype='int32'),
    "E": pd.Categorical(["test", "train", "test", "train", "test", "train", "test", "train"]),
    "Z": "foo"
})
df2

Unnamed: 0,A,B,C,D,E,Z
0,2.0,2025-01-01,1.0,3,test,foo
1,2.0,2025-01-01,1.0,4,train,foo
2,2.0,2025-01-01,1.0,5,test,foo
3,2.0,2025-01-01,1.0,6,train,foo
4,2.0,2025-01-01,1.0,11,test,foo
5,2.0,2025-01-01,1.0,15,train,foo
6,2.0,2025-01-01,1.0,20,test,foo
7,2.0,2025-01-01,1.0,25,train,foo


### data types (dtypes)

In [41]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
Z           object
dtype: object

### Viewing data

In [42]:
#Top Rows
df2.head()

Unnamed: 0,A,B,C,D,E,Z
0,2.0,2025-01-01,1.0,3,test,foo
1,2.0,2025-01-01,1.0,4,train,foo
2,2.0,2025-01-01,1.0,5,test,foo
3,2.0,2025-01-01,1.0,6,train,foo
4,2.0,2025-01-01,1.0,11,test,foo


In [43]:
#Bottom Rows
df2.tail()

Unnamed: 0,A,B,C,D,E,Z
3,2.0,2025-01-01,1.0,6,train,foo
4,2.0,2025-01-01,1.0,11,test,foo
5,2.0,2025-01-01,1.0,15,train,foo
6,2.0,2025-01-01,1.0,20,test,foo
7,2.0,2025-01-01,1.0,25,train,foo


In [44]:
df.index

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06'],
              dtype='datetime64[ns]', freq='D')

In [45]:
df2.index

Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

In [46]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [47]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'Z'], dtype='object')

In [48]:
# df2.dtypes
# df2.to_numpy()

In [51]:
# Describe numeric columns by default ifmixed
df2.describe()

# Describe all columns
df2.describe(include='all')

Unnamed: 0,A,B,C,D,E,Z
count,8.0,8,8.0,8.0,8,8
unique,,,,,2,1
top,,,,,test,foo
freq,,,,,4,8
mean,2.0,2025-01-01 00:00:00,1.0,11.125,,
min,2.0,2025-01-01 00:00:00,1.0,3.0,,
25%,2.0,2025-01-01 00:00:00,1.0,4.75,,
50%,2.0,2025-01-01 00:00:00,1.0,8.5,,
75%,2.0,2025-01-01 00:00:00,1.0,16.25,,
max,2.0,2025-01-01 00:00:00,1.0,25.0,,


In [52]:
# Transposing your data

In [53]:
df2.T

Unnamed: 0,0,1,2,3,4,5,6,7
A,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
B,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00,2025-01-01 00:00:00
C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
D,3,4,5,6,11,15,20,25
E,test,train,test,train,test,train,test,train
Z,foo,foo,foo,foo,foo,foo,foo,foo


In [59]:
#sorting your data
#by index
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,Z,E,D,C,B,A
0,foo,test,3,1.0,2025-01-01,2.0
1,foo,train,4,1.0,2025-01-01,2.0
2,foo,test,5,1.0,2025-01-01,2.0
3,foo,train,6,1.0,2025-01-01,2.0
4,foo,test,11,1.0,2025-01-01,2.0
5,foo,train,15,1.0,2025-01-01,2.0
6,foo,test,20,1.0,2025-01-01,2.0
7,foo,train,25,1.0,2025-01-01,2.0


In [66]:
df2.sort_values(by='D', ascending=True)
df2.sort_values(by=['E','D'], ascending=False)

Unnamed: 0,A,B,C,D,E,Z
7,2.0,2025-01-01,1.0,25,train,foo
5,2.0,2025-01-01,1.0,15,train,foo
3,2.0,2025-01-01,1.0,6,train,foo
1,2.0,2025-01-01,1.0,4,train,foo
6,2.0,2025-01-01,1.0,20,test,foo
4,2.0,2025-01-01,1.0,11,test,foo
2,2.0,2025-01-01,1.0,5,test,foo
0,2.0,2025-01-01,1.0,3,test,foo


### Selection

#### GetItem([])

In [67]:
# GetItem([])
df['A']

2025-01-01   -0.485034
2025-01-02    1.234250
2025-01-03   -3.096719
2025-01-04   -0.354393
2025-01-05   -0.449488
2025-01-06    1.685621
Freq: D, Name: A, dtype: float64

In [71]:
df[0:3]

Unnamed: 0,A,B,C,D
2025-01-01,-0.485034,0.565759,1.02079,0.048231
2025-01-02,1.23425,-0.017665,-2.565648,0.380577
2025-01-03,-3.096719,0.117656,1.409707,0.996535


#### Selection by label

In [72]:
# Selection by Label
df.loc[dates[0]]

A   -0.485034
B    0.565759
C    1.020790
D    0.048231
Name: 2025-01-01 00:00:00, dtype: float64

In [73]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2025-01-01,-0.485034,0.565759
2025-01-02,1.23425,-0.017665
2025-01-03,-3.096719,0.117656
2025-01-04,-0.354393,1.490735
2025-01-05,-0.449488,-1.59751
2025-01-06,1.685621,0.513039


In [75]:
df.loc["20250102":"20250104", ["A", "B"]]

Unnamed: 0,A,B
2025-01-02,1.23425,-0.017665
2025-01-03,-3.096719,0.117656
2025-01-04,-0.354393,1.490735


In [76]:
df.loc[dates[0], 'A']

np.float64(-0.485033599251505)

#### Selection by position

In [77]:
df.iloc[3]

A   -0.354393
B    1.490735
C    1.790957
D   -1.017048
Name: 2025-01-04 00:00:00, dtype: float64

In [78]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2025-01-04,-0.354393,1.490735
2025-01-05,-0.449488,-1.59751


In [79]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2025-01-02,1.23425,-2.565648
2025-01-03,-3.096719,1.409707
2025-01-05,-0.449488,-1.110994


In [81]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2025-01-02,1.23425,-0.017665,-2.565648,0.380577
2025-01-03,-3.096719,0.117656,1.409707,0.996535


In [80]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2025-01-01,0.565759,1.02079
2025-01-02,-0.017665,-2.565648
2025-01-03,0.117656,1.409707
2025-01-04,1.490735,1.790957
2025-01-05,-1.59751,-1.110994
2025-01-06,0.513039,0.514706


In [82]:
df.iloc[2,2]

np.float64(1.4097074036551032)

In [83]:
df.iat[2,2]

np.float64(1.4097074036551032)

#### Boolean Indexing

In [88]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2025-01-02,1.23425,-0.017665,-2.565648,0.380577
2025-01-06,1.685621,0.513039,0.514706,-0.201411


In [89]:
df[df > 0]

Unnamed: 0,A,B,C,D
2025-01-01,,0.565759,1.02079,0.048231
2025-01-02,1.23425,,,0.380577
2025-01-03,,0.117656,1.409707,0.996535
2025-01-04,,1.490735,1.790957,
2025-01-05,,,,0.737309
2025-01-06,1.685621,0.513039,0.514706,


In [93]:
df3 = df.copy()

df3['E'] = ["one", "two", "three", "four", "one", "four"]
df3

Unnamed: 0,A,B,C,D,E
2025-01-01,-0.485034,0.565759,1.02079,0.048231,one
2025-01-02,1.23425,-0.017665,-2.565648,0.380577,two
2025-01-03,-3.096719,0.117656,1.409707,0.996535,three
2025-01-04,-0.354393,1.490735,1.790957,-1.017048,four
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309,one
2025-01-06,1.685621,0.513039,0.514706,-0.201411,four


In [94]:
df3[df3['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E
2025-01-01,-0.485034,0.565759,1.02079,0.048231,one
2025-01-04,-0.354393,1.490735,1.790957,-1.017048,four
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309,one
2025-01-06,1.685621,0.513039,0.514706,-0.201411,four


#### Setting

In [96]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20250102", periods=6))
s1

2025-01-02    1
2025-01-03    2
2025-01-04    3
2025-01-05    4
2025-01-06    5
2025-01-07    6
Freq: D, dtype: int64

In [97]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2025-01-01,-0.485034,0.565759,1.02079,0.048231,
2025-01-02,1.23425,-0.017665,-2.565648,0.380577,1.0
2025-01-03,-3.096719,0.117656,1.409707,0.996535,2.0
2025-01-04,-0.354393,1.490735,1.790957,-1.017048,3.0
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309,4.0
2025-01-06,1.685621,0.513039,0.514706,-0.201411,5.0


In [107]:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,1.02079,0.048231,
2025-01-02,0.0,-0.017665,-2.565648,0.380577,1.0
2025-01-03,-3.096719,0.117656,1.409707,0.996535,2.0
2025-01-04,-0.354393,1.490735,1.790957,-1.017048,3.0
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309,4.0
2025-01-06,1.685621,0.513039,0.514706,-0.201411,5.0


In [106]:
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,1.02079,0.048231,
2025-01-02,0.0,-0.017665,-2.565648,0.380577,1.0
2025-01-03,-3.096719,0.117656,1.409707,0.996535,2.0
2025-01-04,-0.354393,1.490735,1.790957,-1.017048,3.0
2025-01-05,-0.449488,-1.59751,-1.110994,0.737309,4.0
2025-01-06,1.685621,0.513039,0.514706,-0.201411,5.0


In [109]:
df.loc[:, "D"] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,1.02079,5.0,
2025-01-02,0.0,-0.017665,-2.565648,5.0,1.0
2025-01-03,-3.096719,0.117656,1.409707,5.0,2.0
2025-01-04,-0.354393,1.490735,1.790957,5.0,3.0
2025-01-05,-0.449488,-1.59751,-1.110994,5.0,4.0
2025-01-06,1.685621,0.513039,0.514706,5.0,5.0


In [113]:
df4 = df.copy()
df4[df4 > 0] = -df4
df4

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-1.02079,-5.0,
2025-01-02,0.0,-0.017665,-2.565648,-5.0,-1.0
2025-01-03,-3.096719,-0.117656,-1.409707,-5.0,-2.0
2025-01-04,-0.354393,-1.490735,-1.790957,-5.0,-3.0
2025-01-05,-0.449488,-1.59751,-1.110994,-5.0,-4.0
2025-01-06,-1.685621,-0.513039,-0.514706,-5.0,-5.0


### Missing Data

In [120]:
df_ = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df_.loc[dates[0] : dates[1], "E"] = 1
df_

Unnamed: 0,A,B,C,D,F,E
2025-01-01,0.0,0.0,1.02079,5.0,,1.0
2025-01-02,0.0,-0.017665,-2.565648,5.0,1.0,1.0
2025-01-03,-3.096719,0.117656,1.409707,5.0,2.0,
2025-01-04,-0.354393,1.490735,1.790957,5.0,3.0,


In [117]:
df_.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2025-01-02,0.0,-0.017665,-2.565648,5.0,1.0,1.0


In [118]:
df_.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2025-01-01,0.0,0.0,1.02079,5.0,5.0,1.0
2025-01-02,0.0,-0.017665,-2.565648,5.0,1.0,1.0
2025-01-03,-3.096719,0.117656,1.409707,5.0,2.0,5.0
2025-01-04,-0.354393,1.490735,1.790957,5.0,3.0,5.0


In [121]:
pd.isna(df_)

Unnamed: 0,A,B,C,D,F,E
2025-01-01,False,False,False,False,True,False
2025-01-02,False,False,False,False,False,False
2025-01-03,False,False,False,False,False,True
2025-01-04,False,False,False,False,False,True


### Operations

#### Stats

In [122]:
df.mean

<bound method DataFrame.mean of                    A         B         C    D    F
2025-01-01  0.000000  0.000000  1.020790  5.0  NaN
2025-01-02  0.000000 -0.017665 -2.565648  5.0  1.0
2025-01-03 -3.096719  0.117656  1.409707  5.0  2.0
2025-01-04 -0.354393  1.490735  1.790957  5.0  3.0
2025-01-05 -0.449488 -1.597510 -1.110994  5.0  4.0
2025-01-06  1.685621  0.513039  0.514706  5.0  5.0>

In [125]:
df.mean(axis=0)

A   -0.369163
B    0.084376
C    0.176586
D    5.000000
F    3.000000
dtype: float64

In [128]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2025-01-01    NaN
2025-01-02    NaN
2025-01-03    1.0
2025-01-04    3.0
2025-01-05    5.0
2025-01-06    NaN
Freq: D, dtype: float64

In [129]:
df.sub(s, axis="index")
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,1.02079,5.0,
2025-01-02,0.0,-0.017665,-2.565648,5.0,1.0
2025-01-03,-3.096719,0.117656,1.409707,5.0,2.0
2025-01-04,-0.354393,1.490735,1.790957,5.0,3.0
2025-01-05,-0.449488,-1.59751,-1.110994,5.0,4.0
2025-01-06,1.685621,0.513039,0.514706,5.0,5.0


#### User defined function

In [130]:
df.agg(lambda x: np.mean(x) * 5.6)

A    -2.067313
B     0.472504
C     0.988884
D    28.000000
F    16.800000
dtype: float64

In [131]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,103.303932,506.0,
2025-01-02,0.0,-1.787714,-259.643606,506.0,101.2
2025-01-03,-313.387924,11.906751,142.662389,506.0,202.4
2025-01-04,-35.864525,150.862404,181.244837,506.0,303.6
2025-01-05,-45.488178,-161.668052,-112.432563,506.0,404.8
2025-01-06,170.584809,51.919551,52.088262,506.0,506.0


#### Value counts

In [137]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
s.value_counts()

3    3
1    2
2    2
6    2
4    1
Name: count, dtype: int64

#### String methods

In [138]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object