# FUNDAMENTAL OF PANDAS

# <span style="color:Orange">Basic data structures in pandas</span>

### Series: a one-dimensional labeled array holding data of any type such as integers, strings, Python objects etc.
### DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

# Series

In [424]:
import pandas as pd
import numpy as np
data = [10, 20, 30, 40, 50]
index = ['A', 'B', 'C', 'D', 'E']

s = pd.Series(data, index=index)
print(s)

A    10
B    20
C    30
D    40
E    50
dtype: int64


### From ndarray

In [425]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
s

a    0.143273
b    0.687236
c   -0.617199
d   -1.043313
e   -0.293267
dtype: float64

In [426]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [427]:
pd.Series(np.random.randn(5))

0   -0.552560
1    0.042615
2   -0.475744
3   -1.287017
4   -0.824705
dtype: float64

### From dict

In [428]:
d = {"b": 1, "a": 0, "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [429]:
d = {"a": 0.0, "b": 1.0, "c": 2.0}

pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

### From scalar value

In [430]:
pd.Series(5.0, index=["a", "b", "c", "d", "e"])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

### Series is ndarray-like

In [431]:
s.iloc[0]

0.1432731291207189

In [432]:
s.iloc[:3]

a    0.143273
b    0.687236
c   -0.617199
dtype: float64

In [433]:
s[s > s.median()]

a    0.143273
b    0.687236
dtype: float64

In [434]:
s.iloc[[4, 3, 1]]

e   -0.293267
d   -1.043313
b    0.687236
dtype: float64

In [435]:
np.exp(s)

a    1.154045
b    1.988213
c    0.539453
d    0.352286
e    0.745823
dtype: float64

### Like a NumPy array, a pandas Series has a single dtype.

In [436]:
s.dtype

dtype('float64')

### if you need the actual array backing a Series, use Series.array.


In [437]:
s.array

<NumpyExtensionArray>
[  0.1432731291207189,   0.6872363216535309,   -0.617198978784513,
  -1.0433128833495142, -0.29326704914803065]
Length: 5, dtype: float64

### While Series is ndarray-like, if you need an actual ndarray, then use Series.to_numpy().

In [438]:
s.to_numpy()

array([ 0.14327313,  0.68723632, -0.61719898, -1.04331288, -0.29326705])

### Series is dict-like

In [439]:
s["a"]

0.1432731291207189

In [440]:
s["e"] = 12.0
s

a     0.143273
b     0.687236
c    -0.617199
d    -1.043313
e    12.000000
dtype: float64

In [441]:
"e" in s

True

In [442]:
"f" in s

False

### Using the Series.get() method, a missing label will return None or specified default.

In [443]:
s.get("f")
s.get("f", np.nan)

nan

###  Vectorized operations and label alignment with Series

In [444]:
s + s

a     0.286546
b     1.374473
c    -1.234398
d    -2.086626
e    24.000000
dtype: float64

In [445]:
s * 2

a     0.286546
b     1.374473
c    -1.234398
d    -2.086626
e    24.000000
dtype: float64

In [446]:
np.exp(s)

a         1.154045
b         1.988213
c         0.539453
d         0.352286
e    162754.791419
dtype: float64

In [447]:
s.iloc[1:] + s.iloc[:-1]

a         NaN
b    1.374473
c   -1.234398
d   -2.086626
e         NaN
dtype: float64

### Name attribute

In [448]:
s = pd.Series(np.random.randn(5), name="something")
s

0    1.189244
1   -2.142157
2   -2.005858
3   -0.872330
4   -1.074540
Name: something, dtype: float64

In [449]:
s.name

'something'

###  You can rename a Series with the pandas.Series.rename() method.

In [450]:
s2 = s.rename("different")

s2.name

'different'

# DataFrame

### From dict of Series or dicts

In [451]:
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}


df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [452]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [453]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [454]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [455]:
df.columns

Index(['one', 'two'], dtype='object')

### From dict of ndarrays / lists

In [456]:
d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}

pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [457]:
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


###  From structured or record array

In [458]:
ata = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")])

data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

pd.DataFrame(data)

Unnamed: 0,0,1,2
0,1,2.0,Hello
1,2,3.0,World


In [459]:
pd.DataFrame(data, index=["first", "second"])

Unnamed: 0,0,1,2
first,1,2.0,Hello
second,2,3.0,World


In [460]:
pd.DataFrame(data, columns=["C", "A", "B"])

Unnamed: 0,C,A,B
0,1,2.0,Hello
1,2,3.0,World


### From a list of dicts`

In [461]:
data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}]

pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [462]:
pd.DataFrame(data2, index=["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [463]:
pd.DataFrame(data2, columns=["a", "b"])

Unnamed: 0,a,b
0,1,2
1,5,10


### From a dict of tuples

In [464]:
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


### From a Series

In [465]:
ser = pd.Series(range(3), index=list("abc"), name="ser")

pd.DataFrame(ser)

Unnamed: 0,ser
a,0
b,1
c,2


###  From a list of namedtuples

In [466]:
from collections import namedtuple

Point = namedtuple("Point", "x y")

pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [467]:
Point3D = namedtuple("Point3D", "x y z")

pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

Unnamed: 0,x,y,z
0,0,0,0.0
1,0,3,5.0
2,2,3,


### From a list of dataclasses

In [468]:
from dataclasses import make_dataclass

Point = make_dataclass("Point", [("x", int), ("y", int)])

pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


# Alternate constructors

### DataFrame.from_dict

In [469]:
In [68]: pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [470]:
pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],
)

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


### DataFrame.from_records

In [471]:
data

[(1, 2.0, 'Hello'), (2, 3.0, 'World')]

### Column selection, addition, deletion

In [472]:
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [473]:
df["three"] = df["one"] * df["two"]
df["flag"] = df["one"] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [474]:
del df["two"]

three = df.pop("three")
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [475]:
df["foo"] = "bar"

df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [476]:
df["one_trunc"] = df["one"][:2]

df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [477]:
df.insert(1, "bar", df["one"])

df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


### Data alignment and arithmetic

In [478]:
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])

df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])

df + df2

Unnamed: 0,A,B,C,D
0,1.393092,0.42895,1.502109,
1,-0.877164,-1.245423,0.860626,
2,0.136225,-1.22291,-0.226088,
3,0.490335,0.984914,-1.122156,
4,1.266195,-3.264793,-1.993123,
5,0.191501,1.153428,-0.699272,
6,-1.480862,-0.99785,-2.556474,
7,,,,
8,,,,
9,,,,


When doing an operation between DataFrame and Series, the default behavior is to align the Series index on the DataFrame columns, thus broadcasting row-wise. For example:

In [479]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.182392,-1.07918,0.25394,0.222822
2,0.104509,-2.275946,-1.569887,1.644206
3,-0.478562,-0.729489,-0.118837,0.205118
4,0.796519,-2.909664,-1.728866,1.314251
5,-1.007168,-1.059033,-1.123347,-0.381412
6,-2.11892,0.048611,-1.263007,2.15798
7,0.37157,-1.669234,0.021939,-0.724113
8,-0.2572,0.045436,-1.354531,0.173711
9,-1.104528,0.896014,-1.884129,1.257702


### Arithmetic operations with scalars operate element-wise:

In [480]:
df * 5 + 2

Unnamed: 0,A,B,C,D
0,5.447955,7.363724,6.715703,-2.631325
1,4.535994,1.967823,7.985403,-1.517214
2,5.970499,-4.016004,-1.133733,5.589704
3,3.055145,3.71628,6.121517,-1.605736
4,9.43055,-7.184597,-1.928628,3.939931
5,0.412113,2.068559,1.098969,-4.538385
6,-5.146644,7.606777,0.400668,8.158574
7,7.305804,-0.982445,6.825396,-6.251891
8,4.161954,7.590904,-0.056954,-1.762772
9,-0.074687,11.843792,-2.704942,3.657184


In [481]:
1 / df

Unnamed: 0,A,B,C,D
0,1.450135,0.932188,1.060287,-1.079605
1,1.971613,-155.390744,0.835366,-1.42158
2,1.259288,-0.831116,-1.595541,1.392872
3,4.738686,2.913277,1.213145,-1.386679
4,0.672898,-0.54439,-1.272709,2.577411
5,-3.148839,72.929588,-5.549197,-0.764715
6,-0.699629,0.891778,-3.126305,0.811876
7,0.942364,-1.676477,1.036184,-0.605922
8,2.312722,0.89431,-2.430779,-1.328808
9,-2.410002,0.507934,-1.062712,3.017167


In [482]:
df ** 4

Unnamed: 0,A,B,C,D
0,0.226134,1.324298,0.791236,0.736107
1,0.066178,1.715139e-09,2.053494,0.244858
2,0.397649,2.095812,0.154301,0.265677
3,0.001983,0.01388267,0.461688,0.270456
4,4.877573,11.38572,0.381139,0.02266
5,0.010172,3.534964e-08,0.001055,2.924165
6,4.17377,1.58115,0.010468,2.301657
7,1.268016,0.1265931,0.867465,7.418803
8,0.034955,1.563321,0.028643,0.320739
9,0.029644,15.02345,0.784038,0.012067


### Boolean operators operate element-wise as well:

In [483]:
df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)

df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)

df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [484]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [485]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [486]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


### Transposing

In [487]:
df[:5].T

Unnamed: 0,0,1,2,3,4
A,0.689591,0.507199,0.7941,0.211029,1.48611
B,1.072745,-0.006435,-1.203201,0.343256,-1.836919
C,0.943141,1.197081,-0.626747,0.824303,-0.785726
D,-0.926265,-0.703443,0.717941,-0.721147,0.387986


### DataFrame interoperability with NumPy functions

In [488]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,1.9929,2.923392,2.568034,0.39603
1,1.660633,0.993585,3.310438,0.494879
2,2.212448,0.300232,0.534327,2.050207
3,1.234948,1.40953,2.280292,0.486194
4,4.419869,0.159307,0.455789,1.47401
5,0.72791,1.013806,0.835098,0.270448
6,0.23947,3.069011,0.726246,3.427101
7,2.889723,0.550742,2.624996,0.191977
8,1.540937,3.059284,0.662728,0.471161
9,0.660382,7.161779,0.390242,1.392968


In [489]:
ser = pd.Series([1, 2, 3, 4])

np.exp(ser)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [490]:
ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"])

ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"])

ser1

a    1
b    2
c    3
dtype: int64

In [491]:
ser2

b    1
a    3
c    5
dtype: int64

In [492]:
np.remainder(ser1, ser2)

a    1
b    0
c    3
dtype: int64

As usual, the union of the two indices is taken, and non-overlapping values are filled with missing values.

In [493]:
ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"])

ser3

b    2
c    4
d    6
dtype: int64

In [494]:
np.remainder(ser1, ser3)

a    NaN
b    0.0
c    3.0
d    NaN
dtype: float64

When a binary ufunc is applied to a Series and Index, the Series implementation takes precedence and a Series is returned.

In [495]:
ser = pd.Series([1, 2, 3])

idx = pd.Index([4, 5, 6])

np.maximum(ser, idx)

0    4
1    5
2    6
dtype: int64

# <span style="color:Orange">Essential basic functionality</span>

In [496]:
index = pd.date_range("1/1/2000", periods=8)

s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])

### Head and tail

In [497]:
long_series = pd.Series(np.random.randn(1000))

long_series.head()

0   -0.005006
1    0.189356
2    0.863760
3   -0.410021
4   -0.414845
dtype: float64

In [498]:
long_series.tail(3)

997   -0.384218
998   -0.752685
999   -1.774510
dtype: float64

###  Attributes and underlying data

In [499]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.859613,1.065848,0.526059
2000-01-02,-0.535654,1.436251,0.758054


In [500]:
df.columns = [x.lower() for x in df.columns]

df

Unnamed: 0,a,b,c
2000-01-01,0.859613,1.065848,0.526059
2000-01-02,-0.535654,1.436251,0.758054
2000-01-03,0.564847,1.350239,0.552796
2000-01-04,1.207824,1.16203,0.437733
2000-01-05,-0.154335,0.748899,0.160679
2000-01-06,2.522307,-0.81192,-0.976934
2000-01-07,0.119246,-0.079344,-2.057474
2000-01-08,-0.626845,-1.219718,-0.540076


To get the actual data inside a Index or Series, use the .array property

In [501]:
s.array

<NumpyExtensionArray>
[ 1.1505563744302605,  -2.262206439691548, -0.6524285981088137,
  0.6971280541593268, 0.08849390643287247]
Length: 5, dtype: float64

In [502]:
s.index.array

<NumpyExtensionArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

if you know you need a NumPy array, use to_numpy() or numpy.asarray().

In [503]:
s.to_numpy()

array([ 1.15055637, -2.26220644, -0.6524286 ,  0.69712805,  0.08849391])

In [504]:
np.asarray(s)

array([ 1.15055637, -2.26220644, -0.6524286 ,  0.69712805,  0.08849391])

An object-dtype numpy.ndarray with Timestamp objects, each with the correct tz

A datetime64 -dtype numpy.ndarray, where the values have been converted to UTC and the timezone discarded

Timezones may be preserved with dtype=object

In [505]:
er = pd.Series(pd.date_range("2000", periods=2, tz="CET"))

ser.to_numpy(dtype=object)

array([1, 2, 3], dtype=object)

Or thrown away with dtype=datetime64

In [506]:
ser.to_numpy(dtype="datetime64[ns]")

array(['1970-01-01T00:00:00.000000001', '1970-01-01T00:00:00.000000002',
       '1970-01-01T00:00:00.000000003'], dtype='datetime64[ns]')

### Flexible binary operations
### Matching / broadcasting behavior

In [507]:
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]),
        "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]),
    }
)


df

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [508]:
row = df.iloc[1]

column = df["two"]

df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,1.996197,1.039838,
b,0.0,0.0,0.0
c,1.030097,0.410139,0.888265
d,,1.604874,0.091949


In [509]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,1.996197,1.039838,
b,0.0,0.0,0.0
c,1.030097,0.410139,0.888265
d,,1.604874,0.091949


In [510]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,1.250692,0.0,
b,0.294333,0.0,2.129803
c,0.914291,0.0,2.607929
d,,0.0,0.616877


In [511]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,1.250692,0.0,
b,0.294333,0.0,2.129803
c,0.914291,0.0,2.607929
d,,0.0,0.616877


### Furthermore you can align a level of a MultiIndexed DataFrame with a Series.`

In [512]:
dfmi = df.copy()

dfmi.index = pd.MultiIndex.from_tuples(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"]
)


dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,1.250692,0.0,
1,b,0.294333,0.0,2.129803
1,c,0.914291,0.0,2.607929
2,a,,0.565037,1.181914


In [513]:
s = pd.Series(np.arange(10))

s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [514]:
div, rem = divmod(s, 3)

div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [515]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [516]:
idx = pd.Index(np.arange(10))
idx

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')

In [517]:
div, rem = divmod(idx, 3)
div

Index([0, 0, 0, 1, 1, 1, 2, 2, 2, 3], dtype='int32')

In [518]:
rem

Index([0, 1, 2, 0, 1, 2, 0, 1, 2, 0], dtype='int32')

### We can also do elementwise divmod().

In [519]:
div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6])

div

0    0
1    0
2    0
3    1
4    1
5    1
6    1
7    1
8    1
9    1
dtype: int32

In [520]:
rem

0    0
1    1
2    2
3    0
4    0
5    1
6    1
7    2
8    2
9    3
dtype: int32

###  Missing data / operations with fill values

In [521]:
df2 = df.copy()

df2.loc["a", "three"] = 1.0

df

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [522]:
df2

Unnamed: 0,one,two,three
a,0.927875,-0.322818,1.0
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [523]:
df + df2

Unnamed: 0,one,two,three
a,1.855749,-0.645635,
b,-2.136645,-2.725311,1.534294
c,-0.076451,-1.905033,3.310824
d,,0.484438,1.718193


In [524]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,1.855749,-0.645635,1.0
b,-2.136645,-2.725311,1.534294
c,-0.076451,-1.905033,3.310824
d,,0.484438,1.718193


### Flexible comparisons.

In [525]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [526]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


### Boolean reductions

In [527]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [528]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [529]:
(df > 0).any().any()

True

In [530]:
df.empty

False

In [531]:
pd.DataFrame(columns=list("ABC")).empty

True

### Comparing if objects are equivalent

In [532]:
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [533]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [534]:
np.nan == np.nan

False

In [535]:
(df + df).equals(df * 2)

True

In [536]:
df1 = pd.DataFrame({"col": ["foo", 0, np.nan]})

df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0])

df1.equals(df2)

False

In [537]:
df1.equals(df2.sort_index())

True

### Comparing array-like objects

In [538]:
pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [539]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [540]:
pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [541]:
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

### Combining overlapping data sets

In [542]:
df1 = pd.DataFrame(
    {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)


df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)


df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [543]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


### General DataFrame combine

In [544]:
def combiner(x, y):
    return np.where(pd.isna(x), y, x)


df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


### Descriptive statistics

In [545]:
df

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [546]:
df.mean(0)

one     -0.059558
two     -0.598943
three    1.093885
dtype: float64

In [547]:
df.mean(1)

a    0.302528
b   -0.554610
c    0.221557
d    0.550658
dtype: float64

In [548]:
df.sum(0, skipna=False)

one           NaN
two     -2.395771
three         NaN
dtype: float64

In [549]:
df.sum(axis=1, skipna=True)

a    0.605057
b   -1.663831
c    0.664670
d    1.101315
dtype: float64

In [550]:
ts_stand = (df - df.mean()) / df.std()

ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [551]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [552]:
df.cumsum()

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-0.140448,-1.685473,0.767147
c,-0.178673,-2.63799,2.422559
d,,-2.395771,3.281656


### Summarizing data: describe

In [553]:
series = pd.Series(np.random.randn(1000))

series[::2] = np.nan

series.describe()

count    500.000000
mean       0.009257
std        1.048106
min       -2.608805
25%       -0.676571
50%        0.012644
75%        0.711840
max        3.225748
dtype: float64

In [554]:
frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])

frame.iloc[::2] = np.nan

frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.018548,0.057937,0.002676,0.013466,0.04573
std,0.998693,0.981604,0.959906,0.975855,1.027199
min,-3.618929,-2.658083,-3.994919,-2.967351,-3.576666
25%,-0.687559,-0.582694,-0.622716,-0.615652,-0.600142
50%,-0.024709,0.049265,-0.007178,-0.059017,0.037628
75%,0.708873,0.717832,0.686585,0.699941,0.692475
max,2.550536,4.163885,2.672372,2.767437,2.818727


In [555]:
series.describe(percentiles=[0.05, 0.25, 0.75, 0.95])

count    500.000000
mean       0.009257
std        1.048106
min       -2.608805
5%        -1.708816
25%       -0.676571
50%        0.012644
75%        0.711840
95%        1.648296
max        3.225748
dtype: float64

In [556]:
s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"])

s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [557]:
frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})

frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [558]:
frame.describe(include=["object"])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [559]:
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


### Index of min/max values

In [560]:
s1 = pd.Series(np.random.randn(5))

s1

0    0.567139
1    0.449701
2   -0.558435
3    0.983196
4    0.747561
dtype: float64

In [561]:
s1.idxmin(), s1.idxmax()

(2, 3)

In [562]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"])

df1

Unnamed: 0,A,B,C
0,-1.627872,0.166063,-0.406416
1,-0.098068,0.847834,-0.389355
2,0.862009,-0.684047,-0.46797
3,0.064373,-0.387549,0.988016
4,-1.403562,-0.973347,-0.371322


In [563]:
df1.idxmin(axis=0)

A    0
B    4
C    2
dtype: int64

In [564]:
df1.idxmax(axis=1)

0    B
1    B
2    A
3    C
4    C
dtype: object

In [565]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba"))

df3

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [566]:
df3["A"].idxmin()

'd'

### Value counts (histogramming) / mode

In [567]:
ata = np.random.randint(0, 7, size=50)

data

[(1, 2.0, 'Hello'), (2, 3.0, 'World')]

In [568]:
s = pd.Series(data)

s.value_counts()

(1, 2.0, Hello)    1
(2, 3.0, World)    1
Name: count, dtype: int64

In [569]:
data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}

frame = pd.DataFrame(data)

frame.value_counts()

a  b
1  x    1
2  x    1
3  y    1
4  y    1
Name: count, dtype: int64

In [570]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

s5.mode()

0    3
1    7
dtype: int64

In [571]:
df5 = pd.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)


df5.mode()

Unnamed: 0,A,B
0,5.0,-10
1,,5


### Discretization and quantiling

In [572]:
arr = np.random.randn(20)

factor = pd.cut(arr, 4)

factor

[(0.223, 0.842], (0.223, 0.842], (0.223, 0.842], (0.842, 1.461], (0.223, 0.842], ..., (-0.395, 0.223], (-0.395, 0.223], (0.223, 0.842], (0.842, 1.461], (-1.017, -0.395]]
Length: 20
Categories (4, interval[float64, right]): [(-1.017, -0.395] < (-0.395, 0.223] < (0.223, 0.842] < (0.842, 1.461]]

In [573]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])
factor

[(0, 1], (0, 1], (0, 1], (1, 5], (0, 1], ..., (-1, 0], (0, 1], (0, 1], (0, 1], (-1, 0]]
Length: 20
Categories (4, interval[int64, right]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [574]:
arr = np.random.randn(30)

factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1])

factor

[(-0.751, -0.437], (0.654, 2.002], (-0.751, -0.437], (0.654, 2.002], (-0.437, 0.654], ..., (-0.751, -0.437], (-3.4099999999999997, -0.751], (-3.4099999999999997, -0.751], (0.654, 2.002], (-0.437, 0.654]]
Length: 30
Categories (4, interval[float64, right]): [(-3.4099999999999997, -0.751] < (-0.751, -0.437] < (-0.437, 0.654] < (0.654, 2.002]]

In [575]:
arr = np.random.randn(20)

factor = pd.cut(arr, [-np.inf, 0, np.inf])

factor

[(0.0, inf], (0.0, inf], (-inf, 0.0], (0.0, inf], (-inf, 0.0], ..., (-inf, 0.0], (0.0, inf], (0.0, inf], (-inf, 0.0], (0.0, inf]]
Length: 20
Categories (2, interval[float64, right]): [(-inf, 0.0] < (0.0, inf]]

### Function application
### Tablewise function application

In [576]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column
    """
    df["city_name"] = df["city_and_code"].str.split(",").str.get(0)
    return df


def add_country_name(df, country_name=None):
    """
    Chicago -> Chicago-US for city_name column
    """
    col = "city_name"
    df["city_and_country"] = df[col] + country_name
    return df


df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})

In [577]:
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [578]:
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


### Row or column-wise function application

In [579]:
df.apply(lambda x: np.mean(x))

one     -0.059558
two     -0.598943
three    1.093885
dtype: float64

In [580]:
df.apply(lambda x: np.mean(x), axis=1)

a    0.302528
b   -0.554610
c    0.221557
d    0.550658
dtype: float64

In [581]:
df.apply(lambda x: x.max() - x.min())

one      1.996197
two      1.604874
three    0.888265
dtype: float64

In [582]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-0.140448,-1.685473,0.767147
c,-0.178673,-2.63799,2.422559
d,,-2.395771,3.281656


In [583]:
df.apply(np.exp)

Unnamed: 0,one,two,three
a,2.529128,0.724106,
b,0.343584,0.25598,2.153614
c,0.962496,0.385769,5.235237
d,,1.274073,2.361026


In [584]:
df.apply("mean")

one     -0.059558
two     -0.598943
three    1.093885
dtype: float64

In [585]:
df.apply("mean", axis=1)

a    0.302528
b   -0.554610
c    0.221557
d    0.550658
dtype: float64

### Aggregation API

In [586]:
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)


tsdf.iloc[3:7] = np.nan

tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.323053,0.675285,0.105912
2000-01-02,2.38749,1.864163,-0.21406
2000-01-03,0.674005,0.126785,0.189673
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,-0.138951,0.518695,1.876069
2000-01-09,-0.293241,-0.330951,1.040753
2000-01-10,-1.422656,-0.416822,-0.255735


In [587]:
tsdf.agg(lambda x: np.sum(x))

A    0.883593
B    2.437155
C    2.742612
dtype: float64

In [588]:
tsdf.agg("sum")

A    0.883593
B    2.437155
C    2.742612
dtype: float64

### Aggregating with multiple functions

In [589]:
tsdf.agg(["sum"])


Unnamed: 0,A,B,C
sum,0.883593,2.437155,2.742612


In [590]:
tsdf.agg(["sum", "mean"])

Unnamed: 0,A,B,C
sum,0.883593,2.437155,2.742612
mean,0.147266,0.406192,0.457102


In [591]:
tsdf["A"].agg(["sum", lambda x: x.mean()])

sum         0.883593
<lambda>    0.147266
Name: A, dtype: float64

In [592]:
def mymean(x):
    return x.mean()

In [593]:
tsdf["A"].agg(["sum", mymean])

sum       0.883593
mymean    0.147266
Name: A, dtype: float64

### Aggregating with a dict

In [594]:
tsdf.agg({"A": "mean", "B": "sum"})

A    0.147266
B    2.437155
dtype: float64

In [595]:
tsdf.agg({"A": ["mean", "min"], "B": "sum"})

Unnamed: 0,A,B
mean,0.147266,
min,-1.422656,
sum,,2.437155


### Transform API

In [596]:
tsdf = pd.DataFrame(
    np.random.randn(10, 3),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)

In [597]:
tsdf.iloc[3:7] = np.nan

In [598]:
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2000-01-01,0.883127,1.770707,0.393395
2000-01-02,0.546605,1.940649,2.027397
2000-01-03,0.911528,0.382833,0.024728
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.979027,0.069452,0.675468
2000-01-09,0.755077,0.654629,0.026828
2000-01-10,0.726856,0.852822,1.066967


In [599]:
tsdf.transform("abs")

Unnamed: 0,A,B,C
2000-01-01,0.883127,1.770707,0.393395
2000-01-02,0.546605,1.940649,2.027397
2000-01-03,0.911528,0.382833,0.024728
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.979027,0.069452,0.675468
2000-01-09,0.755077,0.654629,0.026828
2000-01-10,0.726856,0.852822,1.066967


In [600]:
tsdf.transform(lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,0.883127,1.770707,0.393395
2000-01-02,0.546605,1.940649,2.027397
2000-01-03,0.911528,0.382833,0.024728
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.979027,0.069452,0.675468
2000-01-09,0.755077,0.654629,0.026828
2000-01-10,0.726856,0.852822,1.066967


In [601]:
np.abs(tsdf)

Unnamed: 0,A,B,C
2000-01-01,0.883127,1.770707,0.393395
2000-01-02,0.546605,1.940649,2.027397
2000-01-03,0.911528,0.382833,0.024728
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.979027,0.069452,0.675468
2000-01-09,0.755077,0.654629,0.026828
2000-01-10,0.726856,0.852822,1.066967


### Transform with multiple functions

In [602]:
tsdf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,0.883127,1.883127,1.770707,2.770707,0.393395,0.606605
2000-01-02,0.546605,1.546605,1.940649,-0.940649,2.027397,-1.027397
2000-01-03,0.911528,0.088472,0.382833,1.382833,0.024728,1.024728
2000-01-04,,,,,,
2000-01-05,,,,,,
2000-01-06,,,,,,
2000-01-07,,,,,,
2000-01-08,1.979027,-0.979027,0.069452,0.930548,0.675468,1.675468
2000-01-09,0.755077,0.244923,0.654629,0.345371,0.026828,0.973172
2000-01-10,0.726856,1.726856,0.852822,0.147178,1.066967,2.066967


In [603]:
tsdf["A"].transform([np.abs, lambda x: x + 1])

Unnamed: 0,absolute,<lambda>
2000-01-01,0.883127,1.883127
2000-01-02,0.546605,1.546605
2000-01-03,0.911528,0.088472
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.979027,-0.979027
2000-01-09,0.755077,0.244923
2000-01-10,0.726856,1.726856


### Transforming with a dict

In [604]:
tsdf.transform({"A": np.abs, "B": lambda x: x + 1})

Unnamed: 0,A,B
2000-01-01,0.883127,2.770707
2000-01-02,0.546605,-0.940649
2000-01-03,0.911528,1.382833
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.979027,0.930548
2000-01-09,0.755077,0.345371
2000-01-10,0.726856,0.147178


### Applying elementwise functions

In [605]:
df4 = df.copy()

df4

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [606]:
def f(x):
    return len(str(x))

In [607]:
df4["one"].map(f)

a    18
b    19
c    21
d     3
Name: one, dtype: int64

In [608]:
df4.map(f)

Unnamed: 0,one,two,three
a,18,18,3
b,19,19,18
c,21,19,18
d,3,19,18


### Reindexing and altering labels

In [609]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

s

a    0.911546
b   -0.191905
c   -1.020426
d   -0.501203
e   -0.694681
dtype: float64

In [610]:
s.reindex(["e", "b", "f", "d"])

e   -0.694681
b   -0.191905
f         NaN
d   -0.501203
dtype: float64

In [611]:
df

Unnamed: 0,one,two,three
a,0.927875,-0.322818,
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412
d,,0.242219,0.859096


In [612]:
df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"])

Unnamed: 0,three,two,one
c,1.655412,-0.952517,-0.038225
f,,,
b,0.767147,-1.362655,-1.068322


In [613]:
rs = s.reindex(df.index)

In [614]:
rs.index is df.index

True

### Reindexing to align with another object

In [615]:
df2 = df.reindex(["a", "b", "c"], columns=["one", "two"])

df3 = df2 - df2.mean()

df2

Unnamed: 0,one,two
a,0.927875,-0.322818
b,-1.068322,-1.362655
c,-0.038225,-0.952517


In [616]:
df3

Unnamed: 0,one,two
a,0.987432,0.556512
b,-1.008765,-0.483326
c,0.021332,-0.073187


In [617]:
df.reindex_like(df2)

Unnamed: 0,one,two
a,0.927875,-0.322818
b,-1.068322,-1.362655
c,-0.038225,-0.952517


### Dropping labels from an axis

In [618]:
df.drop(["a", "d"], axis=0)

Unnamed: 0,one,two,three
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412


In [619]:
df.drop(["one"], axis=1)

Unnamed: 0,two,three
a,-0.322818,
b,-1.362655,0.767147
c,-0.952517,1.655412
d,0.242219,0.859096


In [620]:
df.reindex(df.index.difference(["a", "d"]))

Unnamed: 0,one,two,three
b,-1.068322,-1.362655,0.767147
c,-0.038225,-0.952517,1.655412


# <span style="color:Orange">IO tools (CSV, text, HDF5, …)</span>
### CSV & text files

In [621]:
import pandas as pd

from io import StringIO

data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [622]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"])

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [623]:
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

Unnamed: 0,col1,col2,col3
0,a,b,2


### Specifying column data types

In [624]:
data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [625]:
df = pd.read_csv(StringIO(data), dtype=object)

df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [626]:
df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"})

df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

### Specifying categorical dtype

In [627]:
pd.read_csv(StringIO(data), dtype="category").dtypes

a    category
b    category
c    category
d    category
dtype: object

In [628]:
pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes


a      int64
b      int64
c      int64
d    float64
dtype: object

In [629]:
from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True)

pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes


a      int64
b      int64
c      int64
d    float64
dtype: object

In [630]:
dtype = CategoricalDtype(["a", "b", "d"])  # No 'c'

pd.read_csv(StringIO(data), dtype={"col1": dtype})

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


### Naming and using columns

In [631]:
data = "a,b,c\n1,2,3\n4,5,6\n7,8,9"

print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [632]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [633]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [634]:
data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9"

pd.read_csv(StringIO(data), header=1)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### Duplicate names parsing

In [635]:
data = "a,b,a\n0,1,2\n3,4,5"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,a.1
0,0,1,2
1,3,4,5


### Filtering columns (usecols)

In [636]:
data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [637]:
pd.read_csv(StringIO(data), usecols=["b", "d"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [638]:
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [639]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"])

Unnamed: 0,a,c
0,1,3
1,4,6
2,7,9


In [640]:
pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"])


Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


### Comments and empty lines

In [641]:
data = "\na,b,c\n  \n# commented line\n1,2,3\n\n4,5,6"

print(data)


a,b,c
  
# commented line
1,2,3

4,5,6


In [642]:
data = "a,b,c\n\n1,2,3\n\n\n4,5,6"

pd.read_csv(StringIO(data), skip_blank_lines=False)

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


In [643]:
data = "#comment\na,b,c\nA,B,C\n1,2,3"

pd.read_csv(StringIO(data), comment="#", header=1)

Unnamed: 0,A,B,C
0,1,2,3


In [644]:
data = (
    "# empty\n"
    "# second empty line\n"
    "# third emptyline\n"
    "X,Y,Z\n"
    "1,2,3\n"
    "A,B,C\n"
    "1,2.,4.\n"
    "5.,NaN,10.0\n"
)
print(data)

# empty
# second empty line
# third emptyline
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0



In [645]:
pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1)

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,5.0,,10.0


In [646]:
data = (
    "ID,level,category\n"
    "Patient1,123000,x # really unpleasant\n"
    "Patient2,23000,y # wouldn't take his medicine\n"
    "Patient3,1234018,z # awesome"
)


with open("tmp.csv", "w") as fh:
    fh.write(data)


print(open("tmp.csv").read())

ID,level,category
Patient1,123000,x # really unpleasant
Patient2,23000,y # wouldn't take his medicine
Patient3,1234018,z # awesome


### Dealing with Unicode data

In [647]:
from io import BytesIO

data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5"

data = data.decode("utf8").encode("latin-1")

df = pd.read_csv(BytesIO(data), encoding="latin-1")

df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


### Date Handling

### Specifying date columns

In [648]:
with open("foo.csv", mode="w") as f:
    f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5")


# Use a column as an index, and parse it as dates.
df = pd.read_csv("foo.csv", index_col=0, parse_dates=True)

df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


### Inferring datetime format


In [649]:
f = pd.read_csv(
    "foo.csv",
    index_col=0,
    parse_dates=True,
)

In [650]:
df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [651]:
data = StringIO("date\n12 Jan 2000\n2000-01-13\n")

df = pd.read_csv(data)

df['date'] = pd.to_datetime(df['date'], format='mixed')

df

Unnamed: 0,date
0,2000-01-12
1,2000-01-13


In [652]:
data = StringIO("date\n2020-01-01\n2020-01-01 03:00\n")

df = pd.read_csv(data)

df['date'] = pd.to_datetime(df['date'], format='ISO8601')

df

Unnamed: 0,date
0,2020-01-01 00:00:00
1,2020-01-01 03:00:00


### International date formats

In [653]:
data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c"

print(data)

date,value,cat
1/6/2000,5,a
2/6/2000,10,b
3/6/2000,15,c


In [654]:
with open("tmp.csv", "w") as fh:
    fh.write(data)


pd.read_csv("tmp.csv", parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-01-06,5,a
1,2000-02-06,10,b
2,2000-03-06,15,c


In [655]:
pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-06-01,5,a
1,2000-06-02,10,b
2,2000-06-03,15,c


### Writing CSVs to binary file objects

In [656]:
import io

data = pd.DataFrame([0, 1, 2])

buffer = io.BytesIO()

data.to_csv(buffer, encoding="utf-8", compression="gzip")

### Specifying method for floating-point conversion

In [657]:
val = "0.3066101993807095471566981359501369297504425048828125"

data = "a,b,c\n1,2,{0}".format(val)

abs(
    pd.read_csv(
        StringIO(data),
        engine="c",
        float_precision=None,
    )["c"][0] - float(val)
)

Out[145]: 5.551115123125783e-17

abs(
    pd.read_csv(
        StringIO(data),
        engine="c",
        float_precision="high",
    )["c"][0] - float(val)
)

Out[146]: 5.551115123125783e-17

abs(
    pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0]
    - float(val)
)

0.0

### Thousand separators

In [658]:
ata = (
    "ID|level|category\n"
    "Patient1|123,000|x\n"
    "Patient2|23,000|y\n"
    "Patient3|1,234,018|z"
)


with open("tmp.csv", "w") as fh:
    fh.write(data)


df = pd.read_csv("tmp.csv", sep="|")

df

Unnamed: 0,"a,b,c"
0,"1,2,0.3066101993807095471566981359501369297504..."


In [659]:
df = pd.read_csv("tmp.csv", sep="|", thousands=",")

df

Unnamed: 0,"a,b,c"
0,120.30661


### Boolean values

In [660]:
data = "a,b,c\n1,Yes,2\n3,No,4"

print(data)

a,b,c
1,Yes,2
3,No,4


In [661]:
pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


### Indexes

In [662]:
data = "A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5"

print(data)

A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [663]:
with open("foo.csv", "w") as f:
    f.write(data)

### Reading multiple files to create a single DataFrame

### Iterating through files chunk by chunk

In [664]:
f = pd.DataFrame(np.random.randn(10, 4))

df.to_csv("tmp.csv", index=False)

table = pd.read_csv("tmp.csv")

table

Unnamed: 0,"a,b,c"
0,120.30661


In [665]:
with pd.read_csv("tmp.csv", chunksize=4) as reader:
    print(reader)
    for chunk in reader:
        print(chunk)

<pandas.io.parsers.readers.TextFileReader object at 0x000001F0BA9F7A90>
       a,b,c
0  120.30661


In [666]:
with pd.read_csv("tmp.csv", iterator=True) as reader:
    print(reader.get_chunk(5))


       a,b,c
0  120.30661


### Writing JSON

### Reading JSON

In [667]:
from io import StringIO

x= pd.read_json(StringIO(json))
x

Unnamed: 0,A,B
0,1.067983,-0.599627
1,0.026057,0.469738
2,-0.42916,-0.791922
3,0.334758,0.795126
4,0.422559,1.681068


In [668]:
dfj = pd.DataFrame(np.random.randn(5, 2), columns=list("AB"))

json = dfj.to_json()
json

'{"A":{"0":-0.3117274391,"1":0.5878692275,"2":-0.154671772,"3":-0.6529450768,"4":0.5961770655},"B":{"0":0.14459865,"1":0.3601155697,"2":-0.328597073,"3":-0.9108803855,"4":0.2350045525}}'

### Orient options

In [669]:
dfjo = pd.DataFrame(
    dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)),
    columns=list("ABC"),
    index=list("xyz"),
)


dfjo

Unnamed: 0,A,B,C
x,1,4,7
y,2,5,8
z,3,6,9


In [670]:
sjo = pd.Series(dict(x=15, y=16, z=17), name="D")
sjo


x    15
y    16
z    17
Name: D, dtype: int64

### Reading HTML content

In [671]:
url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
pd.read_html(url)

[                             Bank NameBank       CityCity StateSt  CertCert  \
 0    Republic First Bank dba Republic Bank   Philadelphia      PA     27332   
 1                            Citizens Bank       Sac City      IA      8758   
 2                 Heartland Tri-State Bank        Elkhart      KS     25851   
 3                      First Republic Bank  San Francisco      CA     59017   
 4                           Signature Bank       New York      NY     57053   
 ..                                     ...            ...     ...       ...   
 564                     Superior Bank, FSB       Hinsdale      IL     32646   
 565                    Malta National Bank          Malta      OH      6629   
 566        First Alliance Bank & Trust Co.     Manchester      NH     34264   
 567      National State Bank of Metropolis     Metropolis      IL      3815   
 568                       Bank of Honolulu       Honolulu      HI     21029   
 
                  Acquiring Institutio

### Reading XML

In [672]:
from io import StringIO

xml = """<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
  <book category="cooking">
    <title lang="en">Everyday Italian</title>
    <author>Giada De Laurentiis</author>
    <year>2005</year>
    <price>30.00</price>
  </book>
  <book category="children">
    <title lang="en">Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
  <book category="web">
    <title lang="en">Learning XML</title>
    <author>Erik T. Ray</author>
    <year>2003</year>
    <price>39.95</price>
  </book>
</bookstore>"""


df = pd.read_xml(StringIO(xml))

df

Unnamed: 0,category,title,author,year,price
0,cooking,Everyday Italian,Giada De Laurentiis,2005,30.0
1,children,Harry Potter,J K. Rowling,2005,29.99
2,web,Learning XML,Erik T. Ray,2003,39.95


# <span style="color:Orange">PyArrow Functionality</span>
### Data Structure Integration

In [673]:
ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")

ser

0    -1.5
1     0.2
2    <NA>
dtype: float[pyarrow]

In [674]:
idx = pd.Index([True, None], dtype="bool[pyarrow]")

idx

Index([True, <NA>], dtype='bool[pyarrow]')

In [675]:
df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")

df

Unnamed: 0,0,1
0,1,2
1,3,4


In [676]:
import pyarrow as pa

data = list("abc")

ser_sd = pd.Series(data, dtype="string[pyarrow]")

ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))

ser_ad.dtype == ser_sd.dtype

False

In [677]:
ser_sd.str.contains("a")

0     True
1    False
2    False
dtype: boolean

In [678]:
ser_ad.str.contains("a")

0     True
1    False
2    False
dtype: bool[pyarrow]

In [679]:
import pyarrow as pa

list_str_type = pa.list_(pa.string())

ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))

ser

0    ['hello']
1    ['there']
dtype: list<item: string>[pyarrow]

In [680]:
from datetime import time

idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))

idx

Index([12:30:00, <NA>], dtype='time64[us][pyarrow]')

In [681]:
from decimal import Decimal

decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))

data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]

df = pd.DataFrame(data, dtype=decimal_type)

df

Unnamed: 0,0,1
0,3.19,
1,,-1.23


In [682]:
pa_array = pa.array(
    [{"1": "2"}, {"10": "20"}, None],
    type=pa.map_(pa.string(), pa.string()),
)


ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))

ser

0      [('1', '2')]
1    [('10', '20')]
2              <NA>
dtype: map<string, string>[pyarrow]

In [683]:
ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")

pa.array(ser)

<pyarrow.lib.UInt8Array object at 0x000001F0B5534D00>
[
  1,
  2,
  null
]

In [684]:
idx = pd.Index(ser)

pa.array(idx)

<pyarrow.lib.UInt8Array object at 0x000001F0BAE14100>
[
  1,
  2,
  null
]

In [685]:
table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])

df = table.to_pandas(types_mapper=pd.ArrowDtype)

df

Unnamed: 0,a
0,1
1,2
2,3


### I/O Reading

In [686]:
data = io.StringIO("""a,b,c
   1,2.5,True
   3,4.5,False
""")


df = pd.read_csv(data, engine="pyarrow")

df

Unnamed: 0,a,b,c
0,1,2.5,True
1,3,4.5,False


In [687]:
import io

data = io.StringIO("""a,b,c,d,e,f,g,h,i
    1,2.5,True,a,,,,,
    3,4.5,False,b,6,7.5,True,a,
""")


df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")

df_pyarrow.dtypes
 

a     int64[pyarrow]
b    double[pyarrow]
c      bool[pyarrow]
d    string[pyarrow]
e     int64[pyarrow]
f    double[pyarrow]
g      bool[pyarrow]
h    string[pyarrow]
i      null[pyarrow]
dtype: object

# <span style="color:Orange">Indexing and selecting data</span>


In [688]:
dates = pd.date_range('1/1/2000', periods=8)

df = pd.DataFrame(np.random.randn(8, 4),
                  index=dates, columns=['A', 'B', 'C', 'D'])


df

Unnamed: 0,A,B,C,D
2000-01-01,0.130472,-0.505058,-0.045706,0.787586
2000-01-02,-0.623551,0.318681,-1.45823,1.452601
2000-01-03,0.342868,-1.287808,0.70074,-1.063234
2000-01-04,-0.417758,-0.465326,0.864418,-0.98811
2000-01-05,-0.734779,0.350336,-0.21052,-0.50074
2000-01-06,0.220927,-0.283938,0.308001,-0.092234
2000-01-07,0.86117,-0.627728,-1.218366,0.140514
2000-01-08,0.618305,0.700115,-1.134377,-0.522459


In [689]:
s = df['A']

s[dates[5]]

0.22092689646980593

In [690]:
df

Unnamed: 0,A,B,C,D
2000-01-01,0.130472,-0.505058,-0.045706,0.787586
2000-01-02,-0.623551,0.318681,-1.45823,1.452601
2000-01-03,0.342868,-1.287808,0.70074,-1.063234
2000-01-04,-0.417758,-0.465326,0.864418,-0.98811
2000-01-05,-0.734779,0.350336,-0.21052,-0.50074
2000-01-06,0.220927,-0.283938,0.308001,-0.092234
2000-01-07,0.86117,-0.627728,-1.218366,0.140514
2000-01-08,0.618305,0.700115,-1.134377,-0.522459


In [691]:
df[['B', 'A']] = df[['A', 'B']]

In [692]:
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.505058,0.130472,-0.045706,0.787586
2000-01-02,0.318681,-0.623551,-1.45823,1.452601
2000-01-03,-1.287808,0.342868,0.70074,-1.063234
2000-01-04,-0.465326,-0.417758,0.864418,-0.98811
2000-01-05,0.350336,-0.734779,-0.21052,-0.50074
2000-01-06,-0.283938,0.220927,0.308001,-0.092234
2000-01-07,-0.627728,0.86117,-1.218366,0.140514
2000-01-08,0.700115,0.618305,-1.134377,-0.522459


In [693]:
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.505058,0.130472
2000-01-02,0.318681,-0.623551
2000-01-03,-1.287808,0.342868
2000-01-04,-0.465326,-0.417758
2000-01-05,0.350336,-0.734779
2000-01-06,-0.283938,0.220927
2000-01-07,-0.627728,0.86117
2000-01-08,0.700115,0.618305


In [694]:
df.loc[:, ['B', 'A']] = df[['A', 'B']]

df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.505058,0.130472
2000-01-02,0.318681,-0.623551
2000-01-03,-1.287808,0.342868
2000-01-04,-0.465326,-0.417758
2000-01-05,0.350336,-0.734779
2000-01-06,-0.283938,0.220927
2000-01-07,-0.627728,0.86117
2000-01-08,0.700115,0.618305


In [695]:
df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()

df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.130472,-0.505058
2000-01-02,-0.623551,0.318681
2000-01-03,0.342868,-1.287808
2000-01-04,-0.417758,-0.465326
2000-01-05,-0.734779,0.350336
2000-01-06,0.220927,-0.283938
2000-01-07,0.86117,-0.627728
2000-01-08,0.618305,0.700115


In [696]:
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.130472,-0.505058
2000-01-02,-0.623551,0.318681
2000-01-03,0.342868,-1.287808
2000-01-04,-0.417758,-0.465326
2000-01-05,-0.734779,0.350336
2000-01-06,0.220927,-0.283938
2000-01-07,0.86117,-0.627728
2000-01-08,0.618305,0.700115


In [697]:
df.iloc[:, [1, 0]] = df[['A', 'B']]

df[['A','B']]

Unnamed: 0,A,B
2000-01-01,-0.505058,0.130472
2000-01-02,0.318681,-0.623551
2000-01-03,-1.287808,0.342868
2000-01-04,-0.465326,-0.417758
2000-01-05,0.350336,-0.734779
2000-01-06,-0.283938,0.220927
2000-01-07,-0.627728,0.86117
2000-01-08,0.700115,0.618305


### Attribute access

In [698]:
sa = pd.Series([1, 2, 3], index=list('abc'))

dfa = df.copy()

In [699]:
sa.b


dfa.A

2000-01-01   -0.505058
2000-01-02    0.318681
2000-01-03   -1.287808
2000-01-04   -0.465326
2000-01-05    0.350336
2000-01-06   -0.283938
2000-01-07   -0.627728
2000-01-08    0.700115
Freq: D, Name: A, dtype: float64

In [700]:
sa.a = 5

sa

a    5
b    2
c    3
dtype: int64

In [701]:
dfa.A = list(range(len(dfa.index)))  # ok if A already exists

dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,0.130472,-0.045706,0.787586
2000-01-02,1,-0.623551,-1.45823,1.452601
2000-01-03,2,0.342868,0.70074,-1.063234
2000-01-04,3,-0.417758,0.864418,-0.98811
2000-01-05,4,-0.734779,-0.21052,-0.50074
2000-01-06,5,0.220927,0.308001,-0.092234
2000-01-07,6,0.86117,-1.218366,0.140514
2000-01-08,7,0.618305,-1.134377,-0.522459


In [702]:
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column

dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,0.130472,-0.045706,0.787586
2000-01-02,1,-0.623551,-1.45823,1.452601
2000-01-03,2,0.342868,0.70074,-1.063234
2000-01-04,3,-0.417758,0.864418,-0.98811
2000-01-05,4,-0.734779,-0.21052,-0.50074
2000-01-06,5,0.220927,0.308001,-0.092234
2000-01-07,6,0.86117,-1.218366,0.140514
2000-01-08,7,0.618305,-1.134377,-0.522459


In [703]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

x.iloc[1] = {'x': 9, 'y': 99}

x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


### Slicing ranges

In [704]:
s[:5]

2000-01-01    0.130472
2000-01-02   -0.623551
2000-01-03    0.342868
2000-01-04   -0.417758
2000-01-05   -0.734779
Freq: D, Name: A, dtype: float64

In [705]:
s[::2]

2000-01-01    0.130472
2000-01-03    0.342868
2000-01-05   -0.734779
2000-01-07    0.861170
Freq: 2D, Name: A, dtype: float64

In [706]:
s[::-1]

2000-01-08    0.618305
2000-01-07    0.861170
2000-01-06    0.220927
2000-01-05   -0.734779
2000-01-04   -0.417758
2000-01-03    0.342868
2000-01-02   -0.623551
2000-01-01    0.130472
Freq: -1D, Name: A, dtype: float64

In [707]:
s2 = s.copy()

s2[:5] = 0

s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    0.220927
2000-01-07    0.861170
2000-01-08    0.618305
Freq: D, Name: A, dtype: float64

In [708]:
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,-0.505058,0.130472,-0.045706,0.787586
2000-01-02,0.318681,-0.623551,-1.45823,1.452601
2000-01-03,-1.287808,0.342868,0.70074,-1.063234


### Selection by label

In [709]:
dfl = pd.DataFrame(np.random.randn(5, 4),
                   columns=list('ABCD'),
                   index=pd.date_range('20130101', periods=5))


dfl

Unnamed: 0,A,B,C,D
2013-01-01,1.17948,-0.615456,0.346524,-0.087651
2013-01-02,1.1713,-0.04705,0.655626,-1.494851
2013-01-03,-0.763844,1.308917,0.472653,-0.469593
2013-01-04,-1.615178,0.80294,-0.818542,-0.16804
2013-01-05,-0.074706,-0.604216,-0.312875,-0.664358


In [710]:
dfl.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.1713,-0.04705,0.655626,-1.494851
2013-01-03,-0.763844,1.308917,0.472653,-0.469593
2013-01-04,-1.615178,0.80294,-0.818542,-0.16804


In [711]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))

s1

a    1.087033
b   -0.825849
c    1.589464
d    1.812534
e   -0.694051
f    0.582492
dtype: float64

In [712]:
s1.loc['c':]

c    1.589464
d    1.812534
e   -0.694051
f    0.582492
dtype: float64

### Selection by callable

In [713]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))


df1

Unnamed: 0,A,B,C,D
a,-0.679233,-0.382162,1.898171,-0.071243
b,0.492773,0.774853,1.152391,-0.993689
c,0.063972,0.302976,0.586764,-2.13249
d,1.180339,-1.805908,1.673401,-0.892826
e,-0.89677,0.376439,1.237525,2.491218
f,0.147878,0.404577,0.806047,-0.092976


In [714]:
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
b,0.492773,0.774853,1.152391,-0.993689
c,0.063972,0.302976,0.586764,-2.13249
d,1.180339,-1.805908,1.673401,-0.892826
f,0.147878,0.404577,0.806047,-0.092976


In [715]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.679233,-0.382162
b,0.492773,0.774853
c,0.063972,0.302976
d,1.180339,-1.805908
e,-0.89677,0.376439
f,0.147878,0.404577


In [716]:
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,-0.679233,-0.382162
b,0.492773,0.774853
c,0.063972,0.302976
d,1.180339,-1.805908
e,-0.89677,0.376439
f,0.147878,0.404577


In [717]:
df1[lambda df: df.columns[0]]

a   -0.679233
b    0.492773
c    0.063972
d    1.180339
e   -0.896770
f    0.147878
Name: A, dtype: float64

In [718]:
df1['A'].loc[lambda s: s > 0]

b    0.492773
c    0.063972
d    1.180339
f    0.147878
Name: A, dtype: float64

### Combining positional and label-based indexing

In [719]:
dfd = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6]},
                   index=list('abc'))


dfd

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [720]:
dfd.loc[dfd.index[[0, 2]], 'A']

a    1
c    3
Name: A, dtype: int64

In [721]:
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]

a    1
c    3
Name: A, dtype: int64

In [722]:
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]

Unnamed: 0,A,B
a,1,4
c,3,6


### Reindexing

In [723]:
s = pd.Series([1, 2, 3])

s.reindex([1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [724]:
labels = [1, 2, 3]

s.loc[s.index.intersection(labels)]

1    2
2    3
dtype: int64

In [725]:
s.loc[s.index.intersection(labels)].reindex(labels)

1    2.0
2    3.0
3    NaN
dtype: float64

### Selecting random samples

In [726]:
s = pd.Series([0, 1, 2, 3, 4, 5])

# When no arguments are passed, returns 1 row.
s.sample()

5    5
dtype: int64

In [727]:
# One may specify either a number of rows:
s.sample(n=3)

0    0
5    5
2    2
dtype: int64

In [728]:
# Or a fraction of the rows:
s.sample(frac=0.5)

5    5
4    4
2    2
dtype: int64

In [729]:
s = pd.Series([0, 1, 2, 3, 4, 5])

# Without replacement (default):
s.sample(n=6, replace=False)

5    5
4    4
2    2
0    0
3    3
1    1
dtype: int64

In [730]:
s = pd.Series([0, 1, 2, 3, 4, 5])

example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]

s.sample(n=3, weights=example_weights)

5    5
3    3
2    2
dtype: int64

In [731]:
# Weights will be re-normalized automatically
example_weights2 = [0.5, 0, 0, 0, 0, 0]
s.sample(n=1, weights=example_weights2)

0    0
dtype: int64

In [732]:
df2 = pd.DataFrame({'col1': [9, 8, 7, 6],
                    'weight_column': [0.5, 0.4, 0.1, 0]})


df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
0,9,0.5
1,8,0.4
2,7,0.1


In [733]:
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})

df3.sample(n=1, axis=1)

Unnamed: 0,col1
0,1
1,2
2,3


In [734]:
df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})

# With a given seed, the sample will always draw the same rows.
df4.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [735]:
df4.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


# <span style="color:Orange">Supervison Tasks</span>


### Create a Pandas Series from a Python list, numpy array, and a dictionary.


In [736]:
import pandas as pd
import numpy as np

series_from_list, series_from_array, series_from_dict = pd.Series([10, 20, 30, 40]), pd.Series(np.array([50, 60, 70, 80])), pd.Series({'a': 90, 'b': 100, 'c': 110, 'd': 120})

print("Series from list:\n", series_from_list, "\n\nSeries from numpy array:\n", series_from_array, "\n\nSeries from dictionary:\n", series_from_dict)


Series from list:
 0    10
1    20
2    30
3    40
dtype: int64 

Series from numpy array:
 0    50
1    60
2    70
3    80
dtype: int32 

Series from dictionary:
 a     90
b    100
c    110
d    120
dtype: int64


### Assign a custom index to the Series

In [737]:
series_with_custom_index = pd.Series([10, 20, 30, 40], index=['w', 'x', 'y', 'z'])
print("\nSeries with custom index:\n", series_with_custom_index)


Series with custom index:
 w    10
x    20
y    30
z    40
dtype: int64


###  Perform basic arithmetic operations on Series

In [738]:
sum_series = series_from_list + series_from_array
mult_series = series_from_list * 2
print("\nSum of two Series:\n", sum_series)
print("\nMultiplication of a Series by 2:\n", mult_series)


Sum of two Series:
 0     60
1     80
2    100
3    120
dtype: int64

Multiplication of a Series by 2:
 0    20
1    40
2    60
3    80
dtype: int64


###  Access elements using index labels and positions

In [739]:
series = pd.Series([10, 20, 30, 40], index=['w', 'x', 'y', 'z'])

label_element = series['x']
position_element = series.iloc[2]

print(label_element)
print(position_element)


20
30


### Filter the Series to include only values greater than a specific threshold.

In [740]:
series = pd.Series([10, 20, 30, 40, 50])
threshold = 30
filtered_series = series[series > threshold]
print(filtered_series)

3    40
4    50
dtype: int64


###  Create a DataFrame from a dictionary of lists.


In [741]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']}

df = pd.DataFrame(data)

print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston


###  Create a DataFrame from a numpy array, specifying column and index names.


In [742]:
array_data = np.array([[10, 20, 30], [40, 50, 60], [70, 80, 90]])
columns = ['A', 'B', 'C']
index = ['X', 'Y', 'Z']

df = pd.DataFrame(array_data, columns=columns, index=index)

print(df)


    A   B   C
X  10  20  30
Y  40  50  60
Z  70  80  90


### Load a DataFrame from a CSV file.


In [743]:

file_path = r'C:\Users\Ubaid-khan\Desktop\bank-full.csv'
df = pd.read_csv(file_path, delimiter=';')

df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [744]:
print(df.head())
print("......................")
print(df.tail())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
......................
       age           job   marital  education default  balance housing loan  \
45206   5

### Get a summary of the DataFrame including the mean, median, and standard deviation of numeric columns.


In [745]:
summary = df.describe()
summary

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


### Extract a specific column as a Series.

In [746]:
specific_column = df['balance']  # Replace 'column_name' with the actual column name you want to extract

print(specific_column)

0        2143
1          29
2           2
3        1506
4           1
         ... 
45206     825
45207    1729
45208    5715
45209     668
45210    2971
Name: balance, Length: 45211, dtype: int64


### Filter rows based on column values.


In [747]:
filtered_df = df[df['age'] > 30]
print(filtered_df)

       age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

         contact  day month  duration  campaign  pd

### Select rows based on multiple conditions.


In [748]:
filtered_df = df[(df['age'] > 30) & (df['balance'] > 5000)]
print(filtered_df)

       age          job  marital  education default  balance housing loan  \
34      51   management  married   tertiary      no    10635     yes   no   
65      51   management  married   tertiary      no     6530     yes   no   
69      35  blue-collar   single  secondary      no    12223     yes  yes   
70      57  blue-collar  married  secondary      no     5935     yes  yes   
205     50     services  married  secondary      no     5699     yes   no   
...    ...          ...      ...        ...     ...      ...     ...  ...   
45118   78      retired  married    primary      no    14204      no   no   
45127   67  blue-collar  married  secondary      no    16353      no   no   
45130   33   technician  married  secondary      no     5083      no   no   
45181   46  blue-collar  married  secondary      no     6879      no   no   
45208   72      retired  married  secondary      no     5715      no   no   

        contact  day month  duration  campaign  pdays  previous poutcome   

### Add a new column to the DataFrame.



In [749]:
df['new_column'] = 'default_value'
print(df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  \
0  unknown    5   may       261         1     -1         0  unknown  no   
1  unknown    5   may       151         1     -1         0  unknown  no   
2  unknown    5   may        76         1     -1         0  unknown  no   
3  unknown    5   may        92         1     -1         0  unknown  no   
4  unknown    5   may       198         1     -1         0  unknown  no   

      new_column  
0  default_value  
1  default_value  
2  default_value  
3  default_value  
4  defaul

### Delete a column from the DataFrame.


In [750]:
df = df.drop(columns=['new_column'])
print(df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


### Rename columns in the DataFrame.

In [751]:
df = df.rename(columns={'job': 'JOB'})
print(df.head())


   age           JOB  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
